Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 59 additions & 39 deletions stanza/utils/datasets/ner/convert_he_iahlt.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,73 +6,93 @@
import stanza.utils.default_paths as default_paths
from stanza.utils.datasets.ner.utils import write_dataset

# Splits an entity annotation on parentheses.  The capturing group makes
# re.split keep each "(" and ")" as its own token between the label texts.
_RE_ENTITY_SPLIT = re.compile(r"([()])")


def output_entities(sentence):
    """Print the raw ``Entity=`` annotation of each word in ``sentence``.

    Each word's ``misc`` string is split on ``|``; for every piece that
    starts with ``Entity=``, the text after the first ``=`` is collected.
    All collected values are written with a single ``print`` call, one per
    line, each prefixed with a space.  Nothing is printed when no word
    carries an entity annotation.
    """
    entities = []
    for word in sentence.words:
        misc = word.misc
        if not misc:
            continue
        for piece in misc.split("|"):
            if piece.startswith("Entity="):
                # partition splits on the first "=" only, so any later "="
                # stays part of the entity text
                entities.append(" " + piece.partition("=")[2])
    if entities:
        print("\n".join(entities))


def extract_single_sentence(sentence):
    """Convert the entity brackets of one sentence into BIO tags.

    For each word, the ``Entity=`` pieces of ``word.misc`` are tokenized on
    parentheses.  A ``(`` followed by a label opens that label; a label
    followed by ``)`` closes it.  Open labels are kept on a stack, so
    entities may nest, but the emitted tag always uses the outermost
    (first opened) label.

    Returns:
        A list of ``(text, tag)`` tuples, one per word.  ``tag`` is ``"O"``
        when no entity is open, ``"B-<label>"`` on the word where an entity
        was opened while none was open, and ``"I-<label>"`` otherwise.

    Raises:
        AssertionError: if the brackets are malformed -- an opening ``(``
            with no label after it, a ``)`` with nothing before it, more
            closes than open entities, or a close whose label is not the
            most recently opened one.  The error is raised explicitly
            rather than with ``assert`` so the checks are not stripped
            under ``python -O``; callers that catch ``AssertionError``
            are unaffected.
    """
    open_entities = []
    words = []

    for word in sentence.words:
        misc = word.misc
        pieces = misc.split("|") if misc else []

        closes = []
        first_entity = False

        for piece in pieces:
            if not piece.startswith("Entity="):
                continue
            entity = piece.partition("=")[2]
            # re.split leaves empty strings around adjacent delimiters
            tokens = [tok for tok in _RE_ENTITY_SPLIT.split(entity) if tok]
            num_tokens = len(tokens)
            idx = 0
            while idx < num_tokens:
                token = tokens[idx]
                if token == "(":
                    if idx + 1 >= num_tokens:
                        raise AssertionError("Opening an unspecified entity")
                    if not open_entities:
                        first_entity = True
                    open_entities.append(tokens[idx + 1])
                    # skip the label which was just consumed
                    idx += 2
                elif token == ")":
                    if idx == 0:
                        raise AssertionError("Closing an unspecified entity")
                    closes.append(tokens[idx - 1])
                    idx += 1
                else:
                    # labels are only added or removed via the parens
                    idx += 1

        # The word is tagged before the closes are applied, so a word which
        # closes an entity is still counted as part of that entity.
        if not open_entities:
            tag = "O"
        elif first_entity:
            tag = "B-" + open_entities[0]
        else:
            tag = "I-" + open_entities[0]
        words.append((word.text, tag))

        if len(open_entities) < len(closes):
            raise AssertionError("Too many closes for the current open entities")
        for close_entity in closes:
            innermost = open_entities[-1]
            if close_entity != innermost:
                raise AssertionError("Closed the wrong entity: %s vs %s" % (close_entity, innermost))
            open_entities.pop()
    return words

def extract_sentences(doc):
sentences = []
append_sentence = sentences.append # Localize append for small speedup
for sentence in doc.sentences:
try:
words = extract_single_sentence(sentence)
sentences.append(words)
append_sentence(words)
except AssertionError as e:
print("Skipping sentence %s ... %s" % (sentence.sent_id, str(e)))
output_entities(sentence)
Expand Down