Skip to content

Commit 9a5b7b1

Browse files
committed
Fixes coreference results
- Updated the CR_PATTERN regex to correctly extract the coreference set notation.
- Cleaned up the code in the STATE_COREFERENCE if/else branch in parse_parser_results to be clearer and to actually add the coreference results to the result dict.
1 parent 73e6b1d commit 9a5b7b1

File tree

1 file changed

+11
-10
lines changed

1 file changed

+11
-10
lines changed

corenlp/corenlp.py

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -36,9 +36,9 @@
3636
VERBOSE = False
3737
STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
3838
WORD_PATTERN = re.compile('\[([^\]]+)\]')
39-
CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"")
39+
CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)[\]\)]\) -> \((\d*),(\d)*,\[(\d*),(\d*)[\]\)]\), that is: \"(.*)\" -> \"(.*)\"")
4040

41-
DIRECTORY = "stanford-corenlp-full-2013-06-20"
41+
DIRECTORY = "stanford-corenlp-full-2014-01-04"
4242

4343

4444
class bc:
@@ -84,8 +84,8 @@ def init_corenlp_command(corenlp_path, memory, properties):
8484
"""
8585

8686
# TODO: Can edit jar constants
87-
jars = ["stanford-corenlp-3.2.0.jar",
88-
"stanford-corenlp-3.2.0-models.jar",
87+
jars = ["stanford-corenlp-3.3.1.jar",
88+
"stanford-corenlp-3.3.1-models.jar",
8989
"xom.jar",
9090
"joda-time.jar",
9191
"jollyday.jar"]
@@ -152,7 +152,8 @@ def parse_parser_results(text):
152152
"""
153153
results = {"sentences": []}
154154
state = STATE_START
155-
for line in text.split("\n"):
155+
lines = text.split("\n")
156+
for index, line in enumerate(lines):
156157
line = line.strip()
157158

158159
if line.startswith("Sentence #"):
@@ -188,16 +189,16 @@ def parse_parser_results(text):
188189
sentence['dependencies'].append(tuple([rel, left, right]))
189190

190191
elif state == STATE_COREFERENCE:
191-
if "Coreference set" in line:
192+
if "Coreference set" in lines[index-1]:
193+
coref_set = []
192194
if 'coref' not in results:
193195
results['coref'] = []
194-
coref_set = []
195-
results['coref'].append(coref_set)
196-
else:
196+
197197
for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(line):
198198
src_i, src_pos, src_l, src_r = int(src_i) - 1, int(src_pos) - 1, int(src_l) - 1, int(src_r) - 1
199199
sink_i, sink_pos, sink_l, sink_r = int(sink_i) - 1, int(sink_pos) - 1, int(sink_l) - 1, int(sink_r) - 1
200-
coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
200+
coref_set.extend(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
201+
results['coref'].extend(coref_set)
201202

202203
return results
203204

0 commit comments

Comments
 (0)