From 9a5b7b16f04d52adb0b04fb3490472305efcb7d5 Mon Sep 17 00:00:00 2001 From: Dan Calacci Date: Wed, 22 Jan 2014 12:52:03 -0500 Subject: [PATCH 1/2] Fixes coreference results - updated the CR_PATTERN regex to correctly extract the coreference set notation - cleaned up the code in the STATE_COREFERENCE if/else branch in parse_parser_results to be clearer and actually add the coreference results to the result dict. --- corenlp/corenlp.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index 765db6d..d11545f 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -36,9 +36,9 @@ VERBOSE = False STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5 WORD_PATTERN = re.compile('\[([^\]]+)\]') -CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"") +CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)[\]\)]\) -> \((\d*),(\d)*,\[(\d*),(\d*)[\]\)]\), that is: \"(.*)\" -> \"(.*)\"") -DIRECTORY = "stanford-corenlp-full-2013-06-20" +DIRECTORY = "stanford-corenlp-full-2014-01-04" class bc: @@ -84,8 +84,8 @@ def init_corenlp_command(corenlp_path, memory, properties): """ # TODO: Can edit jar constants - jars = ["stanford-corenlp-3.2.0.jar", - "stanford-corenlp-3.2.0-models.jar", + jars = ["stanford-corenlp-3.3.1.jar", + "stanford-corenlp-3.3.1-models.jar", "xom.jar", "joda-time.jar", "jollyday.jar"] @@ -152,7 +152,8 @@ def parse_parser_results(text): """ results = {"sentences": []} state = STATE_START - for line in text.split("\n"): + lines = text.split("\n") + for index, line in enumerate(lines): line = line.strip() if line.startswith("Sentence #"): @@ -188,16 +189,16 @@ def parse_parser_results(text): sentence['dependencies'].append(tuple([rel, left, right])) elif state == STATE_COREFERENCE: - if "Coreference set" in line: + if "Coreference set" in lines[index-1]: + coref_set = [] if 'coref' not in results: results['coref'] = [] - coref_set = [] - results['coref'].append(coref_set) - else: + for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(line): src_i, src_pos, src_l, src_r = int(src_i) - 1, int(src_pos) - 1, int(src_l) - 1, int(src_r) - 1 sink_i, sink_pos, sink_l, sink_r = int(sink_i) - 1, int(sink_pos) - 1, int(sink_l) - 1, int(sink_r) - 1 - coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) + coref_set.extend(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) + results['coref'].extend(coref_set) return results From 860481217dd83b08aef35748985c85c2f4cda834 Mon Sep 17 00:00:00 2001 From: Dan Calacci Date: Wed, 22 Jan 2014 16:28:32 -0500 Subject: [PATCH 2/2] fixes grouping of coreferences fixed the terrible bug I left in my previous commit --- corenlp/corenlp.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py index d11545f..a9c2bc2 100755 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -189,17 +189,16 @@ def parse_parser_results(text): sentence['dependencies'].append(tuple([rel, left, right])) elif state == STATE_COREFERENCE: - if "Coreference set" in lines[index-1]: - coref_set = [] + if 'Coreference set' in line: if 'coref' not in results: results['coref'] = [] - + coref_set = [] + results['coref'].append(coref_set) + else: for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(line): src_i, src_pos, src_l, src_r = int(src_i) - 1, int(src_pos) - 1, int(src_l) - 1, int(src_r) - 1 sink_i, sink_pos, sink_l, sink_r = int(sink_i) - 1, int(sink_pos) - 1, int(sink_l) - 1, int(sink_r) - 1 - coref_set.extend(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) - results['coref'].extend(coref_set) - + coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) return results