Fixes coreference results

dcalacci · dcalacci · commit 9a5b7b16f04d · 2014-01-22T12:52:03.000-05:00
- Updated the CR_PATTERN regex so that it correctly matches the coreference
  set notation (a mention span may now be closed by either `]` or `)`).

- Cleaned up the STATE_COREFERENCE branch of parse_parser_results so that
  it is clearer and actually adds the coreference results to the result
  dict.
diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py
@@ -36,9 +36,9 @@
 VERBOSE = False
 STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
 WORD_PATTERN = re.compile('\[([^\]]+)\]')
-CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"")
+CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)[\]\)]\) -> \((\d*),(\d)*,\[(\d*),(\d*)[\]\)]\), that is: \"(.*)\" -> \"(.*)\"")
 
-DIRECTORY = "stanford-corenlp-full-2013-06-20"
+DIRECTORY = "stanford-corenlp-full-2014-01-04"
 
 
 class bc:
@@ -84,8 +84,8 @@ def init_corenlp_command(corenlp_path, memory, properties):
     """
 
     # TODO: Can edit jar constants
-    jars = ["stanford-corenlp-3.2.0.jar",
-            "stanford-corenlp-3.2.0-models.jar",
+    jars = ["stanford-corenlp-3.3.1.jar",
+            "stanford-corenlp-3.3.1-models.jar",
             "xom.jar",
             "joda-time.jar",
             "jollyday.jar"]
@@ -152,7 +152,8 @@ def parse_parser_results(text):
     """
     results = {"sentences": []}
     state = STATE_START
-    for line in text.split("\n"):
+    lines = text.split("\n")
+    for index, line in enumerate(lines):
         line = line.strip()
 
         if line.startswith("Sentence #"):
@@ -188,16 +189,16 @@ def parse_parser_results(text):
                     sentence['dependencies'].append(tuple([rel, left, right]))
 
         elif state == STATE_COREFERENCE:
-            if "Coreference set" in line:
+            if "Coreference set" in lines[index-1]:
+                coref_set = []
                 if 'coref' not in results:
                     results['coref'] = []
-                coref_set = []
-                results['coref'].append(coref_set)
-            else:
+
                 for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(line):
                     src_i, src_pos, src_l, src_r = int(src_i) - 1, int(src_pos) - 1, int(src_l) - 1, int(src_r) - 1
                     sink_i, sink_pos, sink_l, sink_r = int(sink_i) - 1, int(sink_pos) - 1, int(sink_l) - 1, int(sink_r) - 1
-                    coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
+                    coref_set.extend(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
+                results['coref'].extend(coref_set)
 
     return results