|
36 | 36 | VERBOSE = False |
37 | 37 | STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5 |
38 | 38 | WORD_PATTERN = re.compile('\[([^\]]+)\]') |
39 | | -CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"") |
| 39 | +CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)[\]\)]\) -> \((\d*),(\d)*,\[(\d*),(\d*)[\]\)]\), that is: \"(.*)\" -> \"(.*)\"") |
40 | 40 |
|
41 | | -DIRECTORY = "stanford-corenlp-full-2013-06-20" |
| 41 | +DIRECTORY = "stanford-corenlp-full-2014-01-04" |
42 | 42 |
|
43 | 43 |
|
44 | 44 | class bc: |
@@ -84,8 +84,8 @@ def init_corenlp_command(corenlp_path, memory, properties): |
84 | 84 | """ |
85 | 85 |
|
86 | 86 | # TODO: Can edit jar constants |
87 | | - jars = ["stanford-corenlp-3.2.0.jar", |
88 | | - "stanford-corenlp-3.2.0-models.jar", |
| 87 | + jars = ["stanford-corenlp-3.3.1.jar", |
| 88 | + "stanford-corenlp-3.3.1-models.jar", |
89 | 89 | "xom.jar", |
90 | 90 | "joda-time.jar", |
91 | 91 | "jollyday.jar"] |
@@ -152,7 +152,8 @@ def parse_parser_results(text): |
152 | 152 | """ |
153 | 153 | results = {"sentences": []} |
154 | 154 | state = STATE_START |
155 | | - for line in text.split("\n"): |
| 155 | + lines = text.split("\n") |
| 156 | + for index, line in enumerate(lines): |
156 | 157 | line = line.strip() |
157 | 158 |
|
158 | 159 | if line.startswith("Sentence #"): |
@@ -188,16 +189,16 @@ def parse_parser_results(text): |
188 | 189 | sentence['dependencies'].append(tuple([rel, left, right])) |
189 | 190 |
|
190 | 191 | elif state == STATE_COREFERENCE: |
191 | | - if "Coreference set" in line: |
| 192 | + if "Coreference set" in lines[index-1]: |
| 193 | + coref_set = [] |
192 | 194 | if 'coref' not in results: |
193 | 195 | results['coref'] = [] |
194 | | - coref_set = [] |
195 | | - results['coref'].append(coref_set) |
196 | | - else: |
| 196 | + |
197 | 197 | for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(line): |
198 | 198 | src_i, src_pos, src_l, src_r = int(src_i) - 1, int(src_pos) - 1, int(src_l) - 1, int(src_r) - 1 |
199 | 199 | sink_i, sink_pos, sink_l, sink_r = int(sink_i) - 1, int(sink_pos) - 1, int(sink_l) - 1, int(sink_r) - 1 |
200 | | - coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) |
| 200 | + coref_set.extend(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) |
| 201 | + results['coref'].extend(coref_set) |
201 | 202 |
|
202 | 203 | return results |
203 | 204 |
|
|
0 commit comments