2121
2222import json , optparse , os , re , sys , time , traceback
2323import pexpect
24+ import tempfile
25+ import shutil
2426from progressbar import ProgressBar , Fraction
2527from unidecode import unidecode
2628from jsonrpclib .SimpleJSONRPCServer import SimpleJSONRPCServer
# Matches one bracketed token, e.g. "[word]", in the interactive parser output.
WORD_PATTERN = re.compile('\[([^\]]+)\]')
# Matches one coreference line of the plain-text CoreNLP output, capturing the
# two mention positions (sentence, head, start, end) and both surface strings.
CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"")
3234
class bc:
    """ANSI terminal escape codes for colored console output.

    The scraped copy had a stray space inside each literal ('\\033 [95m'),
    which breaks the escape sequence; the canonical codes are restored here.
    """
    HEADER = '\033[95m'   # magenta
    OKBLUE = '\033[94m'   # blue
    OKGREEN = '\033[92m'  # green
    WARNING = '\033[93m'  # yellow
    FAIL = '\033[91m'     # red
    ENDC = '\033[0m'      # reset all attributes
42+
43+
3344class ProcessError (Exception ):
3445 def __init__ (self , value ):
3546 self .value = value
@@ -49,6 +60,48 @@ def __str__(self):
4960 return repr (self .value )
5061
5162
63+ def init_corenlp_command (corenlp_path , memory ):
64+ """
65+ Checks the location of the jar files.
66+ Spawns the server as a process.
67+ """
68+
69+
70+ # TODO: Can edit jar constants
71+ jars = ["stanford-corenlp-1.3.5.jar" ,
72+ "stanford-corenlp-1.3.5-models.jar" ,
73+ "xom.jar" ,
74+ "joda-time.jar" ,
75+ "jollyday.jar" ]
76+
77+ java_path = "java"
78+ classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
79+ # include the properties file, so you can change defaults
80+ # but any changes in output format will break parse_parser_results()
81+ property_name = "default.properties"
82+ current_dir_pr = os .path .dirname (os .path .abspath ( __file__ )) + "/" + property_name
83+ if os .path .exists (property_name ):
84+ props = "-props %s" % (property_name )
85+ elif os .path .exists (current_dir_pr ):
86+ props = "-props %s" % (current_dir_pr )
87+ else :
88+ raise Exception ("Error! Cannot locate: default.properties" )
89+
90+ # add and check classpaths
91+ jars = [corenlp_path + "/" + jar for jar in jars ]
92+ for jar in jars :
93+ if not os .path .exists (jar ):
94+ raise Exception ("Error! Cannot locate: %s" % jar )
95+
96+ # add memory limit on JVM
97+ if memory :
98+ limit = "-Xmx%s" % memory
99+ else :
100+ limit = ""
101+
102+ return "%s %s -cp %s %s %s" % (java_path , limit , ':' .join (jars ), classname , props )
103+
104+
def remove_id(word):
    """Removes the numeric suffix from the parsed recognized words: e.g. 'word-2' > 'word'

    Replaces the fragile ``cond and a or b`` idiom: the old form raised
    ValueError on an empty string (falsy ``word`` fell through to
    ``rindex``); now '' is simply returned unchanged.
    """
    return word[:word.rindex("-")] if "-" in word else word
@@ -133,8 +186,105 @@ def parse_parser_results(text):
133186
134187 return results
135188
def parse_parser_xml_results(xml):
    """Parse CoreNLP's XML output into the same dict structure that
    parse_parser_results() builds from the interactive-mode text output:
    {'coref': coref_list, 'sentences': sentence_dicts}.

    Requires the third-party `xmltodict` package (imported lazily so the
    rest of the module works without it).
    """
    import xmltodict
    from collections import OrderedDict

    def extract_words_from_xml(sent_node):
        # raw word strings of one <sentence> node, in token order
        exted = map(lambda x: x['word'], sent_node['tokens']['token'])
        return exted

    #turning the raw xml into a raw python dictionary:
    raw_dict = xmltodict.parse(xml)

    #making a raw sentence list of dictionaries:
    raw_sent_list = raw_dict[u'root'][u'document'][u'sentences'][u'sentence']
    #making a raw coref dictionary:
    raw_coref_list = raw_dict[u'root'][u'document'][u'coreference'][u'coreference']

    #cleaning up the list ...the problem is that this doesn't come in pairs, as the command line version:

    # CoreNLP emits 1-based indices; subtract 1 so they index python lists.
    # int() replaces the original eval(): these fields are plain digit
    # strings, and eval() would execute arbitrary content from the XML.
    coref_index = [[[int(raw_coref_list[j][u'mention'][i]['sentence']) - 1,
                     int(raw_coref_list[j][u'mention'][i]['head']) - 1,
                     int(raw_coref_list[j][u'mention'][i]['start']) - 1,
                     int(raw_coref_list[j][u'mention'][i]['end']) - 1]
                    for i in xrange(len(raw_coref_list[j][u'mention']))]
                   for j in xrange(len(raw_coref_list))]

    coref_list = []
    for j in xrange(len(coref_index)):
        coref_list.append(coref_index[j])
        for k, coref in enumerate(coref_index[j]):
            # prepend the mention's surface text to each [sent, head, start, end]
            exted = raw_sent_list[coref[0]]['tokens']['token'][coref[2]:coref[3]]
            exted_words = map(lambda x: x['word'], exted)
            coref_list[j][k].insert(0, ' '.join(exted_words))

    # pair every non-representative mention with the representative (first) one
    coref_list = [[[coref_list[j][i], coref_list[j][0]]
                   for i in xrange(len(coref_list[j])) if i != 0]
                  for j in xrange(len(coref_list))]

    sentences = [{'dependencies': [[dep['dep'][i]['@type'],
                                    dep['dep'][i]['governor']['#text'],
                                    dep['dep'][i]['dependent']['#text']]
                                   for dep in raw_sent_list[j][u'dependencies']
                                   for i in xrange(len(dep['dep']))
                                   if dep['@type'] == 'basic-dependencies'],
                  'text': extract_words_from_xml(raw_sent_list[j]),
                  'parsetree': str(raw_sent_list[j]['parse']),
                  'words': [[str(token['word']), OrderedDict([
                      ('NamedEntityTag', str(token['NER'])),
                      ('CharacterOffsetEnd', str(token['CharacterOffsetEnd'])),
                      ('CharacterOffsetBegin', str(token['CharacterOffsetBegin'])),
                      ('PartOfSpeech', str(token['POS'])),
                      ('Lemma', str(token['lemma']))])]
                            for token in raw_sent_list[j]['tokens'][u'token']]}

                 for j in xrange(len(raw_sent_list))]

    results = {'coref': coref_list, 'sentences': sentences}

    return results
248+
def parse_xml_output(input_dir, corenlp_path="stanford-corenlp-full-2013-04-04/", memory="3g"):
    """Because interaction with the command-line interface of the CoreNLP
    tools is limited to very short text bits, it is necessary to parse xml
    output.

    Runs CoreNLP once over every file in `input_dir` (via -filelist),
    collecting XML into a temporary directory, then yields one parsed
    result dict per output file.  The temporary directory is removed when
    the generator is exhausted or closed.
    """
    # temporary working space: a dir for the XML results plus a file
    # holding the list of inputs for CoreNLP's -filelist option
    xml_dir = tempfile.mkdtemp()
    file_list = tempfile.NamedTemporaryFile()

    #we get a list of the cleaned files that we want to parse:
    files = [os.path.join(input_dir, f) for f in os.listdir(input_dir)]

    #creating the file list of files to parse
    # note: plain newline separator -- a trailing space per line would
    # corrupt the filenames CoreNLP reads from the -filelist file
    file_list.write('\n'.join(files))
    file_list.seek(0)

    command = init_corenlp_command(corenlp_path, memory)\
        + ' -filelist %s -outputDirectory %s' % (file_list.name, xml_dir)

    #creates the xml file of parser output:
    # NOTE(review): os.system with an interpolated string is shell-injection
    # prone if input paths are untrusted; subprocess with an argv list would
    # be safer -- kept as-is to avoid changing runtime behavior.
    os.system(command)

    #reading in the raw xml file:
    try:
        for output_file in os.listdir(xml_dir):
            with open(os.path.join(xml_dir, output_file), 'r') as xml:
                parsed = xml.read()
            yield parse_parser_xml_results(parsed)
    finally:
        file_list.close()
        # best-effort cleanup (replaces the bare "except: pass")
        shutil.rmtree(xml_dir, ignore_errors=True)
286+
287+ class StanfordCoreNLP :
138288 """
139289 Command-line interaction with Stanford's CoreNLP java utilities.
140290 Can be run as a JSON-RPC server or imported as a module.
@@ -145,44 +295,8 @@ def __init__(self, corenlp_path="stanford-corenlp-full-2013-04-04/", memory="3g"
145295 Spawns the server as a process.
146296 """
147297
148- # TODO: Can edit jar constants
149- # jars = ["stanford-corenlp-1.3.5.jar",
150- # "stanford-corenlp-1.3.5-models.jar",
151- # "joda-time.jar",
152- # "xom.jar"]
153- jars = ["stanford-corenlp-1.3.5.jar" ,
154- "stanford-corenlp-1.3.5-models.jar" ,
155- "xom.jar" ,
156- "joda-time.jar" ,
157- "jollyday.jar" ]
158-
159- java_path = "java"
160- classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
161- # include the properties file, so you can change defaults
162- # but any changes in output format will break parse_parser_results()
163- property_name = "default.properties"
164- current_dir_pr = os .path .dirname (os .path .abspath ( __file__ )) + "/" + property_name
165- if os .path .exists (property_name ):
166- props = "-props %s" % (property_name )
167- elif os .path .exists (current_dir_pr ):
168- props = "-props %s" % (current_dir_pr )
169- else :
170- raise Exception ("Error! Cannot locate: default.properties" )
171-
172- # add and check classpaths
173- jars = [corenlp_path + "/" + jar for jar in jars ]
174- for jar in jars :
175- if not os .path .exists (jar ):
176- raise Exception ("Error! Cannot locate: %s" % jar )
177-
178- # add memory limit on JVM
179- if memory :
180- limit = "-Xmx%s" % memory
181- else :
182- limit = ""
183-
184298 # spawn the server
185- start_corenlp = "%s %s -cp %s %s %s" % ( java_path , limit , ':' . join ( jars ), classname , props )
299+ start_corenlp = init_corenlp_command ( corenlp_path , memory )
186300 if VERBOSE : print start_corenlp
187301 self .corenlp = pexpect .spawn (start_corenlp )
188302
@@ -290,7 +404,22 @@ def parse(self, text):
290404 reads in the result, parses the results and returns a list
291405 with one dictionary entry for each parsed sentence, in JSON format.
292406 """
293- return json .dumps (self ._parse (text ))
407+ return json .dumps (self .raw_parse (text ))
408+
409+
def batch_parse(input_folder, corenlp_path="stanford-corenlp-full-2013-04-04/", memory="3g"):
    """
    This function takes input files,
    sends list of input files to the Stanford parser,
    reads in the results from temporary folder in your OS and
    returns a generator object of list that consist of dictionary entry.
    (The function needs xmltodict,
    and doesn't need init 'StanfordCoreNLP' class.)

    :param input_folder: directory whose files will all be parsed
    :param corenlp_path: directory containing the CoreNLP jars
    :param memory: JVM heap limit such as "3g"
    :raises Exception: if input_folder does not exist
    """
    if not os.path.exists(input_folder):
        # clearer than the original "Not exist input_folder" message
        raise Exception("input_folder does not exist: %s" % input_folder)

    return parse_xml_output(input_folder, corenlp_path, memory)
294423
295424
296425if __name__ == '__main__' :
@@ -305,6 +434,8 @@ def parse(self, text):
305434 help = 'Host to serve on (default localhost; 0.0.0.0 to make public)' )
306435 parser .add_option ('-S' , '--corenlp' , default = "stanford-corenlp-full-2013-04-04" ,
307436 help = 'Stanford CoreNLP tool directory (default stanford-corenlp-full-2013-04-04/)' )
437+ parser .add_option ('-x' , '--xml' , action = "store_true" ,
438+ help = "Using XML format for read CoreNLP outputs (default false, but the option will be true on the future)" )
308439 options , args = parser .parse_args ()
309440 # server = jsonrpc.Server(jsonrpc.JsonRpc20(),
310441 # jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))
0 commit comments