Skip to content

Commit df2f2ed

Browse files
committed
Resolve issue #1
1 parent c4ed078 commit df2f2ed

File tree

3 files changed

+178
-40
lines changed

3 files changed

+178
-40
lines changed

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# A Python wrapper of Stanford Core NLP tools
2-
2+
---------------------------
33

44
This is a fork of [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python).
55

@@ -132,6 +132,10 @@ tar xvfz WNprolog-3.0.tar.gz
132132
133133
-->
134134

135+
## Developer
136+
* Hiroyoshi Komatsu <hiroyoshi.komat at gmail.com>
137+
* Johannes Castner
138+
135139

136140
Following original README in stanford-corenlp-python.
137141

corenlp/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,7 @@
99
__author__ = 'Hiroyoshi Komatsu'
1010
__license__ = 'GNU v2+'
1111

12+
# classes
1213
from corenlp import StanfordCoreNLP, ParserError, TimeoutError, ProcessError
14+
# functions
15+
from corenlp import batch_parse, parse_parser_xml_results

corenlp/corenlp.py

Lines changed: 170 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
import json, optparse, os, re, sys, time, traceback
2323
import pexpect
24+
import tempfile
25+
import shutil
2426
from progressbar import ProgressBar, Fraction
2527
from unidecode import unidecode
2628
from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer
@@ -30,6 +32,15 @@
3032
WORD_PATTERN = re.compile('\[([^\]]+)\]')
3133
CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\)\) -> \((\d*),(\d)*,\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"")
3234

35+
class bc:
    """ANSI escape sequences for colorizing terminal output.

    Wrap a message between one of the color codes and ``ENDC`` to print
    it highlighted, e.g. ``bc.WARNING + msg + bc.ENDC``.
    """
    HEADER = '\033[95m'   # magenta
    OKBLUE = '\033[94m'   # blue
    OKGREEN = '\033[92m'  # green
    WARNING = '\033[93m'  # yellow
    FAIL = '\033[91m'     # red
    ENDC = '\033[0m'      # reset all attributes to the terminal default
42+
43+
3344
class ProcessError(Exception):
3445
def __init__(self, value):
3546
self.value = value
@@ -49,6 +60,48 @@ def __str__(self):
4960
return repr(self.value)
5061

5162

63+
def init_corenlp_command(corenlp_path, memory):
    """
    Checks the location of the jar files.
    Spawns the server as a process.

    Builds and returns the full java command line used to launch the
    Stanford CoreNLP pipeline: java [-Xmx<memory>] -cp <jars> <classname>
    -props <properties file>.  Raises Exception when the properties file
    or any required jar cannot be found.
    """
    # TODO: Can edit jar constants
    jar_names = [
        "stanford-corenlp-1.3.5.jar",
        "stanford-corenlp-1.3.5-models.jar",
        "xom.jar",
        "joda-time.jar",
        "jollyday.jar",
    ]

    java_path = "java"
    classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"

    # include the properties file, so you can change defaults
    # but any changes in output format will break parse_parser_results()
    # Look in the working directory first, then next to this module.
    property_name = "default.properties"
    current_dir_pr = os.path.dirname(os.path.abspath(__file__)) + "/" + property_name
    if os.path.exists(property_name):
        props = "-props %s" % (property_name)
    elif os.path.exists(current_dir_pr):
        props = "-props %s" % (current_dir_pr)
    else:
        raise Exception("Error! Cannot locate: default.properties")

    # add and check classpaths
    classpath = []
    for jar_name in jar_names:
        jar = corenlp_path + "/" + jar_name
        if not os.path.exists(jar):
            raise Exception("Error! Cannot locate: %s" % jar)
        classpath.append(jar)

    # add memory limit on JVM
    limit = "-Xmx%s" % memory if memory else ""

    return "%s %s -cp %s %s %s" % (java_path, limit, ':'.join(classpath), classname, props)
103+
104+
52105
def remove_id(word):
53106
"""Removes the numeric suffix from the parsed recognized words: e.g. 'word-2' > 'word' """
54107
return word.count("-") == 0 and word or word[0:word.rindex("-")]
@@ -133,8 +186,105 @@ def parse_parser_results(text):
133186

134187
return results
135188

189+
def parse_parser_xml_results(xml):
    """Parse CoreNLP's XML output into the same dict structure that
    parse_parser_results() builds from the interactive output.

    :param xml: the raw XML text produced by the CoreNLP batch pipeline
    :return: dict with two keys:
        'coref'     -- list of coreference chains; each entry pairs a
                       mention with the chain's first (representative)
                       mention as [text, sentence, head, start, end],
                       all indices converted to be 0-based
        'sentences' -- one dict per sentence with 'dependencies', 'text',
                       'parsetree' and 'words' entries

    Fixes vs. the original: XML attribute values are converted with int()
    instead of eval() (eval on parsed input is unsafe and slow); list
    comprehensions replace map()/xrange so the result is JSON-serializable
    on both Python 2 and 3; documents with a single sentence/chain or with
    no coreference section no longer crash (xmltodict returns a bare dict,
    not a one-element list, for single elements).
    """
    import xmltodict
    from collections import OrderedDict

    def extract_words_from_xml(sent_node):
        # each <token> element carries the surface form in <word>
        return [token['word'] for token in sent_node['tokens']['token']]

    def as_list(node):
        # xmltodict yields a bare dict when an element occurs once;
        # normalize to a list so the loops below always work
        return node if isinstance(node, list) else [node]

    # turning the raw xml into a raw python dictionary:
    raw_dict = xmltodict.parse(xml)
    document = raw_dict[u'root'][u'document']

    # making a raw sentence list of dictionaries:
    raw_sent_list = as_list(document[u'sentences'][u'sentence'])

    # making a raw coref list; the <coreference> section is absent when
    # the text contains no coreferring mentions
    coref_section = document.get(u'coreference')
    if coref_section:
        raw_coref_list = as_list(coref_section[u'coreference'])
    else:
        raw_coref_list = []

    # CoreNLP's indices are 1-based; decrement to match list indexing
    coref_index = [[[int(mention['sentence']) - 1,
                     int(mention['head']) - 1,
                     int(mention['start']) - 1,
                     int(mention['end']) - 1]
                    for mention in as_list(chain[u'mention'])]
                   for chain in raw_coref_list]

    # prepend the mention's surface text to each [sentence, head, start, end]
    coref_list = []
    for j, chain in enumerate(coref_index):
        coref_list.append(chain)
        for k, coref in enumerate(chain):
            tokens = raw_sent_list[coref[0]]['tokens']['token'][coref[2]:coref[3]]
            words = [t['word'] for t in tokens]
            coref_list[j][k].insert(0, ' '.join(words))

    # pair every non-first mention with the chain's representative mention,
    # mirroring the pair format of the command-line output
    coref_list = [[[chain[i], chain[0]]
                   for i in range(len(chain)) if i != 0]
                  for chain in coref_list]

    sentences = []
    for raw_sent in raw_sent_list:
        # keep only the basic (non-collapsed) dependency representation
        dependencies = [[dep['dep'][i]['@type'],
                         dep['dep'][i]['governor']['#text'],
                         dep['dep'][i]['dependent']['#text']]
                        for dep in raw_sent[u'dependencies']
                        for i in range(len(dep['dep']))
                        if dep['@type'] == 'basic-dependencies']
        # NOTE(review): str() on token text can raise UnicodeEncodeError on
        # non-ASCII input under Python 2 — kept for output compatibility
        words = [[str(token['word']), OrderedDict([
                     ('NamedEntityTag', str(token['NER'])),
                     ('CharacterOffsetEnd', str(token['CharacterOffsetEnd'])),
                     ('CharacterOffsetBegin', str(token['CharacterOffsetBegin'])),
                     ('PartOfSpeech', str(token['POS'])),
                     ('Lemma', str(token['lemma']))])]
                 for token in raw_sent['tokens'][u'token']]
        sentences.append({'dependencies': dependencies,
                          'text': extract_words_from_xml(raw_sent),
                          'parsetree': str(raw_sent['parse']),
                          'words': words})

    return {'coref': coref_list, 'sentences': sentences}
248+
249+
def parse_xml_output(input_dir, corenlp_path="stanford-corenlp-full-2013-04-04/", memory="3g"):
    """Because interaction with the command-line interface of the CoreNLP
    tools is limited to very short text bits, it is necessary to parse xml
    output.

    Runs CoreNLP in batch mode over every file in input_dir, letting it
    write one XML file per input into a temporary directory, then yields
    one parsed result dict (see parse_parser_xml_results) per output file.
    Temporary files are cleaned up when the generator is exhausted or
    closed.
    """
    # scratch directory for the parser's XML output, plus a temp file
    # holding the list of inputs handed to CoreNLP via -filelist
    xml_dir = tempfile.mkdtemp()
    file_list = tempfile.NamedTemporaryFile()

    # collect the cleaned files that we want to parse
    input_files = [input_dir + '/' + name for name in os.listdir(input_dir)]

    # write the file list and rewind so the parser reads from the start
    file_list.write('\n'.join(input_files))
    file_list.seek(0)

    command = init_corenlp_command(corenlp_path, memory)\
        + ' -filelist %s -outputDirectory %s' % (file_list.name, xml_dir)

    # run the parser; it creates the xml files of parser output
    os.system(command)

    try:
        # read back every raw xml file the parser produced
        for output_name in os.listdir(xml_dir):
            with open(xml_dir + '/' + output_name, 'r') as xml_file:
                yield parse_parser_xml_results(xml_file.read())
    finally:
        # best-effort cleanup of the temp file list and output directory
        file_list.close()
        try:
            shutil.rmtree(xml_dir)
        except: pass
286+
287+
class StanfordCoreNLP:
138288
"""
139289
Command-line interaction with Stanford's CoreNLP java utilities.
140290
Can be run as a JSON-RPC server or imported as a module.
@@ -145,44 +295,8 @@ def __init__(self, corenlp_path="stanford-corenlp-full-2013-04-04/", memory="3g"
145295
Spawns the server as a process.
146296
"""
147297

148-
# TODO: Can edit jar constants
149-
# jars = ["stanford-corenlp-1.3.5.jar",
150-
# "stanford-corenlp-1.3.5-models.jar",
151-
# "joda-time.jar",
152-
# "xom.jar"]
153-
jars = ["stanford-corenlp-1.3.5.jar",
154-
"stanford-corenlp-1.3.5-models.jar",
155-
"xom.jar",
156-
"joda-time.jar",
157-
"jollyday.jar"]
158-
159-
java_path = "java"
160-
classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
161-
# include the properties file, so you can change defaults
162-
# but any changes in output format will break parse_parser_results()
163-
property_name = "default.properties"
164-
current_dir_pr = os.path.dirname(os.path.abspath( __file__ )) +"/"+ property_name
165-
if os.path.exists(property_name):
166-
props = "-props %s" % (property_name)
167-
elif os.path.exists(current_dir_pr):
168-
props = "-props %s" % (current_dir_pr)
169-
else:
170-
raise Exception("Error! Cannot locate: default.properties")
171-
172-
# add and check classpaths
173-
jars = [corenlp_path +"/"+ jar for jar in jars]
174-
for jar in jars:
175-
if not os.path.exists(jar):
176-
raise Exception("Error! Cannot locate: %s" % jar)
177-
178-
# add memory limit on JVM
179-
if memory:
180-
limit = "-Xmx%s" % memory
181-
else:
182-
limit = ""
183-
184298
# spawn the server
185-
start_corenlp = "%s %s -cp %s %s %s" % (java_path, limit, ':'.join(jars), classname, props)
299+
start_corenlp = init_corenlp_command(corenlp_path, memory)
186300
if VERBOSE: print start_corenlp
187301
self.corenlp = pexpect.spawn(start_corenlp)
188302

@@ -290,7 +404,22 @@ def parse(self, text):
290404
reads in the result, parses the results and returns a list
291405
with one dictionary entry for each parsed sentence, in JSON format.
292406
"""
293-
return json.dumps(self._parse(text))
407+
return json.dumps(self.raw_parse(text))
408+
409+
410+
def batch_parse(input_folder, corenlp_path="stanford-corenlp-full-2013-04-04/", memory="3g"):
    """
    This function takes input files,
    sends list of input files to the Stanford parser,
    reads in the results from temporary folder in your OS and
    returns a generator object of list that consist of dictionary entry.
    ( The function needs xmltodict,
    and doesn't need init 'StanfordCoreNLP' class. )

    :param input_folder: directory containing the plain-text files to parse
    :param corenlp_path: root directory of the CoreNLP distribution
    :param memory: JVM heap limit passed as -Xmx (e.g. "3g")
    :return: generator yielding one result dict per input file
    :raises Exception: if input_folder does not exist
    """
    # Fail fast with a clear message (the original said "Not exist
    # input_folder" and omitted the offending path) instead of letting
    # os.listdir blow up later inside the generator.
    if not os.path.exists(input_folder):
        raise Exception("input_folder does not exist: %s" % input_folder)

    return parse_xml_output(input_folder, corenlp_path, memory)
294423

295424

296425
if __name__ == '__main__':
@@ -305,6 +434,8 @@ def parse(self, text):
305434
help='Host to serve on (default localhost; 0.0.0.0 to make public)')
306435
parser.add_option('-S', '--corenlp', default="stanford-corenlp-full-2013-04-04",
307436
help='Stanford CoreNLP tool directory (default stanford-corenlp-full-2013-04-04/)')
437+
parser.add_option('-x', '--xml', action="store_true",
438+
help="Using XML format for read CoreNLP outputs (default false, but the option will be true on the future)")
308439
options, args = parser.parse_args()
309440
# server = jsonrpc.Server(jsonrpc.JsonRpc20(),
310441
# jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))

0 commit comments

Comments
 (0)