From e72ce4074c67f05de9d320d785bdf06481fbc2da Mon Sep 17 00:00:00 2001
From: Ian MacFarland
Date: Fri, 12 Jun 2015 17:40:01 -0700
Subject: [PATCH 1/4] handle different text output from depparse annotator

---
 corenlp/corenlp.py         | 13 +++++--------
 corenlp/default.properties |  6 +++++-
 2 files changed, 10 insertions(+), 9 deletions(-)
 mode change 100755 => 100644 corenlp/corenlp.py

diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py
old mode 100755
new mode 100644
index 4592acc..f2c9a16
--- a/corenlp/corenlp.py
+++ b/corenlp/corenlp.py
@@ -153,7 +153,8 @@ def parse_parser_results(text):
     """
     results = {"sentences": []}
     state = STATE_START
-    for line in unidecode(text.decode('utf-8')).split("\n"):
+    lines = unidecode(text.decode('utf-8')).split("\n")
+    for index, line in enumerate(lines):
         line = line.strip()
 
         if line.startswith("Sentence #"):
@@ -170,15 +171,11 @@
                 raise ParserError('Parse error. Could not find "[Text=" in: %s' % line)
             for s in WORD_PATTERN.findall(line):
                 sentence['words'].append(parse_bracketed(s))
-            state = STATE_TREE
-
-        elif state == STATE_TREE:
-            if len(line) == 0:
+            if not lines[index + 1].startswith("[Text="):
                 state = STATE_DEPENDENCY
-                sentence['parsetree'] = " ".join(sentence['parsetree'])
-            else:
-                sentence['parsetree'].append(remove_escapes(line))
+            # skipping TREE because the new depparse annotator doesn't make a parse tree
+
 
         elif state == STATE_DEPENDENCY:
             if len(line) == 0:
                 state = STATE_COREFERENCE

diff --git a/corenlp/default.properties b/corenlp/default.properties
index c475c92..70ac093 100644
--- a/corenlp/default.properties
+++ b/corenlp/default.properties
@@ -1,4 +1,8 @@
-annotators = tokenize, ssplit, pos, lemma, parse
+annotators = tokenize, ssplit, pos, lemma, depparse
+
+# specify Stanford Dependencies format for backwards compatibility
+# (new default is Universal Dependencies in 3.5.2)
+depparse.model = edu/stanford/nlp/models/parser/nndep/english_SD.gz
 
 # A true-casing annotator is also available (see below)
 #annotators = tokenize, ssplit, pos, lemma, truecase

From 9bd284ec168aec7624c4916fd5e1afec55f587e9 Mon Sep 17 00:00:00 2001
From: Ian MacFarland
Date: Fri, 12 Jun 2015 17:49:06 -0700
Subject: [PATCH 2/4] update README

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f0fb897..4771019 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,11 @@
 # A Python wrapper for the Java Stanford Core NLP tools
 
-This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either use as python package, or run as a JSON-RPC server.
+This is a Wordseer-specific fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either use as python package, or run as a JSON-RPC server.
 
 ## Edited
+ * Tested only with the current annotator configuration: not a general-purpose wrapper
+ * Update to Stanford CoreNLP v3.5.2
  * Added multi-threaded load balancing
- * Update to Stanford CoreNLP v3.2.0
  * Fix many bugs & improve performance
  * Using jsonrpclib for stability and performance
  * Can edit the constants as argument such as Stanford Core NLP directory
@@ -164,4 +165,5 @@ The function uses XML output feature of Stanford CoreNLP, and you can take all i
  * Robert Elwell [robert@wikia-inc.com]
  * Tristan Chong [tristan@wikia-inc.com]
  * Aditi Muralidharan [aditi.shrikumar@gmail.com]
+ * Ian MacFarland [ianmacfarland@ischool.berkeley.edu]
 

From 34ed4b6ff74e2da7892749b3c05c1de115fe282e Mon Sep 17 00:00:00 2001
From: Ian MacFarland
Date: Fri, 12 Jun 2015 17:51:37 -0700
Subject: [PATCH 3/4] increment version

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index badbb0a..869427c 100644
--- a/setup.py
+++ b/setup.py
@@ -4,10 +4,10 @@
 PACKAGE = "corenlp"
 NAME = "stanford-corenlp-python"
 DESCRIPTION = "A Stanford Core NLP wrapper (wordseer fork)"
-AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan"
+AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan, Ian MacFarland"
 AUTHOR_EMAIL = "aditi.shrikumar@gmail.com"
 URL = "https://github.com/Wordseer/stanford-corenlp-python"
-VERSION = "3.3.9"
+VERSION = "3.3.10"
 
 INSTALLATION_REQS = ["unidecode >= 0.04.12", "xmltodict >= 0.4.6"]
 PEXPECT = "pexpect >= 2.4"

From 6030814dc624b63ce2eef4b2fe0c88e12e002df8 Mon Sep 17 00:00:00 2001
From: Ian MacFarland
Date: Sat, 13 Jun 2015 15:28:14 -0700
Subject: [PATCH 4/4] readme update

---
 README.md | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/README.md b/README.md
index 4771019..e3a1400 100644
--- a/README.md
+++ b/README.md
@@ -22,15 +22,6 @@ This is a Wordseer-specific fork of Dustin Smith's [stanford-corenlp-python](htt
 
 To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the zip file containing Stanford's CoreNLP package. By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run.
 
-
-In other words:
-
-    sudo pip install pexpect unidecode jsonrpclib # jsonrpclib is optional
-    git clone https://bitbucket.org/torotoki/corenlp-python.git
-    cd corenlp-python
-    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2013-06-20.zip
-    unzip stanford-corenlp-full-2013-06-20.zip
-
 Then, to launch a server:
 
     python corenlp/corenlp.py
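A note for reviewers, outside the patch series itself: once a server has been started with `python corenlp/corenlp.py` (as the README still describes after patch 4), the depparse pipeline from patch 1 can be smoke-tested over JSON-RPC. The sketch below is only an illustration, not part of the patches; the port and the result keys ('words', 'parsetree', 'dependencies') follow the upstream README and the parser code touched in patch 1, and may differ in a given checkout.

    # Illustrative check of the depparse pipeline via the JSON-RPC server that
    # "python corenlp/corenlp.py" starts. The port and result keys are assumptions
    # based on the upstream README and on parse_parser_results() in patch 1.
    import json

    import jsonrpclib

    server = jsonrpclib.Server("http://localhost:8080")
    result = json.loads(server.parse("Stanford parses sentences."))

    sentence = result["sentences"][0]
    # With annotators = tokenize, ssplit, pos, lemma, depparse there is no
    # constituency tree, so 'parsetree' should stay empty while 'words' and the
    # dependencies (Stanford Dependencies, via the english_SD.gz model) are filled.
    print(sentence["words"])
    print(sentence.get("dependencies"))
    print(sentence.get("parsetree"))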