Merge pull request #4 from Wordseer/update-to-coreNLP-3.5.2

macfarlandian · macfarlandian · commit eb071def8fd0 · 2015-06-13T15:46:56.000-07:00
Update to coreNLP 3.5.2
diff --git a/README.md b/README.md
@@ -1,10 +1,11 @@
 # A Python wrapper for the Java Stanford Core NLP tools
 
-This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either use as python package, or run as a JSON-RPC server.
+This is a Wordseer-specific fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either be used as a Python package or run as a JSON-RPC server.
 
 ## Edited
+   * Tested only with the current annotator configuration: not a general-purpose wrapper
+   * Update to Stanford CoreNLP v3.5.2
    * Added multi-threaded load balancing
-   * Update to Stanford CoreNLP v3.2.0
    * Fix many bugs & improve performance
    * Using jsonrpclib for stability and performance
    * Can edit the constants as argument such as Stanford Core NLP directory
@@ -21,15 +22,6 @@ This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/da
 
 To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the zip file containing Stanford's CoreNLP package.  By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run.
 
-
-In other words:
-
-    sudo pip install pexpect unidecode jsonrpclib   # jsonrpclib is optional
-    git clone https://bitbucket.org/torotoki/corenlp-python.git
-	  cd corenlp-python
-    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2013-06-20.zip
-    unzip stanford-corenlp-full-2013-06-20.zip
-
 Then, to launch a server:
 
     python corenlp/corenlp.py
@@ -164,4 +156,5 @@ The function uses XML output feature of Stanford CoreNLP, and you can take all i
    * Robert Elwell [robert@wikia-inc.com]
    * Tristan Chong [tristan@wikia-inc.com]
    * Aditi Muralidharan [aditi.shrikumar@gmail.com]
+   * Ian MacFarland [ianmacfarland@ischool.berkeley.edu]
 
diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py
@@ -153,7 +153,8 @@ def parse_parser_results(text):
     """
     results = {"sentences": []}
     state = STATE_START
-    for line in unidecode(text.decode('utf-8')).split("\n"):
+    lines = unidecode(text.decode('utf-8')).split("\n")
+    for index, line in enumerate(lines):
         line = line.strip()
 
         if line.startswith("Sentence #"):
@@ -170,15 +171,11 @@ def parse_parser_results(text):
                 raise ParserError('Parse error. Could not find "[Text=" in: %s' % line)
             for s in WORD_PATTERN.findall(line):
                 sentence['words'].append(parse_bracketed(s))
-            state = STATE_TREE
-
-        elif state == STATE_TREE:
-            if len(line) == 0:
+            if not lines[index + 1].startswith("[Text="):
                 state = STATE_DEPENDENCY
-                sentence['parsetree'] = " ".join(sentence['parsetree'])
-            else:
-                sentence['parsetree'].append(remove_escapes(line))
+                # skipping TREE because the new depparse annotator doesn't make a parse tree
 
+        
         elif state == STATE_DEPENDENCY:
             if len(line) == 0:
                 state = STATE_COREFERENCE
diff --git a/corenlp/default.properties b/corenlp/default.properties
@@ -1,4 +1,8 @@
-annotators = tokenize, ssplit, pos, lemma, parse
+annotators = tokenize, ssplit, pos, lemma, depparse
+
+# specify Stanford Dependencies format for backwards compatibility
+# (new default is Universal Dependencies in 3.5.2)
+depparse.model = edu/stanford/nlp/models/parser/nndep/english_SD.gz
 
 # A true-casing annotator is also available (see below)
 #annotators = tokenize, ssplit, pos, lemma, truecase
diff --git a/setup.py b/setup.py
@@ -4,10 +4,10 @@
 PACKAGE = "corenlp"
 NAME = "stanford-corenlp-python"
 DESCRIPTION = "A Stanford Core NLP wrapper (wordseer fork)"
-AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan"
+AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan, Ian MacFarland"
 AUTHOR_EMAIL = "aditi.shrikumar@gmail.com"
 URL = "https://github.com/Wordseer/stanford-corenlp-python"
-VERSION = "3.3.9"
+VERSION = "3.3.10"
 INSTALLATION_REQS = ["unidecode >= 0.04.12", "xmltodict >= 0.4.6"]
 
 PEXPECT = "pexpect >= 2.4"