diff --git a/README.md b/README.md index f0fb897..e3a1400 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ # A Python wrapper for the Java Stanford Core NLP tools -This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either use as python package, or run as a JSON-RPC server. +This is a Wordseer-specific fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either be used as a Python package or run as a JSON-RPC server. ## Edited + * Tested only with the current annotator configuration: not a general-purpose wrapper + * Update to Stanford CoreNLP v3.5.2 * Added multi-threaded load balancing - * Update to Stanford CoreNLP v3.2.0 * Fix many bugs & improve performance * Using jsonrpclib for stability and performance * Can edit the constants as argument such as Stanford Core NLP directory @@ -21,15 +22,6 @@ This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/da To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the zip file containing Stanford's CoreNLP package. By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run. 
- -In other words: - - sudo pip install pexpect unidecode jsonrpclib # jsonrpclib is optional - git clone https://bitbucket.org/torotoki/corenlp-python.git - cd corenlp-python - wget http://nlp.stanford.edu/software/stanford-corenlp-full-2013-06-20.zip - unzip stanford-corenlp-full-2013-06-20.zip - Then, to launch a server: python corenlp/corenlp.py @@ -164,4 +156,5 @@ The function uses XML output feature of Stanford CoreNLP, and you can take all i * Robert Elwell [robert@wikia-inc.com] * Tristan Chong [tristan@wikia-inc.com] * Aditi Muralidharan [aditi.shrikumar@gmail.com] + * Ian MacFarland [ianmacfarland@ischool.berkeley.edu] diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py old mode 100755 new mode 100644 index 4592acc..f2c9a16 --- a/corenlp/corenlp.py +++ b/corenlp/corenlp.py @@ -153,7 +153,8 @@ def parse_parser_results(text): """ results = {"sentences": []} state = STATE_START - for line in unidecode(text.decode('utf-8')).split("\n"): + lines = unidecode(text.decode('utf-8')).split("\n") + for index, line in enumerate(lines): line = line.strip() if line.startswith("Sentence #"): @@ -170,15 +171,11 @@ def parse_parser_results(text): raise ParserError('Parse error. 
Could not find "[Text=" in: %s' % line) for s in WORD_PATTERN.findall(line): sentence['words'].append(parse_bracketed(s)) - state = STATE_TREE - - elif state == STATE_TREE: - if len(line) == 0: + if not lines[index + 1].startswith("[Text="): state = STATE_DEPENDENCY - sentence['parsetree'] = " ".join(sentence['parsetree']) - else: - sentence['parsetree'].append(remove_escapes(line)) + # skipping TREE because the new depparse annotator doesn't make a parse tree + elif state == STATE_DEPENDENCY: if len(line) == 0: state = STATE_COREFERENCE diff --git a/corenlp/default.properties b/corenlp/default.properties index c475c92..70ac093 100644 --- a/corenlp/default.properties +++ b/corenlp/default.properties @@ -1,4 +1,8 @@ -annotators = tokenize, ssplit, pos, lemma, parse +annotators = tokenize, ssplit, pos, lemma, depparse + +# specify Stanford Dependencies format for backwards compatibility +# (new default is Universal Dependencies in 3.5.2) +depparse.model = edu/stanford/nlp/models/parser/nndep/english_SD.gz # A true-casing annotator is also available (see below) #annotators = tokenize, ssplit, pos, lemma, truecase diff --git a/setup.py b/setup.py index badbb0a..869427c 100644 --- a/setup.py +++ b/setup.py @@ -4,10 +4,10 @@ PACKAGE = "corenlp" NAME = "stanford-corenlp-python" DESCRIPTION = "A Stanford Core NLP wrapper (wordseer fork)" -AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan" +AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan, Ian MacFarland" AUTHOR_EMAIL = "aditi.shrikumar@gmail.com" URL = "https://github.com/Wordseer/stanford-corenlp-python" -VERSION = "3.3.9" +VERSION = "3.3.10" INSTALLATION_REQS = ["unidecode >= 0.04.12", "xmltodict >= 0.4.6"] PEXPECT = "pexpect >= 2.4"