diff --git a/README.md b/README.md
index c1eddc5..c0c991f 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ Parameters: CORPUS_PATH: path to an existing corpus (CProject)
labels_to_get: spaCy recognizes Named Entities and labels them. You can choose the labels you are interested in by providing them as a list. For all available labels, check out the Tools Used section.
```
## How to run?
-We have created `demo.py` where you can run the package.
+We have created `demo.py` in the `examples` folder, which demonstrates how to run the package.
```
import os
@@ -74,12 +74,12 @@ with open('GPE.text', 'w') as f:
f.write(str(list_with_gpe))
```
To break this down,
-|Variable snippet |What is it? |
-|----------------------|----------------|
-|`essential oil AND chemical composition` |Query to `pygetpapers` (EPMC default)|
-|`100` |number of hits |
-|stem_cell_research_300|Output directory|
-|"ethics_dictionary", "ethics_key_phrases", "ethics_key_phrases.xml" |dictionary path |
+| Variable snippet | What is it? |
+| ------------------------------------------------------------------- | ------------------------------------- |
+| `essential oil AND chemical composition` | Query to `pygetpapers` (EPMC default) |
+| `100` | number of hits |
+| stem_cell_research_300 | Output directory |
+| "ethics_dictionary", "ethics_key_phrases", "ethics_key_phrases.xml" | dictionary path |
## What is a dictionary
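
The README hunk above shows only the start and end of the demo; for orientation, here is a minimal sketch of the call that `examples/demo.py` wraps, assuming the keyword signature of `extract_entities_from_papers` introduced further down in this diff. The paths, query and hit count are illustrative placeholders taken from the README table, not values fixed by the code.

```
# Hedged sketch, not part of the diff: driving the refactored API directly.
# Paths, query and hits are placeholders; only the keyword names come from
# the signature in docanalysis/entity_extraction.py below.
import os

from docanalysis.entity_extraction import EntityExtraction

entity_extraction = EntityExtraction()
results = entity_extraction.extract_entities_from_papers(
    corpus_path=os.path.join(os.getcwd(), "stem_cell_research_300"),
    terms_xml_path=os.path.join(
        "ethics_dictionary", "ethics_key_phrases", "ethics_key_phrases.xml"
    ),
    query="essential oil AND chemical composition",  # EPMC query, as in the table above
    hits=100,
    make_project=True,  # run pygetpapers first to build the CProject
    labels_to_get=["GPE", "ORG"],
    make_ami_dict=False,
)
```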
diff --git a/__init__.py b/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/docanalysis/__init__.py b/docanalysis/__init__.py
index fc80254..2ae2839 100644
--- a/docanalysis/__init__.py
+++ b/docanalysis/__init__.py
@@ -1 +1 @@
-pass
\ No newline at end of file
+pass
diff --git a/docanalysis/docanalysis.py b/docanalysis/docanalysis.py
index b52d7ec..2caea84 100644
--- a/docanalysis/docanalysis.py
+++ b/docanalysis/docanalysis.py
@@ -1,19 +1,21 @@
-import os
import logging
+import os
import sys
-import configargparse
-import coloredlogs
+from functools import partialmethod
from time import gmtime, strftime
+
+import coloredlogs
+import configargparse
from tqdm import tqdm
-from functools import partialmethod
+
from docanalysis.entity_extraction import EntityExtraction
-class Docanalysis:
+class Docanalysis:
def __init__(self):
"""This function makes all the constants"""
self.entity_extraction = EntityExtraction()
- self.version="0.0.3"
+ self.version = "0.0.3"
def handle_logger_creation(self, args):
"""[summary]
@@ -38,7 +40,7 @@ def handle_logger_creation(self, args):
if args.logfile:
self.handle_logfile(args, level)
else:
- coloredlogs.install(level=level, fmt='%(levelname)s: %(message)s')
+ coloredlogs.install(level=level, fmt="%(levelname)s: %(message)s")
def handlecli(self):
"""Handles the command line interface using argparse"""
@@ -85,7 +87,7 @@ def handlecli(self):
parser.add_argument(
"--entity_extraction",
default=False,
- nargs='+',
+ nargs="+",
help="extracts specified entities chosen from a list of entities (CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART, GGP, SO, TAXON, CHEBI, GO, CL)",
)
parser.add_argument(
@@ -121,7 +123,6 @@ def handlecli(self):
help="[All] save log to specified file in output directory as well as printing to terminal",
)
-
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit()
@@ -130,10 +131,19 @@ def handlecli(self):
if vars(args)[arg] == "False":
vars(args)[arg] = False
self.handle_logger_creation(args)
- self.entity_extraction.extract_entities_from_papers(args.project_name,args.dictionary,query=args.query,hits=args.hits,
- make_project=args.run_pygetpapers, install_ami=False, removefalse=True, create_csv=True,
- csv_name=args.output, labels_to_get=args.entity_extraction,make_ami_dict=args.make_ami_dict)
-
+ self.entity_extraction.extract_entities_from_papers(
+ args.project_name,
+ args.dictionary,
+ query=args.query,
+ hits=args.hits,
+ make_project=args.run_pygetpapers,
+ install_ami=False,
+ removefalse=True,
+ create_csv=True,
+ csv_name=args.output,
+ labels_to_get=args.entity_extraction,
+ make_ami_dict=args.make_ami_dict,
+ )
def main():
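
Two details of `handlecli()` above are easy to miss: `--entity_extraction` accepts several labels via `nargs="+"`, and after parsing, every argument whose value is the literal string `"False"` is coerced back to a boolean. Below is a self-contained sketch of that behaviour; it uses plain `argparse` rather than `configargparse`, and `--make_ami_dict` is an assumed flag name inferred from `args.make_ami_dict`, since only `--entity_extraction` and `--logfile` appear verbatim in this hunk.

```
# Hedged sketch (not the docanalysis CLI itself) of the nargs="+" option and
# the post-parse "False" -> False coercion used in handlecli() above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--entity_extraction", default=False, nargs="+")
parser.add_argument("--make_ami_dict", default=False)  # assumed flag name

args = parser.parse_args(
    ["--entity_extraction", "GPE", "ORG", "--make_ami_dict", "False"]
)
for arg in vars(args):
    if vars(args)[arg] == "False":
        vars(args)[arg] = False

print(args.entity_extraction)  # ['GPE', 'ORG']
print(args.make_ami_dict)      # False (a boolean, not the string "False")
```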
diff --git a/docanalysis/entity_extraction.py b/docanalysis/entity_extraction.py
index b4cce42..88659d6 100644
--- a/docanalysis/entity_extraction.py
+++ b/docanalysis/entity_extraction.py
@@ -1,19 +1,21 @@
-import os
import logging
+import os
+import xml.etree.ElementTree as ET
from glob import glob
-import spacy
+
import pandas as pd
+import spacy
from bs4 import BeautifulSoup
-from tqdm import tqdm
-import xml.etree.ElementTree as ET
from nltk import tokenize
+from tqdm import tqdm
try:
- nlp = spacy.load('en_core_web_sm')
+ nlp = spacy.load("en_core_web_sm")
except OSError:
from spacy.cli import download
- download('en_core_web_sm')
- nlp = spacy.load('en_core_web_sm')
+
+ download("en_core_web_sm")
+ nlp = spacy.load("en_core_web_sm")
class EntityExtraction:
@@ -23,15 +25,28 @@ def __init__(self):
self.labels_to_get = []
logging.basicConfig(level=logging.INFO)
- def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, hits=30,
- make_project=False, install_ami=False, removefalse=True, create_csv=True,
- csv_name='entities.csv', labels_to_get=['GPE', 'ORG'],make_ami_dict=False):
+ def extract_entities_from_papers(
+ self,
+ corpus_path,
+ terms_xml_path,
+ query=None,
+ hits=30,
+ make_project=False,
+ install_ami=False,
+ removefalse=True,
+ create_csv=True,
+ csv_name="entities.csv",
+ labels_to_get=["GPE", "ORG"],
+ make_ami_dict=False,
+ ):
self.labels_to_get = labels_to_get
if make_project:
if not query:
- logging.warning('Please provide query as parameter')
+ logging.warning("Please provide query as parameter")
return
- logging.info(f"making project/searching {query} for {hits} hits into {corpus_path}")
+ logging.info(
+ f"making project/searching {query} for {hits} hits into {corpus_path}"
+ )
self.create_project_files(query, hits, corpus_path)
if install_ami:
logging.info(f"installing ami3 (check whether this is a good idea)")
@@ -45,15 +60,19 @@ def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None,
if terms_xml_path:
terms = self.get_terms_from_ami_xml(terms_xml_path)
self.add_if_file_contains_terms(
- terms=terms, dict_with_parsed_xml=dict_with_parsed_xml)
+ terms=terms, dict_with_parsed_xml=dict_with_parsed_xml
+ )
if removefalse:
self.remove_statements_not_having_xmldict_terms_or_entities(
- dict_with_parsed_xml=dict_with_parsed_xml)
+ dict_with_parsed_xml=dict_with_parsed_xml
+ )
if create_csv:
self.convert_dict_to_csv(
- path=os.path.join(corpus_path, csv_name), dict_with_parsed_xml=dict_with_parsed_xml)
+ path=os.path.join(corpus_path, csv_name),
+ dict_with_parsed_xml=dict_with_parsed_xml,
+ )
if make_ami_dict:
- self.handle_ami_dict_creation(dict_with_parsed_xml,make_ami_dict)
+ self.handle_ami_dict_creation(dict_with_parsed_xml, make_ami_dict)
return dict_with_parsed_xml
def create_project_files(self, QUERY, HITS, OUTPUT):
@@ -68,8 +87,9 @@ def install_ami(self):
def make_dict_with_parsed_xml(self, output):
dict_with_parsed_xml = {}
- all_paragraphs = glob(os.path.join(
- output, '*', 'sections', '**', '[1_9]_p.xml'), recursive=True)
+ all_paragraphs = glob(
+ os.path.join(output, "*", "sections", "**", "[1_9]_p.xml"), recursive=True
+ )
counter = 1
logging.info(f"starting tokenization on {len(all_paragraphs)} paragraphs")
for section_path in tqdm(all_paragraphs):
@@ -90,11 +110,10 @@ def read_text_from_path(self, paragraph_path):
tree = ET.parse(paragraph_path)
root = tree.getroot()
try:
- xmlstr = ET.tostring(root, encoding='utf8', method='xml')
- soup = BeautifulSoup(xmlstr, features='lxml')
+ xmlstr = ET.tostring(root, encoding="utf8", method="xml")
+ soup = BeautifulSoup(xmlstr, features="lxml")
text = soup.get_text(separator="")
- paragraph_text = text.replace(
- '\n', '')
+ paragraph_text = text.replace("\n", "")
except:
paragraph_text = "empty"
return paragraph_text
@@ -102,31 +121,36 @@ def read_text_from_path(self, paragraph_path):
def add_parsed_sections_to_dict(self, dict_with_parsed_xml):
for paragraph in dict_with_parsed_xml:
- doc = nlp(dict_with_parsed_xml[paragraph]['sentence'])
+ doc = nlp(dict_with_parsed_xml[paragraph]["sentence"])
entities, labels, position_end, position_start = self.make_required_lists()
for ent in doc.ents:
self.add_parsed_entities_to_lists(
- entities, labels, position_end, position_start, ent)
- self.add_lists_to_dict(dict_with_parsed_xml[paragraph], entities, labels, position_end,
- position_start)
+ entities, labels, position_end, position_start, ent
+ )
+ self.add_lists_to_dict(
+ dict_with_parsed_xml[paragraph],
+ entities,
+ labels,
+ position_end,
+ position_start,
+ )
def add_if_file_contains_terms(self, terms, dict_with_parsed_xml):
for statement in dict_with_parsed_xml:
dict_for_sentence = dict_with_parsed_xml[statement]
- dict_for_sentence['has_terms'] = []
+ dict_for_sentence["has_terms"] = []
for term in terms:
- if term.lower().strip() in dict_for_sentence['sentence'].lower():
- dict_for_sentence['has_terms'].append(term)
- dict_for_sentence['weight'] = len(
- dict_for_sentence['has_terms'])
+ if term.lower().strip() in dict_for_sentence["sentence"].lower():
+ dict_for_sentence["has_terms"].append(term)
+ dict_for_sentence["weight"] = len(dict_for_sentence["has_terms"])
def get_terms_from_ami_xml(self, xml_path):
tree = ET.parse(xml_path)
root = tree.getroot()
terms = []
- for para in root.iter('entry'):
+ for para in root.iter("entry"):
terms.append(para.attrib["term"])
return terms
@@ -138,14 +162,18 @@ def make_required_lists(self):
position_end = []
return entities, labels, position_end, position_start
- def add_lists_to_dict(self, dict_for_sentence, entities, labels, position_end, position_start):
+ def add_lists_to_dict(
+ self, dict_for_sentence, entities, labels, position_end, position_start
+ ):
- dict_for_sentence['entities'] = entities
- dict_for_sentence['labels'] = labels
- dict_for_sentence['position_start'] = position_start
- dict_for_sentence['position_end'] = position_end
+ dict_for_sentence["entities"] = entities
+ dict_for_sentence["labels"] = labels
+ dict_for_sentence["position_start"] = position_start
+ dict_for_sentence["position_end"] = position_end
- def add_parsed_entities_to_lists(self, entities, labels, position_end, position_start, ent=None):
+ def add_parsed_entities_to_lists(
+ self, entities, labels, position_end, position_start, ent=None
+ ):
if ent.label_ in self.labels_to_get:
entities.append(ent)
labels.append(ent.label_)
@@ -158,20 +186,23 @@ def convert_dict_to_csv(self, path, dict_with_parsed_xml):
df = df.T
for col in df:
try:
- df[col] = df[col].astype(str).str.replace(
- "[", "").str.replace("]", "")
- df[col] = df[col].astype(str).str.replace(
- "'", "").str.replace("'", "")
+ df[col] = df[col].astype(str).str.replace("[", "").str.replace("]", "")
+ df[col] = df[col].astype(str).str.replace("'", "").str.replace("'", "")
except:
pass
- df.to_csv(path, encoding='utf-8', line_terminator='\r\n')
+ df.to_csv(path, encoding="utf-8", line_terminator="\r\n")
logging.info(f"wrote output to {path}")
- def remove_statements_not_having_xmldict_terms_or_entities(self, dict_with_parsed_xml):
+ def remove_statements_not_having_xmldict_terms_or_entities(
+ self, dict_with_parsed_xml
+ ):
statement_to_pop = []
for statement in dict_with_parsed_xml:
sentect_dict = dict_with_parsed_xml[statement]
- if len(sentect_dict['has_terms']) == 0 or len(sentect_dict['entities']) == 0:
+ if (
+ len(sentect_dict["has_terms"]) == 0
+ or len(sentect_dict["entities"]) == 0
+ ):
statement_to_pop.append(statement)
for term in statement_to_pop:
@@ -182,31 +213,31 @@ def extract_particular_fields(dict_with_parsed_xml, field):
field_list = []
for sentence in dict_with_parsed_xml:
sentect_dict = dict_with_parsed_xml[sentence]
- for entity, label in zip(sentect_dict['entities'], sentect_dict['labels']):
+ for entity, label in zip(sentect_dict["entities"], sentect_dict["labels"]):
if label == field:
if entity not in field_list:
field_list.append(entity)
return field_list
-    def make_ami_dict_from_list(self,list_of_terms,title):
-        xml_string=f'''<dictionary title="{title}">
+    def make_ami_dict_from_list(self, list_of_terms, title):
+        xml_string = f"""<dictionary title="{title}">
-        '''
+        """
         for term in list_of_terms:
-            xml_string+=f'''<entry term="{term}"/>
+            xml_string += f"""<entry term="{term}"/>
-            '''
-        xml_string+="</dictionary>"
+            """
+        xml_string += "</dictionary>"
         return xml_string
-
- def write_string_to_file(self,string_to_put,title):
- with open(f'{title}.xml',mode='w') as f:
+
+ def write_string_to_file(self, string_to_put, title):
+ with open(f"{title}.xml", mode="w") as f:
f.write(string_to_put)
-
- def handle_ami_dict_creation(self,result_dictionary,title):
- list_of_entities=[]
+
+ def handle_ami_dict_creation(self, result_dictionary, title):
+ list_of_entities = []
for entry in result_dictionary:
- if 'entities' in entry:
- list_of_entities+=entry['entities']
- xml_dict = self.make_ami_dict_from_list(list_of_entities,title)
- self.write_string_to_file(xml_dict,f'{title}.xml')
+ if "entities" in entry:
+ list_of_entities += entry["entities"]
+ xml_dict = self.make_ami_dict_from_list(list_of_entities, title)
+ self.write_string_to_file(xml_dict, f"{title}.xml")
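
For reference, the ami-dictionary round trip that `make_ami_dict_from_list` and `get_terms_from_ami_xml` implement can be checked in isolation. The XML layout below is an assumption consistent with the code, which only relies on `<entry term="..."/>` elements reachable from the root.

```
# Hedged sketch: parse a tiny ami dictionary the way get_terms_from_ami_xml does.
import xml.etree.ElementTree as ET

xml_string = (
    '<dictionary title="ethics_key_phrases">'
    '<entry term="informed consent"/>'
    '<entry term="ethics committee"/>'
    "</dictionary>"
)
root = ET.fromstring(xml_string)
terms = [entry.attrib["term"] for entry in root.iter("entry")]
print(terms)  # ['informed consent', 'ethics committee']
```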
diff --git a/docanalysis/extract_entities.py b/docanalysis/extract_entities.py
index 97ba3c5..29ffa8e 100644
--- a/docanalysis/extract_entities.py
+++ b/docanalysis/extract_entities.py
@@ -1,25 +1,25 @@
-from fileinput import filename
-import os
-import sys
+import json
import logging
+import os
+import re
+import xml.etree.ElementTree as ET
+from fileinput import filename
from glob import glob
-import spacy
+
import pandas as pd
+import spacy
+import yake
from bs4 import BeautifulSoup
-from tqdm import tqdm
-import xml.etree.ElementTree as ET
from nltk import tokenize
-import subprocess
-import scispacy
-import json
-import re
-import yake
+from tqdm import tqdm
+
try:
- nlp = spacy.load('en_core_web_sm')
+ nlp = spacy.load("en_core_web_sm")
except OSError:
from spacy.cli import download
- download('en_core_web_sm')
- nlp = spacy.load('en_core_web_sm')
+
+ download("en_core_web_sm")
+ nlp = spacy.load("en_core_web_sm")
class DocAnalysis:
@@ -29,9 +29,19 @@ def __init__(self):
self.labels_to_get = []
logging.basicConfig(level=logging.INFO)
- def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, hits=30,
- make_project=False, install_ami=False, removefalse=True, create_csv=True,
- csv_name='entities.csv', labels_to_get=['GPE', 'ORG']):
+ def extract_entities_from_papers(
+ self,
+ corpus_path,
+ terms_xml_path,
+ query=None,
+ hits=30,
+ make_project=False,
+ install_ami=False,
+ removefalse=True,
+ create_csv=True,
+ csv_name="entities.csv",
+ labels_to_get=["GPE", "ORG"],
+ ):
"""[summary]
:param query: [description]
@@ -58,9 +68,11 @@ def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None,
self.labels_to_get = labels_to_get
if make_project:
if not query:
- logging.warning('Please provide query as parameter')
+ logging.warning("Please provide query as parameter")
return
- logging.info(f"making project/searching {query} for {hits} hits into {corpus_path}")
+ logging.info(
+ f"making project/searching {query} for {hits} hits into {corpus_path}"
+ )
self.create_project_files(query, hits, corpus_path)
if install_ami:
logging.info(f"installing ami3 (check whether this is a good idea)")
@@ -80,14 +92,18 @@ def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None,
terms = self.get_terms_from_ami_xml(terms_xml_path) # moved from (1)
self.add_if_file_contains_terms(
- terms=terms, dict_with_parsed_xml=dict_with_parsed_xml)
+ terms=terms, dict_with_parsed_xml=dict_with_parsed_xml
+ )
if removefalse:
self.remove_statements_not_having_xmldict_terms_or_entities(
- dict_with_parsed_xml=dict_with_parsed_xml)
+ dict_with_parsed_xml=dict_with_parsed_xml
+ )
if create_csv:
self.convert_dict_to_csv(
- path=os.path.join(corpus_path, csv_name), dict_with_parsed_xml=dict_with_parsed_xml)
+ path=os.path.join(corpus_path, csv_name),
+ dict_with_parsed_xml=dict_with_parsed_xml,
+ )
return dict_with_parsed_xml
def create_project_files(self, QUERY, HITS, OUTPUT):
@@ -102,8 +118,9 @@ def install_ami(self):
def make_dict_with_parsed_xml(self, output):
dict_with_parsed_xml = {}
- all_paragraphs = glob(os.path.join(
- output, '*', 'sections', '**', '[1_9]_p.xml'), recursive=True)
+ all_paragraphs = glob(
+ os.path.join(output, "*", "sections", "**", "[1_9]_p.xml"), recursive=True
+ )
counter = 1
logging.info(f"starting tokenization on {len(all_paragraphs)} paragraphs")
for section_path in tqdm(all_paragraphs):
@@ -124,11 +141,10 @@ def read_text_from_path(self, paragraph_path):
tree = ET.parse(paragraph_path)
root = tree.getroot()
try:
- xmlstr = ET.tostring(root, encoding='utf8', method='xml')
- soup = BeautifulSoup(xmlstr, features='lxml')
+ xmlstr = ET.tostring(root, encoding="utf8", method="xml")
+ soup = BeautifulSoup(xmlstr, features="lxml")
text = soup.get_text(separator="")
- paragraph_text = text.replace(
- '\n', '')
+ paragraph_text = text.replace("\n", "")
except:
paragraph_text = "empty"
return paragraph_text
@@ -136,31 +152,36 @@ def read_text_from_path(self, paragraph_path):
def add_parsed_sections_to_dict(self, dict_with_parsed_xml):
for paragraph in dict_with_parsed_xml:
- doc = nlp(dict_with_parsed_xml[paragraph]['sentence'])
+ doc = nlp(dict_with_parsed_xml[paragraph]["sentence"])
entities, labels, position_end, position_start = self.make_required_lists()
for ent in doc.ents:
self.add_parsed_entities_to_lists(
- entities, labels, position_end, position_start, ent)
- self.add_lists_to_dict(dict_with_parsed_xml[paragraph], entities, labels, position_end,
- position_start)
+ entities, labels, position_end, position_start, ent
+ )
+ self.add_lists_to_dict(
+ dict_with_parsed_xml[paragraph],
+ entities,
+ labels,
+ position_end,
+ position_start,
+ )
def add_if_file_contains_terms(self, terms, dict_with_parsed_xml):
for statement in dict_with_parsed_xml:
dict_for_sentence = dict_with_parsed_xml[statement]
- dict_for_sentence['has_terms'] = []
+ dict_for_sentence["has_terms"] = []
for term in terms:
- if term.lower().strip() in dict_for_sentence['sentence'].lower():
- dict_for_sentence['has_terms'].append(term)
- dict_for_sentence['weight'] = len(
- dict_for_sentence['has_terms'])
+ if term.lower().strip() in dict_for_sentence["sentence"].lower():
+ dict_for_sentence["has_terms"].append(term)
+ dict_for_sentence["weight"] = len(dict_for_sentence["has_terms"])
def get_terms_from_ami_xml(self, xml_path):
tree = ET.parse(xml_path)
root = tree.getroot()
terms = []
- for para in root.iter('entry'):
+ for para in root.iter("entry"):
terms.append(para.attrib["term"])
return terms
@@ -172,14 +193,18 @@ def make_required_lists(self):
position_end = []
return entities, labels, position_end, position_start
- def add_lists_to_dict(self, dict_for_sentence, entities, labels, position_end, position_start):
+ def add_lists_to_dict(
+ self, dict_for_sentence, entities, labels, position_end, position_start
+ ):
- dict_for_sentence['entities'] = entities
- dict_for_sentence['labels'] = labels
- dict_for_sentence['position_start'] = position_start
- dict_for_sentence['position_end'] = position_end
+ dict_for_sentence["entities"] = entities
+ dict_for_sentence["labels"] = labels
+ dict_for_sentence["position_start"] = position_start
+ dict_for_sentence["position_end"] = position_end
- def add_parsed_entities_to_lists(self, entities, labels, position_end, position_start, ent=None):
+ def add_parsed_entities_to_lists(
+ self, entities, labels, position_end, position_start, ent=None
+ ):
if ent.label_ in self.labels_to_get:
entities.append(ent)
labels.append(ent.label_)
@@ -192,20 +217,23 @@ def convert_dict_to_csv(self, path, dict_with_parsed_xml):
df = df.T
for col in df:
try:
- df[col] = df[col].astype(str).str.replace(
- "[", "").str.replace("]", "")
- df[col] = df[col].astype(str).str.replace(
- "'", "").str.replace("'", "")
+ df[col] = df[col].astype(str).str.replace("[", "").str.replace("]", "")
+ df[col] = df[col].astype(str).str.replace("'", "").str.replace("'", "")
except:
pass
- df.to_csv(path, encoding='utf-8', line_terminator='\r\n')
+ df.to_csv(path, encoding="utf-8", line_terminator="\r\n")
logging.info(f"wrote output to {path}")
- def remove_statements_not_having_xmldict_terms_or_entities(self, dict_with_parsed_xml):
+ def remove_statements_not_having_xmldict_terms_or_entities(
+ self, dict_with_parsed_xml
+ ):
statement_to_pop = []
for statement in dict_with_parsed_xml:
sentect_dict = dict_with_parsed_xml[statement]
- if len(sentect_dict['has_terms']) == 0 or len(sentect_dict['entities']) == 0:
+ if (
+ len(sentect_dict["has_terms"]) == 0
+ or len(sentect_dict["entities"]) == 0
+ ):
statement_to_pop.append(statement)
for term in statement_to_pop:
@@ -225,38 +253,44 @@ def extract_particular_fields(dict_with_parsed_xml, field):
field_list = []
for sentence in dict_with_parsed_xml:
sentect_dict = dict_with_parsed_xml[sentence]
- for entity, label in zip(sentect_dict['entities'], sentect_dict['labels']):
+ for entity, label in zip(sentect_dict["entities"], sentect_dict["labels"]):
if label == field:
if entity not in field_list:
field_list.append(entity)
return field_list
-    def make_ami_dict_from_list(self,list_of_terms,title):
-        xml_string=f'''<dictionary title="{title}">
+    def make_ami_dict_from_list(self, list_of_terms, title):
+        xml_string = f"""<dictionary title="{title}">
-        '''
+        """
         for term in list_of_terms:
-            xml_string+=f'''<entry term="{term}"/>
+            xml_string += f"""<entry term="{term}"/>
-            '''
-        xml_string+="</dictionary>"
+            """
+        xml_string += "</dictionary>"
         return xml_string
-
- def write_string_to_file(self,string_to_put,title):
- with open(f'{title}.xml',mode='w') as f:
+
+ def write_string_to_file(self, string_to_put, title):
+ with open(f"{title}.xml", mode="w") as f:
f.write(string_to_put)
-# -------this section comes from metadata_analysis.py
+
+# -------this section comes from metadata_analysis.py
# (https://github.com/petermr/crops/blob/main/metadata_analysis/metadata_analysis.py)
metadata_dictionary = {}
+
def get_metadata_json(output_directory):
WORKING_DIRECTORY = os.getcwd()
- glob_results = glob.glob(os.path.join(WORKING_DIRECTORY,
- output_directory, "*", 'eupmc_result.json'))
+ glob_results = glob.glob(
+ os.path.join(WORKING_DIRECTORY, output_directory, "*", "eupmc_result.json")
+ )
metadata_dictionary["metadata_json"] = glob_results
- logging.info(f'metadata found for {len(metadata_dictionary["metadata_json"])} papers')
+ logging.info(
+ f'metadata found for {len(metadata_dictionary["metadata_json"])} papers'
+ )
+
def get_PMCIDS(metadata_dictionary=metadata_dictionary):
# gets PMCDIDs from metadata_JSON of individual papers.
@@ -264,35 +298,46 @@ def get_PMCIDS(metadata_dictionary=metadata_dictionary):
metadata_dictionary["PMCIDS"] = []
for metadata in metadata_dictionary["metadata_json"]:
- with open(metadata, encoding='utf-8') as f:
+ with open(metadata, encoding="utf-8") as f:
metadata_in_json = json.load(f)
try:
- metadata_dictionary["PMCIDS"].append(
- metadata_in_json["full"]["pmcid"])
+ metadata_dictionary["PMCIDS"].append(metadata_in_json["full"]["pmcid"])
except KeyError:
- metadata_dictionary["PMCIDS"].append('NaN')
- logging.info('getting PMCIDs')
+ metadata_dictionary["PMCIDS"].append("NaN")
+ logging.info("getting PMCIDs")
+
def parse_xml(output_directory, section, metadata_dictionary=metadata_dictionary):
- # gets the text from XML. Clubs all the paragraphs in the section into one.
+ # gets the text from XML. Clubs all the paragraphs in the section into one.
metadata_dictionary[f"{section}"] = []
for pmc in metadata_dictionary["PMCIDS"]:
paragraphs = []
- section_glob = glob.glob(os.path.join(os.getcwd(), output_directory,
- pmc, 'sections', '**', f'*{section}*', '**', '*.xml'),
- recursive=True)
+ section_glob = glob.glob(
+ os.path.join(
+ os.getcwd(),
+ output_directory,
+ pmc,
+ "sections",
+ "**",
+ f"*{section}*",
+ "**",
+ "*.xml",
+ ),
+ recursive=True,
+ )
for result in section_glob:
tree = ET.parse(result)
root = tree.getroot()
- xmlstr = ET.tostring(root, encoding='utf-8', method='xml')
- soup = BeautifulSoup(xmlstr, features='lxml')
+ xmlstr = ET.tostring(root, encoding="utf-8", method="xml")
+ soup = BeautifulSoup(xmlstr, features="lxml")
text = soup.get_text(separator="")
- text = text.replace('\n', '')
+ text = text.replace("\n", "")
paragraphs.append(text)
- concated_paragraph = ' '.join(paragraphs)
+ concated_paragraph = " ".join(paragraphs)
metadata_dictionary[f"{section}"].append(concated_paragraph)
logging.info(f"parsing {section} section")
+
def get_abstract(metadata_dictionary=metadata_dictionary):
# gets abstracts from the metadata json.
# We might want to get the abstract from the fulltext,
@@ -300,14 +345,14 @@ def get_abstract(metadata_dictionary=metadata_dictionary):
TAG_RE = re.compile(r"<[^>]+>")
metadata_dictionary["abstract"] = []
for metadata in metadata_dictionary["metadata_json"]:
- with open(metadata, encoding='utf-8') as f:
+ with open(metadata, encoding="utf-8") as f:
metadata_in_json = json.load(f)
try:
raw_abstract = metadata_in_json["full"]["abstractText"]
- abstract = TAG_RE.sub(' ', raw_abstract)
+ abstract = TAG_RE.sub(" ", raw_abstract)
metadata_dictionary["abstract"].append(abstract)
except KeyError:
- metadata_dictionary["abstract"].append('NaN')
+ metadata_dictionary["abstract"].append("NaN")
logging.info("getting the abstracts")
@@ -317,32 +362,36 @@ def get_keywords(metadata_dictionary=metadata_dictionary):
# since the format of the metadata JSON has changed from time to time.
metadata_dictionary["keywords"] = []
for metadata in metadata_dictionary["metadata_json"]:
- with open(metadata, encoding='utf-8') as f:
+ with open(metadata, encoding="utf-8") as f:
metadata_in_json = json.load(f)
try:
metadata_dictionary["keywords"].append(
- metadata_in_json["full"]["keywordList"]["keyword"])
+ metadata_in_json["full"]["keywordList"]["keyword"]
+ )
except KeyError:
metadata_dictionary["keywords"].append([])
logging.info("getting the keywords from metadata")
def key_phrase_extraction(section, metadata_dictionary=metadata_dictionary):
- # extracts keyphrases from the blob of texts of section specified for each paper using YAKE
+ # extracts keyphrases from the blob of texts of section specified for each paper using YAKE
metadata_dictionary["yake_keywords"] = []
for text in metadata_dictionary[f"{section}"]:
custom_kw_extractor = yake.KeywordExtractor(
- lan='en', n=2, top=10, features=None)
+ lan="en", n=2, top=10, features=None
+ )
keywords = custom_kw_extractor.extract_keywords(text)
keywords_list = []
for kw in keywords:
keywords_list.append(kw[0])
metadata_dictionary["yake_keywords"].append(keywords_list)
- logging.info(f'extracted key phrases from {section}')
+ logging.info(f"extracted key phrases from {section}")
-def get_organism(section,label_interested= 'TAXON', metadata_dictionary=metadata_dictionary):
- #nlp = spacy.load("en_ner_bionlp13cg_md")
+def get_organism(
+ section, label_interested="TAXON", metadata_dictionary=metadata_dictionary
+):
+ # nlp = spacy.load("en_ner_bionlp13cg_md")
nlp = spacy.load("en_core_sci_sm")
metadata_dictionary["entities"] = []
for sci_text in metadata_dictionary[f"{section}"]:
@@ -352,23 +401,28 @@ def get_organism(section,label_interested= 'TAXON', metadata_dictionary=metadata
if ent.label_ == label_interested:
entity.append(ent.text)
metadata_dictionary["entities"].append(entity)
- logging.info(F"NER using SciSpacy - looking for {label_interested}")
+ logging.info(f"NER using SciSpacy - looking for {label_interested}")
-def convert_to_csv(path='keywords_results_yake_organism_pmcid_tps_cam_ter_c.csv', metadata_dictionary=metadata_dictionary):
- # method borrowed from original docanalysis
+def convert_to_csv(
+ path="keywords_results_yake_organism_pmcid_tps_cam_ter_c.csv",
+ metadata_dictionary=metadata_dictionary,
+):
+ # method borrowed from original docanalysis
df = pd.DataFrame(metadata_dictionary)
- df.to_csv(path, encoding='utf-8', line_terminator='\r\n')
- logging.info(f'writing the keywords to {path}')
+ df.to_csv(path, encoding="utf-8", line_terminator="\r\n")
+ logging.info(f"writing the keywords to {path}")
-def convert_to_json(path='ethics_statement_2000.json', metadata_dictionary = metadata_dictionary):
+def convert_to_json(
+ path="ethics_statement_2000.json", metadata_dictionary=metadata_dictionary
+):
# converts the python dictionary containing output into a JSON file
json_file = json.dumps(metadata_dictionary)
- f = open(path,"w", encoding='ascii')
+ f = open(path, "w", encoding="ascii")
f.write(json_file)
f.close()
- logging.info(f'writing the dictionary to {path}')
+ logging.info(f"writing the dictionary to {path}")
def look_for_a_word(section, search_for="TPS", metadata_dictionary=metadata_dictionary):
@@ -377,12 +431,16 @@ def look_for_a_word(section, search_for="TPS", metadata_dictionary=metadata_dict
metadata_dictionary[f"{search_for}_match"] = []
for text in metadata_dictionary[f"{section}"]:
words = text.split(" ")
- match_list = ([s for s in words if f"{search_for}" in s])
- metadata_dictionary[f"{search_for}_match"] .append(match_list)
+ match_list = [s for s in words if f"{search_for}" in s]
+ metadata_dictionary[f"{search_for}_match"].append(match_list)
logging.info(f"looking for {search_for} in {section}")
-def look_for_next_word(section, search_for=["number:", "no.", "No.", "number" ], metadata_dictionary=metadata_dictionary):
+def look_for_next_word(
+ section,
+ search_for=["number:", "no.", "No.", "number"],
+ metadata_dictionary=metadata_dictionary,
+):
# chops the paragraph corresponding to a section into list of words
# gets the word next to the matched string.
metadata_dictionary[f"{search_for}_match"] = []
@@ -390,7 +448,9 @@ def look_for_next_word(section, search_for=["number:", "no.", "No.", "number" ],
words = text.split(" ")
words = iter(words)
try:
- match_list = ([next(words) for s in words if any(xs in s for xs in search_for)])
+ match_list = [
+ next(words) for s in words if any(xs in s for xs in search_for)
+ ]
metadata_dictionary[f"{search_for}_match"].append(match_list)
except StopIteration:
metadata_dictionary[f"{search_for}_match"].append([])
@@ -398,7 +458,9 @@ def look_for_next_word(section, search_for=["number:", "no.", "No.", "number" ],
logging.info(f"looking for {search_for} in {section}")
-def add_if_file_contains_terms(section, metadata_dictionary=metadata_dictionary, terms=['iNaturalist']):
+def add_if_file_contains_terms(
+ section, metadata_dictionary=metadata_dictionary, terms=["iNaturalist"]
+):
# method borrowed from original docanalysis
metadata_dictionary["terms"] = []
for term in terms:
@@ -406,30 +468,30 @@ def add_if_file_contains_terms(section, metadata_dictionary=metadata_dictionary,
if term.lower() in text.lower():
metadata_dictionary["terms"].append(term)
else:
- metadata_dictionary["terms"].append('NaN')
- logging.info(f'looking for term matches in {section}')
+ metadata_dictionary["terms"].append("NaN")
+ logging.info(f"looking for term matches in {section}")
# calling all the functions
-CPROJECT = os.path.join(os.path.expanduser('~'), 'ethics_statement_2000_generic')
-SECTION= 'ethic'
-#querying_pygetpapers_sectioning("inaturalist",'500',CPROJECT)
+CPROJECT = os.path.join(os.path.expanduser("~"), "ethics_statement_2000_generic")
+SECTION = "ethic"
+# querying_pygetpapers_sectioning("inaturalist",'500',CPROJECT)
get_metadata_json(CPROJECT)
get_PMCIDS()
parse_xml(CPROJECT, SECTION)
get_abstract()
get_keywords()
key_phrase_extraction(SECTION)
-#get_organism(SECTION)
+# get_organism(SECTION)
look_for_next_word(SECTION)
-#look_for_next_word(SECTION, search_for="C.")
-#look_for_next_word(SECTION, search_for='Citrus')
+# look_for_next_word(SECTION, search_for="C.")
+# look_for_next_word(SECTION, search_for='Citrus')
add_if_file_contains_terms(SECTION)
-convert_to_csv(f'ethics_{SECTION}2000.csv')
+convert_to_csv(f"ethics_{SECTION}2000.csv")
convert_to_json()
-# -------end of code section from metadata_analysis.py
+# -------end of code section from metadata_analysis.py
-#TODO intergrate metadata_analyis.py to original docanalysis;
-#TODO decide on functions we need from metadata_analysis.py
-#TODO write methods to create ami-dictionaries from extracted entites and keywords
+# TODO integrate metadata_analysis.py into the original docanalysis;
+# TODO decide on functions we need from metadata_analysis.py
+# TODO write methods to create ami-dictionaries from extracted entities and keywords
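
A note on `look_for_next_word()` in this file: the reformatted comprehension only works because `words` has been wrapped in `iter()`, so each `next(words)` call inside the comprehension advances past the matched token and returns the word that follows it. A minimal illustration with a made-up sentence:

```
# Hedged illustration of the iterator trick in look_for_next_word(); the
# sentence is invented, the search_for list is the function's default.
words = iter("approval number: 2021/045 granted by the committee".split(" "))
search_for = ["number:", "no.", "No.", "number"]
match_list = [next(words) for s in words if any(xs in s for xs in search_for)]
print(match_list)  # ['2021/045']
```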
diff --git a/docanalysis/frequency_analysis.py b/docanalysis/frequency_analysis.py
index d899a9e..55e7811 100644
--- a/docanalysis/frequency_analysis.py
+++ b/docanalysis/frequency_analysis.py
@@ -1,23 +1,24 @@
-
-import xml.etree.ElementTree as ET
import os
+import xml.etree.ElementTree as ET
from collections import Counter
+
def get_terms_from_ami_xml(xml_path):
tree = ET.parse(xml_path)
root = tree.getroot()
terms = []
- for para in root.iter('entry'):
+ for para in root.iter("entry"):
terms.append(para.attrib["term"])
return terms
+
def frequency_counter(terms):
frequency = {}
# iterating over the list
for item in terms:
- # checking the element in dictionary
+ # checking the element in dictionary
if item in frequency:
            # incrementing the count
frequency[item] += 1
@@ -29,9 +30,12 @@ def frequency_counter(terms):
print(Counter(frequency).most_common())
-xml_path = os.path.join(os.getcwd(), 'ami_dict.xml')
+xml_path = os.path.join(os.getcwd(), "ami_dict.xml")
+
+
def main():
terms = get_terms_from_ami_xml(xml_path)
frequency_counter(terms)
-main()
\ No newline at end of file
+
+main()
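
A small design note on `frequency_counter()` above: it builds the counts by hand and then wraps them in `Counter` only to sort them, whereas `collections.Counter` can do both steps directly, as in this sketch (the sample terms are invented):

```
# Equivalent one-liner to frequency_counter(); sample terms are invented.
from collections import Counter

terms = ["TAXON", "GPE", "TAXON", "ORG", "TAXON"]
print(Counter(terms).most_common())  # [('TAXON', 3), ('GPE', 1), ('ORG', 1)]
```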
diff --git a/demo.py b/examples/demo.py
similarity index 100%
rename from demo.py
rename to examples/demo.py
diff --git a/pmr_demo.py b/examples/pmr_demo.py
similarity index 100%
rename from pmr_demo.py
rename to examples/pmr_demo.py
diff --git a/setup.py b/setup.py
index 9ffe7ac..4ca57c9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,47 +1,51 @@
-#!/usr/bin/env python
# -*- coding: utf-8 -*-
-
try:
from setuptools import setup
except ImportError:
from distutils.core import setup
-import configparser
-import os
-with open('README.md') as readme_file:
+
+with open("README.md") as readme_file:
readme = readme_file.read()
-requirements = ['pygetpapers', 'pandas', 'spacy', 'numpy',
- 'matplotlib', 'tqdm', 'beautifulsoup4','nltk']
+requirements = [
+ "pygetpapers",
+ "pandas",
+ "spacy",
+ "numpy",
+ "matplotlib",
+ "tqdm",
+ "beautifulsoup4",
+ "nltk",
+]
setup(
- name='docanalysis',
+ name="docanalysis",
version="0.0.3",
- description='extract structured information from ethics paragraphs',
+ description="extract structured information from ethics paragraphs",
long_description=readme,
- author='Ayush Garg, Shweata N. Hegde',
- author_email='ayush@science.org.in',
- url='https://github.com/petermr/docanalysis',
+ author="Ayush Garg, Shweata N. Hegde",
+ author_email="ayush@science.org.in",
+ url="https://github.com/petermr/docanalysis",
packages=[
- 'pygetpapers',
+ "docanalysis",
],
- package_dir={'docanalysis':
- 'docanalysis'},
+ package_dir={"docanalysis": "docanalysis"},
include_package_data=True,
install_requires=requirements,
- license='Apache License',
+    extras_require={"dev": ["pytest", "pytest-cov"]},
+ license="Apache License",
zip_safe=False,
- keywords='research automation',
+ keywords=["research automation"],
classifiers=[
- 'Development Status :: 4 - Beta',
- 'Intended Audience :: Developers',
- 'License :: OSI Approved :: Apache Software License',
- 'Natural Language :: English',
- 'Programming Language :: Python :: 3.4',
- 'Programming Language :: Python :: 3.5',
- 'Programming Language :: Python :: 3.6',
- 'Programming Language :: Python :: 3.7',
- 'Programming Language :: Python :: 3.8',
- 'Programming Language :: Python :: 3.9',
-
+ "Development Status :: 4 - Beta",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: Apache Software License",
+ "Natural Language :: English",
+ "Programming Language :: Python :: 3.4",
+ "Programming Language :: Python :: 3.5",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
],
)