14 changes: 7 additions & 7 deletions README.md
@@ -48,7 +48,7 @@ Parameters: CORPUS_PATH: path to an existing corpus (CProject)
labels_to_get: spaCy recognizes named entities and labels them. You can choose the labels you are interested in by providing them as a list. For all available labels, check out the Tools Used section.
```
## How to run?
We have created `demo.py` where you can run the package.
We have created `demo.py` in the `examples` folder that shows how to run the package.

```
import os
@@ -74,12 +74,12 @@ with open('GPE.text', 'w') as f:
f.write(str(list_with_gpe))
```
To break this down:
|Variable snippet |What is it? |
|----------------------|----------------|
|`essential oil AND chemical composition` |Query to `pygetpapers` (EPMC default)|
|`100` |number of hits |
|stem_cell_research_300|Output directory|
|"ethics_dictionary", "ethics_key_phrases", "ethics_key_phrases.xml" |dictionary path |
| Variable snippet | What is it? |
| ------------------------------------------------------------------- | ------------------------------------- |
| `essential oil AND chemical composition` | Query to `pygetpapers` (EPMC default) |
| `100` | number of hits |
| stem_cell_research_300 | Output directory |
| "ethics_dictionary", "ethics_key_phrases", "ethics_key_phrases.xml" | dictionary path |

## What is a dictionary

Empty file removed __init__.py
Empty file.
2 changes: 1 addition & 1 deletion docanalysis/__init__.py
@@ -1 +1 @@
pass
pass
36 changes: 23 additions & 13 deletions docanalysis/docanalysis.py
@@ -1,19 +1,21 @@
import os
import logging
import os
import sys
import configargparse
import coloredlogs
from functools import partialmethod
from time import gmtime, strftime

import coloredlogs
import configargparse
from tqdm import tqdm
from functools import partialmethod

from docanalysis.entity_extraction import EntityExtraction

class Docanalysis:

class Docanalysis:
def __init__(self):
"""This function makes all the constants"""
self.entity_extraction = EntityExtraction()
self.version="0.0.3"
self.version = "0.0.3"

def handle_logger_creation(self, args):
"""[summary]
@@ -38,7 +40,7 @@ def handle_logger_creation(self, args):
if args.logfile:
self.handle_logfile(args, level)
else:
coloredlogs.install(level=level, fmt='%(levelname)s: %(message)s')
coloredlogs.install(level=level, fmt="%(levelname)s: %(message)s")

def handlecli(self):
"""Handles the command line interface using argparse"""
@@ -85,7 +87,7 @@ def handlecli(self):
parser.add_argument(
"--entity_extraction",
default=False,
nargs='+',
nargs="+",
help="extracts specified entities chosen from a list of entities (CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART, GGP, SO, TAXON, CHEBI, GO, CL)",
)
parser.add_argument(
@@ -121,7 +123,6 @@ def handlecli(self):
help="[All] save log to specified file in output directory as well as printing to terminal",
)


if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit()
@@ -130,10 +131,19 @@ def handlecli(self):
if vars(args)[arg] == "False":
vars(args)[arg] = False
self.handle_logger_creation(args)
self.entity_extraction.extract_entities_from_papers(args.project_name,args.dictionary,query=args.query,hits=args.hits,
make_project=args.run_pygetpapers, install_ami=False, removefalse=True, create_csv=True,
csv_name=args.output, labels_to_get=args.entity_extraction,make_ami_dict=args.make_ami_dict)

self.entity_extraction.extract_entities_from_papers(
args.project_name,
args.dictionary,
query=args.query,
hits=args.hits,
make_project=args.run_pygetpapers,
install_ami=False,
removefalse=True,
create_csv=True,
csv_name=args.output,
labels_to_get=args.entity_extraction,
make_ami_dict=args.make_ami_dict,
)


def main():
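The reformatted call above maps the CLI options wired up in `handlecli()` one-to-one onto keyword arguments of `extract_entities_from_papers`. A minimal direct invocation of the same API, assuming the defaults visible in this diff (corpus and dictionary paths are illustrative), would be:

```
# Sketch: calling the extraction API directly instead of via the CLI.
# Comments show which CLI argument each keyword corresponds to.
from docanalysis.entity_extraction import EntityExtraction

EntityExtraction().extract_entities_from_papers(
    "my_corpus",  # args.project_name: CProject directory
    "my_dictionary.xml",  # args.dictionary: ami dictionary of terms
    query="essential oil",  # args.query, used only when make_project=True
    hits=30,  # args.hits
    make_project=False,  # args.run_pygetpapers
    install_ami=False,
    removefalse=True,
    create_csv=True,
    csv_name="entities.csv",  # args.output
    labels_to_get=["GPE", "ORG"],  # args.entity_extraction
    make_ami_dict=False,  # args.make_ami_dict
)
```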
155 changes: 93 additions & 62 deletions docanalysis/entity_extraction.py
@@ -1,19 +1,21 @@
import os
import logging
import os
import xml.etree.ElementTree as ET
from glob import glob
import spacy

import pandas as pd
import spacy
from bs4 import BeautifulSoup
from tqdm import tqdm
import xml.etree.ElementTree as ET
from nltk import tokenize
from tqdm import tqdm

try:
nlp = spacy.load('en_core_web_sm')
nlp = spacy.load("en_core_web_sm")
except OSError:
from spacy.cli import download
download('en_core_web_sm')
nlp = spacy.load('en_core_web_sm')

download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")


class EntityExtraction:
@@ -23,15 +25,28 @@ def __init__(self):
self.labels_to_get = []
logging.basicConfig(level=logging.INFO)

def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, hits=30,
make_project=False, install_ami=False, removefalse=True, create_csv=True,
csv_name='entities.csv', labels_to_get=['GPE', 'ORG'],make_ami_dict=False):
def extract_entities_from_papers(
self,
corpus_path,
terms_xml_path,
query=None,
hits=30,
make_project=False,
install_ami=False,
removefalse=True,
create_csv=True,
csv_name="entities.csv",
labels_to_get=["GPE", "ORG"],
make_ami_dict=False,
):
self.labels_to_get = labels_to_get
if make_project:
if not query:
logging.warning('Please provide query as parameter')
logging.warning("Please provide query as parameter")
return
logging.info(f"making project/searching {query} for {hits} hits into {corpus_path}")
logging.info(
f"making project/searching {query} for {hits} hits into {corpus_path}"
)
self.create_project_files(query, hits, corpus_path)
if install_ami:
logging.info(f"installing ami3 (check whether this is a good idea)")
@@ -45,15 +60,19 @@ def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None,
if terms_xml_path:
terms = self.get_terms_from_ami_xml(terms_xml_path)
self.add_if_file_contains_terms(
terms=terms, dict_with_parsed_xml=dict_with_parsed_xml)
terms=terms, dict_with_parsed_xml=dict_with_parsed_xml
)
if removefalse:
self.remove_statements_not_having_xmldict_terms_or_entities(
dict_with_parsed_xml=dict_with_parsed_xml)
dict_with_parsed_xml=dict_with_parsed_xml
)
if create_csv:
self.convert_dict_to_csv(
path=os.path.join(corpus_path, csv_name), dict_with_parsed_xml=dict_with_parsed_xml)
path=os.path.join(corpus_path, csv_name),
dict_with_parsed_xml=dict_with_parsed_xml,
)
if make_ami_dict:
self.handle_ami_dict_creation(dict_with_parsed_xml,make_ami_dict)
self.handle_ami_dict_creation(dict_with_parsed_xml, make_ami_dict)
return dict_with_parsed_xml

def create_project_files(self, QUERY, HITS, OUTPUT):
@@ -68,8 +87,9 @@ def install_ami(self):
def make_dict_with_parsed_xml(self, output):

dict_with_parsed_xml = {}
all_paragraphs = glob(os.path.join(
output, '*', 'sections', '**', '[1_9]_p.xml'), recursive=True)
all_paragraphs = glob(
os.path.join(output, "*", "sections", "**", "[1_9]_p.xml"), recursive=True
)
counter = 1
logging.info(f"starting tokenization on {len(all_paragraphs)} paragraphs")
for section_path in tqdm(all_paragraphs):
@@ -90,43 +110,47 @@ def read_text_from_path(self, paragraph_path):
tree = ET.parse(paragraph_path)
root = tree.getroot()
try:
xmlstr = ET.tostring(root, encoding='utf8', method='xml')
soup = BeautifulSoup(xmlstr, features='lxml')
xmlstr = ET.tostring(root, encoding="utf8", method="xml")
soup = BeautifulSoup(xmlstr, features="lxml")
text = soup.get_text(separator="")
paragraph_text = text.replace(
'\n', '')
paragraph_text = text.replace("\n", "")
except:
paragraph_text = "empty"
return paragraph_text

def add_parsed_sections_to_dict(self, dict_with_parsed_xml):

for paragraph in dict_with_parsed_xml:
doc = nlp(dict_with_parsed_xml[paragraph]['sentence'])
doc = nlp(dict_with_parsed_xml[paragraph]["sentence"])
entities, labels, position_end, position_start = self.make_required_lists()
for ent in doc.ents:
self.add_parsed_entities_to_lists(
entities, labels, position_end, position_start, ent)
self.add_lists_to_dict(dict_with_parsed_xml[paragraph], entities, labels, position_end,
position_start)
entities, labels, position_end, position_start, ent
)
self.add_lists_to_dict(
dict_with_parsed_xml[paragraph],
entities,
labels,
position_end,
position_start,
)

def add_if_file_contains_terms(self, terms, dict_with_parsed_xml):

for statement in dict_with_parsed_xml:
dict_for_sentence = dict_with_parsed_xml[statement]
dict_for_sentence['has_terms'] = []
dict_for_sentence["has_terms"] = []
for term in terms:
if term.lower().strip() in dict_for_sentence['sentence'].lower():
dict_for_sentence['has_terms'].append(term)
dict_for_sentence['weight'] = len(
dict_for_sentence['has_terms'])
if term.lower().strip() in dict_for_sentence["sentence"].lower():
dict_for_sentence["has_terms"].append(term)
dict_for_sentence["weight"] = len(dict_for_sentence["has_terms"])

def get_terms_from_ami_xml(self, xml_path):

tree = ET.parse(xml_path)
root = tree.getroot()
terms = []
for para in root.iter('entry'):
for para in root.iter("entry"):
terms.append(para.attrib["term"])
return terms

@@ -138,14 +162,18 @@ def make_required_lists(self):
position_end = []
return entities, labels, position_end, position_start

def add_lists_to_dict(self, dict_for_sentence, entities, labels, position_end, position_start):
def add_lists_to_dict(
self, dict_for_sentence, entities, labels, position_end, position_start
):

dict_for_sentence['entities'] = entities
dict_for_sentence['labels'] = labels
dict_for_sentence['position_start'] = position_start
dict_for_sentence['position_end'] = position_end
dict_for_sentence["entities"] = entities
dict_for_sentence["labels"] = labels
dict_for_sentence["position_start"] = position_start
dict_for_sentence["position_end"] = position_end

def add_parsed_entities_to_lists(self, entities, labels, position_end, position_start, ent=None):
def add_parsed_entities_to_lists(
self, entities, labels, position_end, position_start, ent=None
):
if ent.label_ in self.labels_to_get:
entities.append(ent)
labels.append(ent.label_)
@@ -158,20 +186,23 @@ def convert_dict_to_csv(self, path, dict_with_parsed_xml):
df = df.T
for col in df:
try:
df[col] = df[col].astype(str).str.replace(
"[", "").str.replace("]", "")
df[col] = df[col].astype(str).str.replace(
"'", "").str.replace("'", "")
df[col] = df[col].astype(str).str.replace("[", "").str.replace("]", "")
df[col] = df[col].astype(str).str.replace("'", "").str.replace("'", "")
except:
pass
df.to_csv(path, encoding='utf-8', line_terminator='\r\n')
df.to_csv(path, encoding="utf-8", line_terminator="\r\n")
logging.info(f"wrote output to {path}")

def remove_statements_not_having_xmldict_terms_or_entities(self, dict_with_parsed_xml):
def remove_statements_not_having_xmldict_terms_or_entities(
self, dict_with_parsed_xml
):
statement_to_pop = []
for statement in dict_with_parsed_xml:
sentect_dict = dict_with_parsed_xml[statement]
if len(sentect_dict['has_terms']) == 0 or len(sentect_dict['entities']) == 0:
if (
len(sentect_dict["has_terms"]) == 0
or len(sentect_dict["entities"]) == 0
):
statement_to_pop.append(statement)

for term in statement_to_pop:
@@ -182,31 +213,31 @@ def extract_particular_fields(dict_with_parsed_xml, field):
field_list = []
for sentence in dict_with_parsed_xml:
sentect_dict = dict_with_parsed_xml[sentence]
for entity, label in zip(sentect_dict['entities'], sentect_dict['labels']):
for entity, label in zip(sentect_dict["entities"], sentect_dict["labels"]):
if label == field:
if entity not in field_list:
field_list.append(entity)
return field_list

def make_ami_dict_from_list(self,list_of_terms,title):
xml_string=f'''<?xml version="1.0" encoding="UTF-8"?>
def make_ami_dict_from_list(self, list_of_terms, title):
xml_string = f"""<?xml version="1.0" encoding="UTF-8"?>
<dictionary title="{title}">
'''
"""
for term in list_of_terms:
xml_string+=f'''
xml_string += f"""
<entry term="{term}"/>
'''
xml_string+="</dictionary>"
"""
xml_string += "</dictionary>"
return xml_string
def write_string_to_file(self,string_to_put,title):
with open(f'{title}.xml',mode='w') as f:

def write_string_to_file(self, string_to_put, title):
with open(f"{title}.xml", mode="w") as f:
f.write(string_to_put)
def handle_ami_dict_creation(self,result_dictionary,title):
list_of_entities=[]

def handle_ami_dict_creation(self, result_dictionary, title):
list_of_entities = []
for entry in result_dictionary:
if 'entities' in entry:
list_of_entities+=entry['entities']
xml_dict = self.make_ami_dict_from_list(list_of_entities,title)
self.write_string_to_file(xml_dict,f'{title}.xml')
if "entities" in entry:
list_of_entities += entry["entities"]
xml_dict = self.make_ami_dict_from_list(list_of_entities, title)
self.write_string_to_file(xml_dict, f"{title}.xml")
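
For reference, `make_ami_dict_from_list` emits a minimal ami dictionary. A quick way to sanity-check the output, assuming the class as diffed above (the term list and title are illustrative):

```
# Sketch: exercising make_ami_dict_from_list on a toy term list.
from docanalysis.entity_extraction import EntityExtraction

ee = EntityExtraction()
xml_string = ee.make_ami_dict_from_list(["lavender", "eucalyptus"], "essential_oils")
print(xml_string)
# Expected shape (whitespace aside):
# <?xml version="1.0" encoding="UTF-8"?>
# <dictionary title="essential_oils">
#   <entry term="lavender"/>
#   <entry term="eucalyptus"/>
# </dictionary>
```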