Skip to content

Commit

Permalink
Changed module and class names to the PEP 8 standard; added blacklist script
Browse files Browse the repository at this point in the history
  • Loading branch information
TilMeh committed Nov 16, 2020
1 parent 4ef3c2c commit 441c95b
Show file tree
Hide file tree
Showing 10 changed files with 129 additions and 33 deletions.
4 changes: 2 additions & 2 deletions airpg/Article_Mining.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import logging
import re

class Article_Mining:
class ArticleMining:

def __init__(self, logger):
self.log = logger or logging.getLogger(__name__ + ".Article_Mining")
self.log = logger or logging.getLogger(__name__ + ".ArticleMining")

def get_abstract_text(self, article):
'''
Expand Down
14 changes: 7 additions & 7 deletions airpg/Entrez_Interaction.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import os, subprocess, logging
import xml.etree.ElementTree as ET
from airpg import fetchpubmed
from airpg.airpg import fetch_pubmed
import entrezpy.conduit
from ete3 import NCBITaxa
from datetime import date

class Entrez_Interaction:
class EntrezInteraction:

def __init__(self, logger = None):
self.log = logger or logging.getLogger(__name__ + ".Entrez_Interaction")
self.log = logger or logging.getLogger(__name__ + ".EntrezInteraction")

def retrieve_uids(self, query, min_date = None):
'''
Expand Down Expand Up @@ -132,10 +132,10 @@ def fetch_pubmed_articles(self, mail, query):
'''
articles = None
cond = entrezpy.conduit.Conduit(mail)
fetch_pubmed = cond.new_pipeline()
sid = fetch_pubmed.add_search({'db': 'pubmed', 'term': query, 'rettype': 'count'})
fid = fetch_pubmed.add_fetch({'retmode':'xml'}, dependency=sid, analyzer=fetchpubmed.PubMedAnalyzer())
a = cond.run(fetch_pubmed)
fetch_pipe = cond.new_pipeline()
sid = fetch_pipe.add_search({'db': 'pubmed', 'term': query, 'rettype': 'count'})
fid = fetch_pipe.add_fetch({'retmode':'xml'}, dependency=sid, analyzer=fetch_pubmed.PubMedAnalyzer())
a = cond.run(fetch_pipe)
result = a.get_result()
if result.size() >= 1:
articles = result.pubmed_records
Expand Down
4 changes: 2 additions & 2 deletions airpg/IR_Operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation

class IR_Operations:
class IROperations:

def __init__(self, logger = None):
self.log = logger or logging.getLogger(__name__ + ".IR_Operations")
self.log = logger or logging.getLogger(__name__ + ".IROperations")

###############
# I/O methods #
Expand Down
5 changes: 2 additions & 3 deletions airpg/Table_IO.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import os, logging
import pandas as pd
from airpg import fetchpubmed

class Table_IO:
class TableIO:

def __init__(self, fp_entry_table, fp_ir_table = None, fp_blacklist = None, fp_duplicates = None, logger = None):
self.log = logger or logging.getLogger(__name__ + ".Table_IO")
self.log = logger or logging.getLogger(__name__ + ".TableIO")
self.entry_table = None
self.duplicates = {}
self.ir_table = None
Expand Down
File renamed without changes.
16 changes: 8 additions & 8 deletions airpg/scripts/airgb_analyze.py → airpg/scripts/airpg_analyze.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@
from fuzzywuzzy import fuzz
from ete3 import NCBITaxa
from pathlib import Path
from airpg import Entrez_Interaction
from airpg import Table_IO
from airpg import Article_Mining
from airpg import IR_Operations
from airpg.airpg import entrez_interaction
from airpg.airpg import table_io
from airpg.airpg import article_mining
from airpg.airpg import ir_operations
import pandas as pd
import os, argparse
import tarfile, coloredlogs, logging
Expand Down Expand Up @@ -95,11 +95,11 @@ def main(args):
coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='INFO', logger=log)
mail = args.mail
query = args.query
iro = IR_Operations.IR_Operations(log)
EI = Entrez_Interaction.Entrez_Interaction(log)
iro = ir_operations.IROperations(log)
EI = entrez_interaction.EntrezInteraction(log)

# STEP 2. Read in accession numbers to loop over
tio = Table_IO.Table_IO(args.infn, args.outfn, args.blacklist, logger = log)
tio = table_io.TableIO(args.infn, args.outfn, args.blacklist, logger = log)
tio.remove_blacklisted_entries()

accessions = list(tio.entry_table["ACCESSION"].values)
Expand Down Expand Up @@ -179,7 +179,7 @@ def main(args):
os.remove(fp_entry)

# STEP 4. Check any accession for IR loss and remove from outlist if necessary
am = Article_Mining.Article_Mining(log)
am = article_mining.ArticleMining(log)
articles = EI.fetch_pubmed_articles(mail, query)
ncbi = NCBITaxa()
# Update database if it is older than 1 month
Expand Down
96 changes: 96 additions & 0 deletions airpg/scripts/airpg_generate_blacklist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import os.path
import argparse
import coloredlogs, logging
import time
#import PlastomeIntegrityChecks as pic
from ete3 import NCBITaxa
from pathlib import Path
from datetime import datetime

# For suppressing console output
import io
from contextlib import redirect_stdout

###############
# AUTHOR INFO #
###############
__author__ = 'Michael Gruenstaeudl <m.gruenstaeudl@fu-berlin.de>, '\
'Tilman Mehl <tilmanmehl@zedat.fu-berlin.de>'
__copyright__ = 'Copyright (C) 2019 Michael Gruenstaeudl and Tilman Mehl'
__info__ = 'Create or append a list of species names that are proven to lack one or more inverted repeats in their plastid genome'
__version__ = '2020.08.20.1800'

#############
# DEBUGGING #
#############
import ipdb
# ipdb.set_trace()

def get_irl_clade_species(ncbi):
species_ids = []
irl_clade_tree = ncbi.get_topology([ncbi.get_name_translator(['IRL clade'])['IRL clade'][0]])
for leaf in irl_clade_tree.iter_leaves():
species_ids.append(int(leaf.name))
species = set(ncbi.translate_to_names(species_ids))
return species

def get_irl_clade_genera(ncbi):
genera_ids = []
irl_clade_tree = ncbi.get_topology([ncbi.get_name_translator(['IRL clade'])['IRL clade'][0]])
for node in irl_clade_tree.iter_descendants():
if ncbi.get_rank([int(node.name)])[int(node.name)] == "genus":
genera_ids.append(int(node.name))
genera = set(ncbi.translate_to_names(genera_ids))
return genera

def read_blacklist(fp_blacklist):
'''
Read a file of blacklisted genera/species.
Params:
- fp_blacklist: file path to input file
'''
blacklist = set()
with open(fp_blacklist, "r") as fh_blacklist:
for line in [l.strip() for l in fh_blacklist.readlines()]:
if not line.startswith("#"):
blacklist.add(line)
return blacklist

def append_blacklist(fp_blacklist, blacklist):
with open(fp_blacklist, "a") as fh_blacklist:
fh_blacklist.write("# Update on %s\n" % datetime.now().strftime("%Y-%m-%d, %H:%M"))
for entry in blacklist:
fh_blacklist.write(entry + "\n")

def main(args):

## STEP 1. Initialize variables
ncbi = NCBITaxa()
# Update database if it is older than one month
if (time.time() - os.path.getmtime(os.path.join(Path.home(), ".etetoolkit/taxa.sqlite"))) > 2592000:
ncbi.update_taxonomy_database()
blacklist = set()
blacklist_existing = set()

## STEP 2. Read blacklist if the file already exists
if os.path.isfile(args.file_blacklist):
print("Reading existing blacklist ...")
blacklist_existing = read_blacklist(args.file_blacklist)

## STEP 3. Assemble species names of IRL clade of Fabaceae
print("\nFetching genus names of taxa in 'IRL clade' of Fabaceae ...")
irl_clade_genera = get_irl_clade_genera(ncbi)
print(" Adding new species names to blacklist ...")
blacklist = irl_clade_genera.difference(blacklist_existing)

## STEP 4. Append only new taxon names to blacklist
print("\nCalculating and appending species names not previously in blacklist ...")
append_blacklist(args.file_blacklist, blacklist)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=" -- ".join([__author__, __copyright__, __info__, __version__]))
parser.add_argument("-f", "--file_blacklist", type=str, required=True, help="path to blacklist file")
parser.add_argument("-q", "--query", type=str, required=False, default="inverted[TITLE] AND repeat[TITLE] AND loss[TITLE]", help="query used to fetch PMC articles that will be scanned for species with missing IRs")
args = parser.parse_args()
main(args)
10 changes: 5 additions & 5 deletions airpg/scripts/airgb_retrieve.py → airpg/scripts/airpg_retrieve.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@
import argparse
import coloredlogs, logging
#import pandas as pd
from airpg import Entrez_Interaction
from airpg import Table_IO
from airpg import entrez_interaction
from airpg import table_io
from datetime import datetime

###############
Expand Down Expand Up @@ -67,17 +67,17 @@ def main(args):
log = logging.getLogger(__name__)
coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='DEBUG', logger=log)

EI = Entrez_Interaction.Entrez_Interaction(log)
EI = entrez_interaction.EntrezInteraction(log)

# STEP 2. Check if output file already exists, read existing UIDs, infer mindate
uids_already_processed = []
min_date = None
outfn = os.path.abspath(args.outfn)

if args.blacklist:
tio = Table_IO.Table_IO(outfn, fp_blacklist = args.blacklist, logger = log)
tio = table_io.TableIO(outfn, fp_blacklist = args.blacklist, logger = log)
else:
tio = Table_IO.Table_IO(outfn, logger = log)
tio = table_io.TableIO(outfn, logger = log)
fp_duplicates = os.path.join(os.path.dirname(outfn), os.path.basename(outfn) + ".duplicates")
if os.path.isfile(fp_duplicates):
tio.read_duplicates(fp_duplicates)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ biopython>=1.72
argparse>=1.4.0
ete3
pandas
entrezpy
12 changes: 6 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@
python_requires='>=3.6',
keywords='plastid genomes, inverted repeats, NCBI Nucleotide',
license='BSD',
entry_points={
'console_scripts': [
'airpg_retrieve', 'airpg_analyze' # @TM: May need to be specified differently!
],
},
entry_points='''
[console_scripts]
airpg_retrieve=AIRPG.scripts.airpg_retrieve:main
airpg_analyze=AIRPG.scripts.airpg_analyze:main
''',
packages=['airpg'], # So that the subfolder 'airpg' is read immediately.
#packages = find_packages(),
install_requires=['biopython', 'ete3', 'argparse', 'pandas'],
install_requires=['biopython', 'ete3', 'entrezpy', 'pandas'],
scripts=glob.glob('scripts/*'),
test_suite='setup.my_test_suite',
include_package_data=True,
Expand Down

0 comments on commit 441c95b

Please sign in to comment.