From 441c95b48db786ca4b9d57a28ac8f7ec0ae739bc Mon Sep 17 00:00:00 2001
From: TilMeh
Date: Mon, 16 Nov 2020 16:21:03 +0100
Subject: [PATCH] changed module and class names to PEP8 standard. added blacklist script

---
 airpg/Article_Mining.py                       |  4 +-
 airpg/Entrez_Interaction.py                   | 14 +--
 airpg/IR_Operations.py                        |  4 +-
 airpg/Table_IO.py                             |  5 +-
 airpg/{fetchpubmed.py => fetch_pubmed.py}     |  0
 .../{airgb_analyze.py => airpg_analyze.py}    | 16 ++--
 airpg/scripts/airpg_generate_blacklist.py     | 96 +++++++++++++++++++
 .../{airgb_retrieve.py => airpg_retrieve.py}  | 10 +-
 requirements.txt                              |  1 +
 setup.py                                      | 12 +--
 10 files changed, 129 insertions(+), 33 deletions(-)
 rename airpg/{fetchpubmed.py => fetch_pubmed.py} (100%)
 mode change 100755 => 100644
 rename airpg/scripts/{airgb_analyze.py => airpg_analyze.py} (96%)
 mode change 100755 => 100644
 create mode 100644 airpg/scripts/airpg_generate_blacklist.py
 rename airpg/scripts/{airgb_retrieve.py => airpg_retrieve.py} (96%)
 mode change 100755 => 100644

diff --git a/airpg/Article_Mining.py b/airpg/Article_Mining.py
index 58c2017..866b6dd 100755
--- a/airpg/Article_Mining.py
+++ b/airpg/Article_Mining.py
@@ -1,10 +1,10 @@
 import logging
 import re
 
-class Article_Mining:
+class ArticleMining:
 
     def __init__(self, logger):
-        self.log = logger or logging.getLogger(__name__ + ".Article_Mining")
+        self.log = logger or logging.getLogger(__name__ + ".ArticleMining")
 
     def get_abstract_text(self, article):
         '''
diff --git a/airpg/Entrez_Interaction.py b/airpg/Entrez_Interaction.py
index 9671d39..0f48ab3 100755
--- a/airpg/Entrez_Interaction.py
+++ b/airpg/Entrez_Interaction.py
@@ -1,14 +1,14 @@
 import os, subprocess, logging
 import xml.etree.ElementTree as ET
-from airpg import fetchpubmed
+from airpg.airpg import fetch_pubmed
 import entrezpy.conduit
 from ete3 import NCBITaxa
 from datetime import date
 
-class Entrez_Interaction:
+class EntrezInteraction:
 
     def __init__(self, logger = None):
-        self.log = logger or logging.getLogger(__name__ + ".Entrez_Interaction")
+        self.log = logger or logging.getLogger(__name__ + ".EntrezInteraction")
 
     def retrieve_uids(self, query, min_date = None):
         '''
@@ -132,10 +132,10 @@ def fetch_pubmed_articles(self, mail, query):
         '''
         articles = None
         cond = entrezpy.conduit.Conduit(mail)
-        fetch_pubmed = cond.new_pipeline()
-        sid = fetch_pubmed.add_search({'db': 'pubmed', 'term': query, 'rettype': 'count'})
-        fid = fetch_pubmed.add_fetch({'retmode':'xml'}, dependency=sid, analyzer=fetchpubmed.PubMedAnalyzer())
-        a = cond.run(fetch_pubmed)
+        fetch_pipe = cond.new_pipeline()
+        sid = fetch_pipe.add_search({'db': 'pubmed', 'term': query, 'rettype': 'count'})
+        fid = fetch_pipe.add_fetch({'retmode':'xml'}, dependency=sid, analyzer=fetch_pubmed.PubMedAnalyzer())
+        a = cond.run(fetch_pipe)
         result = a.get_result()
         if result.size() >= 1:
             articles = result.pubmed_records
diff --git a/airpg/IR_Operations.py b/airpg/IR_Operations.py
index 5b1c2b5..1f1d078 100755
--- a/airpg/IR_Operations.py
+++ b/airpg/IR_Operations.py
@@ -3,10 +3,10 @@
 from Bio.SeqRecord import SeqRecord
 from Bio.SeqFeature import SeqFeature, FeatureLocation
 
-class IR_Operations:
+class IROperations:
 
     def __init__(self, logger = None):
-        self.log = logger or logging.getLogger(__name__ + ".IR_Operations")
+        self.log = logger or logging.getLogger(__name__ + ".IROperations")
 
     ###############
     # I/O methods #
diff --git a/airpg/Table_IO.py b/airpg/Table_IO.py
index 2c77488..08587c5 100755
--- a/airpg/Table_IO.py
+++ b/airpg/Table_IO.py
@@ -1,11 +1,10 @@
 import os, logging
 import pandas as pd
-from airpg import fetchpubmed
 
-class Table_IO:
+class TableIO:
 
     def __init__(self, fp_entry_table, fp_ir_table = None, fp_blacklist = None, fp_duplicates = None, logger = None):
-        self.log = logger or logging.getLogger(__name__ + ".Table_IO")
+        self.log = logger or logging.getLogger(__name__ + ".TableIO")
         self.entry_table = None
         self.duplicates = {}
         self.ir_table = None
diff --git a/airpg/fetchpubmed.py b/airpg/fetch_pubmed.py
old mode 100755
new mode 100644
similarity index 100%
rename from airpg/fetchpubmed.py
rename to airpg/fetch_pubmed.py
diff --git a/airpg/scripts/airgb_analyze.py b/airpg/scripts/airpg_analyze.py
old mode 100755
new mode 100644
similarity index 96%
rename from airpg/scripts/airgb_analyze.py
rename to airpg/scripts/airpg_analyze.py
index ea34c34..c379594
--- a/airpg/scripts/airgb_analyze.py
+++ b/airpg/scripts/airpg_analyze.py
@@ -55,10 +55,10 @@
 from fuzzywuzzy import fuzz
 from ete3 import NCBITaxa
 from pathlib import Path
-from airpg import Entrez_Interaction
-from airpg import Table_IO
-from airpg import Article_Mining
-from airpg import IR_Operations
+from airpg.airpg import entrez_interaction
+from airpg.airpg import table_io
+from airpg.airpg import article_mining
+from airpg.airpg import ir_operations
 import pandas as pd
 import os, argparse
 import tarfile, coloredlogs, logging
@@ -95,11 +95,11 @@ def main(args):
     coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='INFO', logger=log)
     mail = args.mail
     query = args.query
-    iro = IR_Operations.IR_Operations(log)
-    EI = Entrez_Interaction.Entrez_Interaction(log)
+    iro = ir_operations.IROperations(log)
+    EI = entrez_interaction.EntrezInteraction(log)
 
     # STEP 2. Read in accession numbers to loop over
-    tio = Table_IO.Table_IO(args.infn, args.outfn, args.blacklist, logger = log)
+    tio = table_io.TableIO(args.infn, args.outfn, args.blacklist, logger = log)
     tio.remove_blacklisted_entries()
     accessions = list(tio.entry_table["ACCESSION"].values)
 
@@ -179,7 +179,7 @@ def main(args):
             os.remove(fp_entry)
 
     # STEP 4. Check any accession for IR loss and remove from outlist if necessary
-    am = Article_Mining.Article_Mining(log)
+    am = article_mining.ArticleMining(log)
     articles = EI.fetch_pubmed_articles(mail, query)
     ncbi = NCBITaxa()
     # Update database if it is older than 1 month
diff --git a/airpg/scripts/airpg_generate_blacklist.py b/airpg/scripts/airpg_generate_blacklist.py
new file mode 100644
index 0000000..98c8909
--- /dev/null
+++ b/airpg/scripts/airpg_generate_blacklist.py
@@ -0,0 +1,96 @@
+import os.path
+import argparse
+import coloredlogs, logging
+import time
+#import PlastomeIntegrityChecks as pic
+from ete3 import NCBITaxa
+from pathlib import Path
+from datetime import datetime
+
+# For suppressing console output
+import io
+from contextlib import redirect_stdout
+
+###############
+# AUTHOR INFO #
+###############
+__author__ = 'Michael Gruenstaeudl , '\
+             'Tilman Mehl '
+__copyright__ = 'Copyright (C) 2019 Michael Gruenstaeudl and Tilman Mehl'
+__info__ = 'Create or append a list of species names that are proven to lack one or more inverted repeats in their plastid genome'
+__version__ = '2020.08.20.1800'
+
+#############
+# DEBUGGING #
+#############
+import ipdb
+# ipdb.set_trace()
+
+def get_irl_clade_species(ncbi):
+    species_ids = []
+    irl_clade_tree = ncbi.get_topology([ncbi.get_name_translator(['IRL clade'])['IRL clade'][0]])
+    for leaf in irl_clade_tree.iter_leaves():
+        species_ids.append(int(leaf.name))
+    species = set(ncbi.translate_to_names(species_ids))
+    return species
+
+def get_irl_clade_genera(ncbi):
+    genera_ids = []
+    irl_clade_tree = ncbi.get_topology([ncbi.get_name_translator(['IRL clade'])['IRL clade'][0]])
+    for node in irl_clade_tree.iter_descendants():
+        if ncbi.get_rank([int(node.name)])[int(node.name)] == "genus":
+            genera_ids.append(int(node.name))
+    genera = set(ncbi.translate_to_names(genera_ids))
+    return genera
+
+def read_blacklist(fp_blacklist):
+    '''
+    Read a file of blacklisted genera/species.
+    Params:
+     - fp_blacklist: file path to input file
+    '''
+    blacklist = set()
+    with open(fp_blacklist, "r") as fh_blacklist:
+        for line in [l.strip() for l in fh_blacklist.readlines()]:
+            if not line.startswith("#"):
+                blacklist.add(line)
+    return blacklist
+
+def append_blacklist(fp_blacklist, blacklist):
+    with open(fp_blacklist, "a") as fh_blacklist:
+        fh_blacklist.write("# Update on %s\n" % datetime.now().strftime("%Y-%m-%d, %H:%M"))
+        for entry in blacklist:
+            fh_blacklist.write(entry + "\n")
+
+def main(args):
+
+    ## STEP 1. Initialize variables
+    ncbi = NCBITaxa()
+    # Update database if it is older than one month
+    if (time.time() - os.path.getmtime(os.path.join(Path.home(), ".etetoolkit/taxa.sqlite"))) > 2592000:
+        ncbi.update_taxonomy_database()
+    blacklist = set()
+    blacklist_existing = set()
+
+    ## STEP 2. Read blacklist if the file already exists
+    if os.path.isfile(args.file_blacklist):
+        print("Reading existing blacklist ...")
+        blacklist_existing = read_blacklist(args.file_blacklist)
+
+    ## STEP 3. Assemble species names of IRL clade of Fabaceae
+    print("\nFetching genus names of taxa in 'IRL clade' of Fabaceae ...")
+    irl_clade_genera = get_irl_clade_genera(ncbi)
+    print(" Adding new species names to blacklist ...")
+    blacklist = irl_clade_genera.difference(blacklist_existing)
+
+    ## STEP 4. Append only new taxon names to blacklist
+    print("\nCalculating and appending species names not previously in blacklist ...")
+    append_blacklist(args.file_blacklist, blacklist)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=" -- ".join([__author__, __copyright__, __info__, __version__]))
+    parser.add_argument("-f", "--file_blacklist", type=str, required=True, help="path to blacklist file")
+    parser.add_argument("-q", "--query", type=str, required=False, default="inverted[TITLE] AND repeat[TITLE] AND loss[TITLE]", help="query used to fetch PMC articles that will be scanned for species with missing IRs")
+    args = parser.parse_args()
+    main(args)
diff --git a/airpg/scripts/airgb_retrieve.py b/airpg/scripts/airpg_retrieve.py
old mode 100755
new mode 100644
similarity index 96%
rename from airpg/scripts/airgb_retrieve.py
rename to airpg/scripts/airpg_retrieve.py
index 47d0f4e..0daa1c5
--- a/airpg/scripts/airgb_retrieve.py
+++ b/airpg/scripts/airpg_retrieve.py
@@ -36,8 +36,8 @@
 import argparse
 import coloredlogs, logging
 #import pandas as pd
-from airpg import Entrez_Interaction
-from airpg import Table_IO
+from airpg import entrez_interaction
+from airpg import table_io
 from datetime import datetime
 
 ###############
@@ -67,7 +67,7 @@ def main(args):
     log = logging.getLogger(__name__)
     coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='DEBUG', logger=log)
 
-    EI = Entrez_Interaction.Entrez_Interaction(log)
+    EI = entrez_interaction.EntrezInteraction(log)
 
     # STEP 2. Check if output file already exists, read existing UIDs, infer mindate
     uids_already_processed = []
@@ -75,9 +75,9 @@
     outfn = os.path.abspath(args.outfn)
 
     if args.blacklist:
-        tio = Table_IO.Table_IO(outfn, fp_blacklist = args.blacklist, logger = log)
+        tio = table_io.TableIO(outfn, fp_blacklist = args.blacklist, logger = log)
     else:
-        tio = Table_IO.Table_IO(outfn, logger = log)
+        tio = table_io.TableIO(outfn, logger = log)
     fp_duplicates = os.path.join(os.path.dirname(outfn), os.path.basename(outfn) + ".duplicates")
     if os.path.isfile(fp_duplicates):
         tio.read_duplicates(fp_duplicates)
diff --git a/requirements.txt b/requirements.txt
index 881346a..ce5a197 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ biopython>=1.72
 argparse>=1.4.0
 ete3
 pandas
+entrezpy
diff --git a/setup.py b/setup.py
index 4c95888..3530aea 100755
--- a/setup.py
+++ b/setup.py
@@ -24,14 +24,14 @@
     python_requires='>=3.6',
     keywords='plastid genomes, inverted repeats, NCBI Nucleotide',
     license='BSD',
-    entry_points={
-        'console_scripts': [
-            'airpg_retrieve', 'airpg_analyze' # @TM: May need to be specified differently!
-        ],
-    },
+    entry_points='''
+        [console_scripts]
+        airpg_retrieve=AIRPG.scripts.airpg_retrieve:main
+        airpg_analyze=AIRPG.scripts.airpg_analyze:main
+    ''',
     packages=['airpg'], # So that the subfolder 'airpg' is read immediately.
     #packages = find_packages(),
-    install_requires=['biopython', 'ete3', 'argparse', 'pandas'],
+    install_requires=['biopython', 'ete3', 'entrezpy', 'pandas'],
     scripts=glob.glob('scripts/*'),
     test_suite='setup.my_test_suite',
     include_package_data=True,