Skip to content

Commit

Permalink
Changed module and class names to the PEP 8 standard; added blacklist script
Browse files Browse the repository at this point in the history
  • Loading branch information
TilMeh committed Nov 16, 2020
1 parent 4ef3c2c commit 441c95b
Show file tree
Hide file tree
Showing 10 changed files with 129 additions and 33 deletions.
4 changes: 2 additions & 2 deletions airpg/Article_Mining.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import logging
import re

class Article_Mining:
class ArticleMining:

def __init__(self, logger):
self.log = logger or logging.getLogger(__name__ + ".Article_Mining")
self.log = logger or logging.getLogger(__name__ + ".ArticleMining")

def get_abstract_text(self, article):
'''
Expand Down
14 changes: 7 additions & 7 deletions airpg/Entrez_Interaction.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import os, subprocess, logging
import xml.etree.ElementTree as ET
from airpg import fetchpubmed
from airpg.airpg import fetch_pubmed
import entrezpy.conduit
from ete3 import NCBITaxa
from datetime import date

class Entrez_Interaction:
class EntrezInteraction:

def __init__(self, logger = None):
self.log = logger or logging.getLogger(__name__ + ".Entrez_Interaction")
self.log = logger or logging.getLogger(__name__ + ".EntrezInteraction")

def retrieve_uids(self, query, min_date = None):
'''
Expand Down Expand Up @@ -132,10 +132,10 @@ def fetch_pubmed_articles(self, mail, query):
'''
articles = None
cond = entrezpy.conduit.Conduit(mail)
fetch_pubmed = cond.new_pipeline()
sid = fetch_pubmed.add_search({'db': 'pubmed', 'term': query, 'rettype': 'count'})
fid = fetch_pubmed.add_fetch({'retmode':'xml'}, dependency=sid, analyzer=fetchpubmed.PubMedAnalyzer())
a = cond.run(fetch_pubmed)
fetch_pipe = cond.new_pipeline()
sid = fetch_pipe.add_search({'db': 'pubmed', 'term': query, 'rettype': 'count'})
fid = fetch_pipe.add_fetch({'retmode':'xml'}, dependency=sid, analyzer=fetch_pubmed.PubMedAnalyzer())
a = cond.run(fetch_pipe)
result = a.get_result()
if result.size() >= 1:
articles = result.pubmed_records
Expand Down
4 changes: 2 additions & 2 deletions airpg/IR_Operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation

class IR_Operations:
class IROperations:

def __init__(self, logger = None):
self.log = logger or logging.getLogger(__name__ + ".IR_Operations")
self.log = logger or logging.getLogger(__name__ + ".IROperations")

###############
# I/O methods #
Expand Down
5 changes: 2 additions & 3 deletions airpg/Table_IO.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import os, logging
import pandas as pd
from airpg import fetchpubmed

class Table_IO:
class TableIO:

def __init__(self, fp_entry_table, fp_ir_table = None, fp_blacklist = None, fp_duplicates = None, logger = None):
self.log = logger or logging.getLogger(__name__ + ".Table_IO")
self.log = logger or logging.getLogger(__name__ + ".TableIO")
self.entry_table = None
self.duplicates = {}
self.ir_table = None
Expand Down
File renamed without changes.
16 changes: 8 additions & 8 deletions airpg/scripts/airgb_analyze.py → airpg/scripts/airpg_analyze.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@
from fuzzywuzzy import fuzz
from ete3 import NCBITaxa
from pathlib import Path
from airpg import Entrez_Interaction
from airpg import Table_IO
from airpg import Article_Mining
from airpg import IR_Operations
from airpg.airpg import entrez_interaction
from airpg.airpg import table_io
from airpg.airpg import article_mining
from airpg.airpg import ir_operations
import pandas as pd
import os, argparse
import tarfile, coloredlogs, logging
Expand Down Expand Up @@ -95,11 +95,11 @@ def main(args):
coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='INFO', logger=log)
mail = args.mail
query = args.query
iro = IR_Operations.IR_Operations(log)
EI = Entrez_Interaction.Entrez_Interaction(log)
iro = ir_operations.IROperations(log)
EI = entrez_interaction.EntrezInteraction(log)

# STEP 2. Read in accession numbers to loop over
tio = Table_IO.Table_IO(args.infn, args.outfn, args.blacklist, logger = log)
tio = table_io.TableIO(args.infn, args.outfn, args.blacklist, logger = log)
tio.remove_blacklisted_entries()

accessions = list(tio.entry_table["ACCESSION"].values)
Expand Down Expand Up @@ -179,7 +179,7 @@ def main(args):
os.remove(fp_entry)

# STEP 4. Check any accession for IR loss and remove from outlist if necessary
am = Article_Mining.Article_Mining(log)
am = article_mining.ArticleMining(log)
articles = EI.fetch_pubmed_articles(mail, query)
ncbi = NCBITaxa()
# Update database if it is older than 1 month
Expand Down
96 changes: 96 additions & 0 deletions airpg/scripts/airpg_generate_blacklist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import os.path
import argparse
import coloredlogs, logging
import time
#import PlastomeIntegrityChecks as pic
from ete3 import NCBITaxa
from pathlib import Path
from datetime import datetime

# For suppressing console output
import io
from contextlib import redirect_stdout

###############
# AUTHOR INFO #
###############
__author__ = 'Michael Gruenstaeudl <m.gruenstaeudl@fu-berlin.de>, '\
'Tilman Mehl <tilmanmehl@zedat.fu-berlin.de>'
__copyright__ = 'Copyright (C) 2019 Michael Gruenstaeudl and Tilman Mehl'
__info__ = 'Create or append a list of species names that are proven to lack one or more inverted repeats in their plastid genome'
__version__ = '2020.08.20.1800'

#############
# DEBUGGING #
#############
import ipdb
# ipdb.set_trace()

def get_irl_clade_species(ncbi):
species_ids = []
irl_clade_tree = ncbi.get_topology([ncbi.get_name_translator(['IRL clade'])['IRL clade'][0]])
for leaf in irl_clade_tree.iter_leaves():
species_ids.append(int(leaf.name))
species = set(ncbi.translate_to_names(species_ids))
return species

def get_irl_clade_genera(ncbi):
genera_ids = []
irl_clade_tree = ncbi.get_topology([ncbi.get_name_translator(['IRL clade'])['IRL clade'][0]])
for node in irl_clade_tree.iter_descendants():
if ncbi.get_rank([int(node.name)])[int(node.name)] == "genus":
genera_ids.append(int(node.name))
genera = set(ncbi.translate_to_names(genera_ids))
return genera

def read_blacklist(fp_blacklist):
'''
Read a file of blacklisted genera/species.
Params:
- fp_blacklist: file path to input file
'''
blacklist = set()
with open(fp_blacklist, "r") as fh_blacklist:
for line in [l.strip() for l in fh_blacklist.readlines()]:
if not line.startswith("#"):
blacklist.add(line)
return blacklist

def append_blacklist(fp_blacklist, blacklist):
with open(fp_blacklist, "a") as fh_blacklist:
fh_blacklist.write("# Update on %s\n" % datetime.now().strftime("%Y-%m-%d, %H:%M"))
for entry in blacklist:
fh_blacklist.write(entry + "\n")

def main(args):

## STEP 1. Initialize variables
ncbi = NCBITaxa()
# Update database if it is older than one month
if (time.time() - os.path.getmtime(os.path.join(Path.home(), ".etetoolkit/taxa.sqlite"))) > 2592000:
ncbi.update_taxonomy_database()
blacklist = set()
blacklist_existing = set()

## STEP 2. Read blacklist if the file already exists
if os.path.isfile(args.file_blacklist):
print("Reading existing blacklist ...")
blacklist_existing = read_blacklist(args.file_blacklist)

## STEP 3. Assemble species names of IRL clade of Fabaceae
print("\nFetching genus names of taxa in 'IRL clade' of Fabaceae ...")
irl_clade_genera = get_irl_clade_genera(ncbi)
print(" Adding new species names to blacklist ...")
blacklist = irl_clade_genera.difference(blacklist_existing)

## STEP 4. Append only new taxon names to blacklist
print("\nCalculating and appending species names not previously in blacklist ...")
append_blacklist(args.file_blacklist, blacklist)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=" -- ".join([__author__, __copyright__, __info__, __version__]))
parser.add_argument("-f", "--file_blacklist", type=str, required=True, help="path to blacklist file")
parser.add_argument("-q", "--query", type=str, required=False, default="inverted[TITLE] AND repeat[TITLE] AND loss[TITLE]", help="query used to fetch PMC articles that will be scanned for species with missing IRs")
args = parser.parse_args()
main(args)
10 changes: 5 additions & 5 deletions airpg/scripts/airgb_retrieve.py → airpg/scripts/airpg_retrieve.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@
import argparse
import coloredlogs, logging
#import pandas as pd
from airpg import Entrez_Interaction
from airpg import Table_IO
from airpg import entrez_interaction
from airpg import table_io
from datetime import datetime

###############
Expand Down Expand Up @@ -67,17 +67,17 @@ def main(args):
log = logging.getLogger(__name__)
coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='DEBUG', logger=log)

EI = Entrez_Interaction.Entrez_Interaction(log)
EI = entrez_interaction.EntrezInteraction(log)

# STEP 2. Check if output file already exists, read existing UIDs, infer mindate
uids_already_processed = []
min_date = None
outfn = os.path.abspath(args.outfn)

if args.blacklist:
tio = Table_IO.Table_IO(outfn, fp_blacklist = args.blacklist, logger = log)
tio = table_io.TableIO(outfn, fp_blacklist = args.blacklist, logger = log)
else:
tio = Table_IO.Table_IO(outfn, logger = log)
tio = table_io.TableIO(outfn, logger = log)
fp_duplicates = os.path.join(os.path.dirname(outfn), os.path.basename(outfn) + ".duplicates")
if os.path.isfile(fp_duplicates):
tio.read_duplicates(fp_duplicates)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ biopython>=1.72
argparse>=1.4.0
ete3
pandas
entrezpy
12 changes: 6 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@
python_requires='>=3.6',
keywords='plastid genomes, inverted repeats, NCBI Nucleotide',
license='BSD',
entry_points={
'console_scripts': [
'airpg_retrieve', 'airpg_analyze' # @TM: May need to be specified differently!
],
},
entry_points='''
[console_scripts]
airpg_retrieve=AIRPG.scripts.airpg_retrieve:main
airpg_analyze=AIRPG.scripts.airpg_analyze:main
''',
packages=['airpg'], # So that the subfolder 'airpg' is read immediately.
#packages = find_packages(),
install_requires=['biopython', 'ete3', 'argparse', 'pandas'],
install_requires=['biopython', 'ete3', 'entrezpy', 'pandas'],
scripts=glob.glob('scripts/*'),
test_suite='setup.my_test_suite',
include_package_data=True,
Expand Down

0 comments on commit 441c95b

Please sign in to comment.