diff --git a/README.md b/README.md index f26e6e2..b0738d5 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,14 @@ will create different taxIDs. # Install +## Dependencies + +* numpy +* networkx +* taxonkit + +## Package + ### From pypi ``` diff --git a/bin/gtdb_to_diamond.py b/bin/gtdb_to_diamond.py index 80ab116..9b2df5b 100755 --- a/bin/gtdb_to_diamond.py +++ b/bin/gtdb_to_diamond.py @@ -56,93 +56,11 @@ help='Keep temporary output? (Default: %(default)s)') parser.add_argument('--version', action='version', version='0.0.1') +# logging logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) -def copy_nodes(infile, outdir): - """ - Simple copy of nodes.dmp file into the output directory - """ - logging.info('Read nodes.dmp file: {}'.format(infile)) - outfile = os.path.join(outdir, 'nodes.dmp') - if infile == outfile: - raise IOError('Input == Output: {} <=> {}'.format(infile, outfile)) - with open(infile) as inF, open(outfile, 'w') as outF: - for line in inF: - outF.write(line) - logging.info('File written: {}'.format(outfile)) - -def read_names_dmp(infile, outdir): - """ - Reading names.dmp file - """ - outfile = os.path.join(outdir, 'names.dmp') - regex = re.compile(r'\t\|\t') - regexp = re.compile(r'^[^_]+_|_') - names_dmp = {} - logging.info('Reading dumpfile: {}'.format(infile)) - with open(infile) as inF, open(outfile, 'w') as outF: - for line in inF: - line = regex.split(line.rstrip()) - if len(line) >= 2: - line[1] = regexp.sub('', line[1]) # accession - names_dmp[line[1]] = line[0] - outF.write('\t|\t'.join(line) + '\n') - - logging.info(' File written: {}'.format(outfile)) - msg = ' No. of accession<=>taxID pairs: {}' - logging.info(msg.format(len(names_dmp.keys()))) - return names_dmp - -def faa_gz_files(members): - """ - Getting .faa.gz files from the tarball - """ - for tarinfo in members: - for ext in ('.faa.gz', '.faa'): - if tarinfo.name.endswith(ext): - yield tarinfo - -def faa_gz_index(directory='.', extensions=['.faa', '.faa.gz']): - """ - Creating {accession:faa_file} index from extracted tarball files - """ - extensions = set(extensions) - regex = re.compile(r'(_protein\.faa\.gz|_protein\.faa)$') - regexp = re.compile(r'^[^_]+_|_') - found = {} - for dirpath, dirnames, files in os.walk(directory): - for name in files: - for ext in extensions: - if name.lower().endswith(ext): - accession = regexp.sub('', regex.sub('', name)) - found[accession] = os.path.join(dirpath, name) - continue - return found - -def uncomp_tarball(tarball_file, tmp_dir): - """ - Extracting info from the tarball - """ - # tmp dir - if os.path.isdir(tmp_dir): - shutil.rmtree(tmp_dir) - os.makedirs(tmp_dir) - # extracting tarball - logging.info('Extracting tarball: {}'.format(tarball_file)) - logging.info(' Extracting to: {}'.format(tmp_dir)) - tar = tarfile.open(tarball_file) - tar.extractall(path=tmp_dir, members=faa_gz_files(tar)) - tar.close() - # listing files - faa_files = faa_gz_index(tmp_dir, ['.faa', '.faa.gz']) - n_files = len(faa_files.keys()) - msg = ' No. of .faa(.gz) files: {}' - logging.info(msg.format(n_files)) - if n_files == 0: - logging.warning(' No .faa(.gz) files found!') - return faa_files - +# functions def accession2taxid(names_dmp, faa_files, outdir): """ Creating accession2taxid table @@ -189,7 +107,8 @@ def faa_merge(faa_files, outdir, gzip_out=False): outF.write(line) logging.info(' File written: {}'.format(outfile)) logging.info(' No. of seqs. written: {}'.format(seq_cnt)) - + +## main interface def main(args): """ Main interface @@ -197,11 +116,11 @@ def main(args): if not os.path.isdir(args.outdir): os.makedirs(args.outdir) # copying nodes - copy_nodes(args.nodes_dmp, args.outdir) + gtdb2td.Dmp.copy_nodes(args.nodes_dmp, args.outdir) # reading in names.dmp - names_dmp = read_names_dmp(args.names_dmp, args.outdir) + names_dmp = gtdb2td.Dmp.read_names_dmp(args.names_dmp, args.outdir) # uncompressing tarball of faa fasta files - faa_files = uncomp_tarball(args.faa_tarball, args.tmpdir) + faa_files = gtdb2td.IO.uncomp_tarball(args.faa_tarball, args.tmpdir) # create accession2taxid accession2taxid(names_dmp, faa_files, args.outdir) # creating combined faa fasta @@ -210,7 +129,8 @@ def main(args): if not args.keep_temp and os.path.isdir(args.tmpdir): shutil.rmtree(args.tmpdir) logging.info('Temp-dir removed: {}'.format(args.tmpdir)) - + +# script main if __name__ == '__main__': args = parser.parse_args() main(args) diff --git a/bin/gtdb_to_taxdump.py b/bin/gtdb_to_taxdump.py index 65dac99..d8accbf 100755 --- a/bin/gtdb_to_taxdump.py +++ b/bin/gtdb_to_taxdump.py @@ -56,44 +56,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) - -def find_all_paths(self, start_vertex, end_vertex, path=[]): - """ find all paths from start_vertex to - end_vertex in graph """ - graph = self.__graph_dict - path = path + [start_vertex] - if start_vertex == end_vertex: - return [path] - if start_vertex not in graph: - return [] - paths = [] - for vertex in graph[start_vertex]: - if vertex not in path: - extended_paths = self.find_all_paths(vertex, - end_vertex, - path) - for p in extended_paths: - paths.append(p) - return paths - -def get_url_data(url): - """ - Downloading data from url; assuming gzip - """ - req = urllib.request.Request(url) - req.add_header('Accept-Encoding', 'gzip') - response = urllib.request.urlopen(req) - content = gzip.decompress(response.read()) - return content.splitlines() - def load_gtdb_tax(infile, graph): """ loading gtdb taxonomy & adding to DAG """ # url or file download/open try: - inF = get_url_data(infile) + inF = gtdb2td.Utils.get_url_data(infile) except (OSError,ValueError) as e: try: ftpstream = urllib.request.urlopen(infile) diff --git a/bin/lineage2taxid.py b/bin/lineage2taxid.py index 2645337..e7a3cd9 100755 --- a/bin/lineage2taxid.py +++ b/bin/lineage2taxid.py @@ -67,53 +67,6 @@ class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, parser.add_argument('--version', action='version', version='0.0.1') # functions -def load_dmp(names_dmp_file, nodes_dmp_file): - """ - Loading NCBI names/nodes dmp files as DAG - Arguments: - names_dmp_file : str, names.dmp file - nodes_dmp_file : str, nodes.dmp file - Return: - network.DiGraph object - """ - regex = re.compile(r'\t\|\t') - # nodes - logging.info('Loading file: {}'.format(names_dmp_file)) - idx = {} # {taxid : name} - with gtdb2td.Utils.Open(names_dmp_file) as inF: - for line in inF: - line = line.rstrip() - if line == '': - continue - line = regex.split(line) - idx[int(line[0])] = line[1].lower() - # names - logging.info('Loading file: {}'.format(nodes_dmp_file)) - G = nx.DiGraph() - G.add_node(0, rank = 'root', name = 'root') - with gtdb2td.Utils.Open(nodes_dmp_file) as inF: - for line in inF: - line = line.rstrip() - if line == '': - continue - line = regex.split(line) - taxid_child = int(line[0]) - taxid_parent = int(line[1]) - rank_child = line[2] - name_child = idx[taxid_child].lower() - name_parent = idx[taxid_parent].lower() - # adding node - G.add_node(taxid_child, rank=rank_child, name=name_child) - # adding edge - if taxid_parent == 1: - G.add_edge(0, taxid_child) - else: - G.add_edge(taxid_parent, taxid_child) - idx.clear() - logging.info(' No. of nodes: {}'.format(G.number_of_nodes())) - logging.info(' No. of edges: {}'.format(G.number_of_edges())) - return G - def lineage2taxid(lineage, G): lineage = lineage.split(';') for cls in lineage[::-1]: @@ -152,18 +105,20 @@ def parse_lineage_table(table_file, lineage_column, G, # status if i > 0 and (i+1) % 100 == 0: logging.info(' Records processed: {}'.format(i+1)) - + +## main interface def main(args): """ Main interface """ # loading dmp as DAG - G = load_dmp(args.names_dmp, args.nodes_dmp) + G = gtdb2td.Dmp.load_dmp(args.names_dmp, args.nodes_dmp) # lineage2taxid parse_lineage_table(args.table_file, args.lineage_column, G=G, taxid_column = args.taxid_column, taxid_rank_column = args.taxid_rank_column) - + +# script main if __name__ == '__main__': args = parser.parse_args() main(args) diff --git a/bin/ncbi-gtdb_map.py b/bin/ncbi-gtdb_map.py index 90b32a3..698eb90 100755 --- a/bin/ncbi-gtdb_map.py +++ b/bin/ncbi-gtdb_map.py @@ -133,54 +133,56 @@ logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) -def load_dmp(names_dmp_file, nodes_dmp_file, no_prefix=False): - """ - Loading NCBI names/nodes dmp files as DAG - Arguments: - names_dmp_file : str, names.dmp file - nodes_dmp_file : str, nodes.dmp file - Return: - network.DiGraph object - """ - regex = re.compile(r'\t\|\t') - # nodes - logging.info('Loading file: {}'.format(names_dmp_file)) - idx = {} # {taxid : name} - with open(names_dmp_file) as inF: - for line in inF: - line = line.rstrip() - if line == '': - continue - line = regex.split(line) - idx[int(line[0])] = line[1] - # names - logging.info('Loading file: {}'.format(nodes_dmp_file)) - G = nx.DiGraph() - G.add_node(0, rank = 'root', name = 'root') - with open(nodes_dmp_file) as inF: - for line in inF: - line = line.rstrip() - if line == '': - continue - line = regex.split(line) - taxid_child = int(line[0]) - taxid_parent = int(line[1]) - rank_child = line[2] - name_child = idx[taxid_child] - name_parent = idx[taxid_parent] - if rank_child == 'species': - name_child = 's__' + name_child - # adding node - G.add_node(taxid_child, rank=rank_child, name=name_child) - # adding edge - if taxid_parent == 1: - G.add_edge(0, taxid_child) - else: - G.add_edge(taxid_parent, taxid_child) - idx.clear() - logging.info(' No. of nodes: {}'.format(G.number_of_nodes())) - logging.info(' No. of edges: {}'.format(G.number_of_edges())) - return G +# functions + +# def load_dmp(names_dmp_file, nodes_dmp_file, no_prefix=False): +# """ +# Loading NCBI names/nodes dmp files as DAG +# Arguments: +# names_dmp_file : str, names.dmp file +# nodes_dmp_file : str, nodes.dmp file +# Return: +# network.DiGraph object +# """ +# regex = re.compile(r'\t\|\t') +# # nodes +# logging.info('Loading file: {}'.format(names_dmp_file)) +# idx = {} # {taxid : name} +# with open(names_dmp_file) as inF: +# for line in inF: +# line = line.rstrip() +# if line == '': +# continue +# line = regex.split(line) +# idx[int(line[0])] = line[1] +# # names +# logging.info('Loading file: {}'.format(nodes_dmp_file)) +# G = nx.DiGraph() +# G.add_node(0, rank = 'root', name = 'root') +# with open(nodes_dmp_file) as inF: +# for line in inF: +# line = line.rstrip() +# if line == '': +# continue +# line = regex.split(line) +# taxid_child = int(line[0]) +# taxid_parent = int(line[1]) +# rank_child = line[2] +# name_child = idx[taxid_child] +# name_parent = idx[taxid_parent] +# if rank_child == 'species': +# name_child = 's__' + name_child +# # adding node +# G.add_node(taxid_child, rank=rank_child, name=name_child) +# # adding edge +# if taxid_parent == 1: +# G.add_edge(0, taxid_child) +# else: +# G.add_edge(taxid_parent, taxid_child) +# idx.clear() +# logging.info(' No. of nodes: {}'.format(G.number_of_nodes())) +# logging.info(' No. of edges: {}'.format(G.number_of_edges())) +# return G def format_taxonomy(T, hierarchy, acc, no_prefix=False): """ @@ -594,7 +596,7 @@ def main(args): """ # loading ncbi dmp files if provided if args.names_dmp is not None and args.nodes_dmp is not None: - ncbi_tax = load_dmp(args.names_dmp, args.nodes_dmp) + ncbi_tax = gtdb2td.Dmp.load_dmp(args.names_dmp, args.nodes_dmp) else: ncbi_tax = None # loading the metadata as graphs diff --git a/gtdb2td/Dmp.py b/gtdb2td/Dmp.py new file mode 100644 index 0000000..c8b8211 --- /dev/null +++ b/gtdb2td/Dmp.py @@ -0,0 +1,104 @@ +# import +## batteries +import os +import re +import sys +import gzip +import glob +import shutil +import argparse +import logging +import urllib.request +import codecs +import tarfile +from collections import OrderedDict +## 3rd party +import networkx as nx +from networkx.algorithms.dag import descendants +from networkx.algorithms.lowest_common_ancestors import lowest_common_ancestor +from networkx.algorithms.shortest_paths.unweighted import bidirectional_shortest_path + +# functions +def copy_nodes(infile, outdir): + """ + Simple copy of nodes.dmp file into the output directory + """ + logging.info('Read nodes.dmp file: {}'.format(infile)) + outfile = os.path.join(outdir, 'nodes.dmp') + if infile == outfile: + raise IOError('Input == Output: {} <=> {}'.format(infile, outfile)) + with open(infile) as inF, open(outfile, 'w') as outF: + for line in inF: + outF.write(line) + logging.info('File written: {}'.format(outfile)) + +def read_names_dmp(infile, outdir): + """ + Reading names.dmp file + """ + outfile = os.path.join(outdir, 'names.dmp') + regex = re.compile(r'\t\|\t') + regexp = re.compile(r'^[^_]+_|_') + names_dmp = {} + logging.info('Reading dumpfile: {}'.format(infile)) + with open(infile) as inF, open(outfile, 'w') as outF: + for line in inF: + line = regex.split(line.rstrip()) + if len(line) >= 2: + line[1] = regexp.sub('', line[1]) # accession + names_dmp[line[1]] = line[0] + outF.write('\t|\t'.join(line) + '\n') + + logging.info(' File written: {}'.format(outfile)) + msg = ' No. of accession<=>taxID pairs: {}' + logging.info(msg.format(len(names_dmp.keys()))) + return names_dmp + +def load_dmp(names_dmp_file, nodes_dmp_file): + """ + Loading NCBI names/nodes dmp files as DAG + Arguments: + names_dmp_file : str, names.dmp file + nodes_dmp_file : str, nodes.dmp file + Return: + network.DiGraph object + """ + regex = re.compile(r'\t\|\t') + # nodes + logging.info('Loading file: {}'.format(names_dmp_file)) + idx = {} # {taxid : name} + with gtdb2td.Utils.Open(names_dmp_file) as inF: + for line in inF: + line = line.rstrip() + if line == '': + continue + line = regex.split(line) + idx[int(line[0])] = line[1].lower() + # names + logging.info('Loading file: {}'.format(nodes_dmp_file)) + G = nx.DiGraph() + G.add_node(0, rank = 'root', name = 'root') + with gtdb2td.Utils.Open(nodes_dmp_file) as inF: + for line in inF: + line = line.rstrip() + if line == '': + continue + line = regex.split(line) + taxid_child = int(line[0]) + taxid_parent = int(line[1]) + rank_child = line[2] + name_child = idx[taxid_child].lower() + name_parent = idx[taxid_parent].lower() + # adding node + G.add_node(taxid_child, rank=rank_child, name=name_child) + # adding edge + if taxid_parent == 1: + G.add_edge(0, taxid_child) + else: + G.add_edge(taxid_parent, taxid_child) + idx.clear() + logging.info(' No. of nodes: {}'.format(G.number_of_nodes())) + logging.info(' No. of edges: {}'.format(G.number_of_edges())) + return G + + diff --git a/gtdb2td/Graph.py b/gtdb2td/Graph.py index 833225c..cf36ef7 100644 --- a/gtdb2td/Graph.py +++ b/gtdb2td/Graph.py @@ -203,3 +203,23 @@ def append_tbl(self, table_file, join_column): outF.write('\t'.join(line) + '\n') logging.info('File written: {}'.format(out_file)) + def find_all_paths(self, start_vertex, end_vertex, path=[]): + """ + find all paths from start_vertex to end_vertex in graph + """ + graph = self.__graph_dict + path = path + [start_vertex] + if start_vertex == end_vertex: + return [path] + if start_vertex not in graph: + return [] + paths = [] + for vertex in graph[start_vertex]: + if vertex not in path: + extended_paths = self.find_all_paths(vertex, + end_vertex, + path) + for p in extended_paths: + paths.append(p) + return paths + diff --git a/gtdb2td/IO.py b/gtdb2td/IO.py new file mode 100644 index 0000000..634ad1b --- /dev/null +++ b/gtdb2td/IO.py @@ -0,0 +1,61 @@ +import os +import re +import sys +import gzip +import glob +import shutil +import argparse +import logging +import urllib.request +import codecs +import tarfile +from collections import OrderedDict + +def faa_gz_files(members): + """ + Getting .faa.gz files from the tarball + """ + for tarinfo in members: + for ext in ('.faa.gz', '.faa'): + if tarinfo.name.endswith(ext): + yield tarinfo + +def faa_gz_index(directory='.', extensions=['.faa', '.faa.gz']): + """ + Creating {accession:faa_file} index from extracted tarball files + """ + extensions = set(extensions) + regex = re.compile(r'(_protein\.faa\.gz|_protein\.faa)$') + regexp = re.compile(r'^[^_]+_|_') + found = {} + for dirpath, dirnames, files in os.walk(directory): + for name in files: + for ext in extensions: + if name.lower().endswith(ext): + accession = regexp.sub('', regex.sub('', name)) + found[accession] = os.path.join(dirpath, name) + continue + return found + +def uncomp_tarball(tarball_file, tmp_dir): + """ + Extracting info from the tarball + """ + # tmp dir + if os.path.isdir(tmp_dir): + shutil.rmtree(tmp_dir) + os.makedirs(tmp_dir) + # extracting tarball + logging.info('Extracting tarball: {}'.format(tarball_file)) + logging.info(' Extracting to: {}'.format(tmp_dir)) + tar = tarfile.open(tarball_file) + tar.extractall(path=tmp_dir, members=faa_gz_files(tar)) + tar.close() + # listing files + faa_files = faa_gz_index(tmp_dir, ['.faa', '.faa.gz']) + n_files = len(faa_files.keys()) + msg = ' No. of .faa(.gz) files: {}' + logging.info(msg.format(n_files)) + if n_files == 0: + logging.warning(' No .faa(.gz) files found!') + return faa_files diff --git a/gtdb2td/Utils.py b/gtdb2td/Utils.py index 1d4a4a3..22c4abe 100644 --- a/gtdb2td/Utils.py +++ b/gtdb2td/Utils.py @@ -30,3 +30,13 @@ def Open(infile, mode='rb'): return gzip.open(infile, mode) else: return open(infile, mode) + +def get_url_data(url): + """ + Downloading data from url; assuming gzip + """ + req = urllib.request.Request(url) + req.add_header('Accept-Encoding', 'gzip') + response = urllib.request.urlopen(req) + content = gzip.decompress(response.read()) + return content.splitlines() diff --git a/gtdb2td/__init__.py b/gtdb2td/__init__.py index 5f399e2..be29226 100644 --- a/gtdb2td/__init__.py +++ b/gtdb2td/__init__.py @@ -2,4 +2,6 @@ __author__ = "Nick Youngblut" from gtdb2td.Graph import Graph +from gtdb2td import Dmp +from gtdb2td import IO from gtdb2td import Utils