-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfasta_extract_from_list.py
91 lines (64 loc) · 2.66 KB
/
fasta_extract_from_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
from Bio import SeqIO
import argparse
import sys
from jakomics import colors, utilities
import jak_utils
# OPTIONS #####################################################################
# Command-line interface. The options are registered in a fixed order so the
# generated --help text lists them the same way every time.
parser = argparse.ArgumentParser(
    description='Give it a multifasta input (-f) and a list.txt (-l) with identifiers, and it returns the sequences',
)

# Where the fasta input comes from: a directory and/or explicit file paths.
parser.add_argument('--in_dir', default="", required=False,
                    help="Directory with fasta files")
parser.add_argument('-f', '--files', nargs='*', default=[], required=False,
                    help="Paths to individual fasta files")

# Mandatory arguments: the identifier list to read and the fasta to write.
parser.add_argument('-l', '--list', required=True,
                    help="List of headers to include")
parser.add_argument('-o', '--out', required=True,
                    help="Fasta out")

# Which column of the list file holds the identifiers (first column if omitted).
parser.add_argument('-c', '--column', type=int, default=0, required=False,
                    help="0-based column position")

# Flag that suppresses the per-identifier "Not Found" report at the end.
parser.add_argument('--quiet', action='store_true',
                    help='display less')

# Parsed once at module level; the rest of the script reads this `args` object.
args = parser.parse_args()
# FUNCTIONS ###################################################################
def main(fasta, list, column):
    """Collect the sequences named in a list file from one fasta file.

    The fasta file is parsed into an id -> record dictionary, the list file
    is read as a headerless tab-separated table, and every value found in
    the requested column is looked up in that dictionary.

    Results are reported through module-level state rather than a return
    value: matching records are appended to ``collected_sequences`` and
    counted in ``found``; identifiers without a match are appended to ``no``.

    Args:
        fasta: fasta input handed to ``SeqIO.parse``.
        list: path of the tab-separated file holding the wanted identifiers.
            (The name shadows the builtin; it is kept so existing callers
            that pass it by keyword keep working.)
        column: 0-based position of the identifier column in ``list``.
    """
    global found

    original_sequences = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    # Read the identifiers strictly as text. With pandas' default type
    # inference an all-numeric column (e.g. "42", "00123") is converted to
    # numbers, which can never equal the string keys of original_sequences,
    # and identifiers such as "NA" or "null" are turned into NaN.
    df = pd.read_csv(
        list,
        sep="\t",
        header=None,
        dtype=str,
        keep_default_na=False,
    )
    wanted_sequences = df.iloc[:, column]

    for seq_id in wanted_sequences:
        if seq_id in original_sequences:
            found += 1
            collected_sequences.append(original_sequences[seq_id])
        else:
            no.append(seq_id)
# MAIN ########################################################################
if __name__ == "__main__":
    jak_utils.header()

    # Module-level accumulators that main() appends to / increments.
    collected_sequences = []
    found = 0
    no = []

    fasta_files = utilities.get_files(args.files, args.in_dir, ['faa', 'fa', 'ffn', 'fasta'])

    for fasta_file in fasta_files:
        main(fasta_file.file_path, args.list, args.column)
        # Running tally, rewritten in place on stderr after each file.
        print(f'{colors.bcolors.GREEN}Found = {found}{colors.bcolors.END}, {colors.bcolors.RED}NOT Found = {len(no)}{colors.bcolors.END}', end="\r", file=sys.stderr)

    # main() records an identifier as missing once for every fasta file that
    # lacks it. With several inputs that marks identifiers as missing even
    # though another file supplied them, and repeats the truly missing ones.
    # Keep each identifier once (first-seen order) and only if no collected
    # record carries it.
    found_ids = {record.id for record in collected_sequences}
    no = [missing for missing in dict.fromkeys(no) if missing not in found_ids]

    print(f'\nWriting results to {args.out}')
    SeqIO.write(collected_sequences, args.out, "fasta")

    if no:
        print(f'Done {colors.bcolors.RED}(with errors){colors.bcolors.END}')
    else:
        print(f'{colors.bcolors.GREEN}Done!{colors.bcolors.END}')

    # Unless --quiet was given, list every identifier that was never found.
    if not args.quiet:
        for missing in no:
            print(f'{colors.bcolors.RED}Not Found: {missing}{colors.bcolors.END}')