-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexon_mapper.py
56 lines (46 loc) · 1.67 KB
/
exon_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/python3
from dataclasses import replace
import os, argparse, subprocess
import numpy as np
import pandas as pd
from Bio import SeqIO
parser = argparse.ArgumentParser()
parser.add_argument("--outdir")
parser.add_argument("--gene")
parser.add_argument("--file_ending")
args = parser.parse_args()
outdir = args.outdir
gene = args.gene
file_ending = args.file_ending
#Creating output file ending
output_file_ending = file_ending.replace(".fasta", "_mapped.fasta" )
# import recovery statistics
df = pd.read_csv("../../03_hybpiper/seq_lengths.txt", sep="\t")
df = df.drop(0)
# for each aligned gene, identify the two samples with the highest recovery stats
#gene = fn.split("_")[1]
if gene in df.columns:
df_red = df[['Species',gene]].sort_values(gene, ascending=False)
df_red = df_red.reset_index()
# Find the correct gene in the fasta file and save it as exon1
sp = df_red["Species"][0]
with open("../../09_mapping/"+gene+"_aligned.fasta", "r") as fasta_file:
for record in SeqIO.parse(fasta_file, "fasta"):
if record.id.startswith("exon1"):
exon1 = record
break
exon1.id = "exon1"
# second sample
sp = df_red["Species"][1]
with open("../../09_mapping/"+gene+"_aligned.fasta", "r") as fasta_file:
for record in SeqIO.parse(fasta_file, "fasta"):
if record.id.startswith("exon2"):
exon2 = record
break
exon2.id = "exon2"
with open(gene+"_temp.fasta", "w") as output_handle:
SeqIO.write([exon1, exon2], output_handle, "fasta")
subprocess.call("mafft --keeplength --add "+gene+"_temp.fasta "+gene+file_ending+" > "+outdir+"/"+gene+output_file_ending,shell=True)
subprocess.call("rm "+gene+"_temp.fasta", shell=True)
else:
print(gene+" not there!!!!")