forked from TorHou/merge_pcr_duplicates
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetch_DNA_sequence.py
executable file
·91 lines (76 loc) · 2.6 KB
/
fetch_DNA_sequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import argparse
import pysam
import pandas
import logging
from Bio.Seq import Seq
####################
## ARGS INPUT ##
####################
tool_description = """
The tool takes in a bed6 or interval file and returns
sequences in as fasta.
By default output is written to source file location.
Example usage:
fetch_DNA_sequence.py interval-file -o output.file
"""
# parse command line arguments
parser = argparse.ArgumentParser(description=tool_description,
formatter_class=argparse.RawDescriptionHelpFormatter)
# positional arguments
parser.add_argument(
"interval_file",
help="Path to bed/interval file.")
parser.add_argument(
"genome_file",
help="Path to genome file (fasta).")
# optional arguments
parser.add_argument(
"-o", "--outfile",
help="Write results to this file.")
parser.add_argument(
"-d", "--debug",
help="Print lots of debugging information",
action="store_true")
args = parser.parse_args()
if args.debug:
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
else:
logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
logging.info("Parsed arguments:")
logging.info(" interval_file: '{}'".format(args.interval_file))
if args.outfile:
logging.info(" outfile: enabled writing to file")
logging.info(" outfile: '{}'".format(args.outfile))
logging.info("")
###################
## READ DATA ##
###################
print("[START]")
print("[NOTE] Read data")
# your interval data
file = args.interval_file
cl_regions = pandas.read_table(file, sep='\t', names=['chrom', 'start', 'stop', 'name', 'score', 'strand'])
# link to the reference genome in fasta format
fastafile = pysam.Fastafile(args.genome_file)
print("[NOTE] finish")
#####################
## WRITE OUTPUT ##
#####################
print("[NOTE] Write output")
# check if outputfile was defined
outfile_name = ""
if args.outfile:
outfile_name = args.outfile
else:
outfile_name = "test-data/sequences.fa"
sequence_file = open(outfile_name, 'w')
# get sequence for coordinates
for i in range(0,len(cl_regions)):
sequence_file.write(">{}_{}_{}_{}_{}\n".format(cl_regions['chrom'][i], str(cl_regions['start'][i]), str(cl_regions['stop'][i]), cl_regions['name'][i], cl_regions['strand'][i]))
seq_str = fastafile.fetch(str(cl_regions['chrom'][i]), int(cl_regions['start'][i]), int(cl_regions['stop'][i]))
seq = Seq(seq_str)
if(cl_regions['strand'][i] == "-"):
seq = seq.reverse_complement()
sequence_file.write(str(seq) + "\n")
sequence_file.close()
print("[FINISH]")