-
Notifications
You must be signed in to change notification settings - Fork 1
/
find_ipg.py
49 lines (42 loc) · 1.47 KB
/
find_ipg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# -*- coding: utf-8 -*-
"""
Created on Fri May 24 11:01:40 2019
@author: Maria Pessoa Monteiro
"""
import csv
from Bio import Entrez
def find_ipg(protACC_list, ipgfilename="ipg.csv"):
"""
Find_ipg accepts a protein accession number list as an argument and uses
EFetch to retrieve identical proteins. A csv file, by default named
ipg.csv, is saved with the results.
The function returns a dictionary with the keys being the corresponding
DNA ANs and the values a tuple with the start position, stop position
and coding strand (plus = 2, minus = 1), in that order.
"""
protACC = " ".join(protACC_list)
ipg = Entrez.efetch(db="protein", id=protACC, rettype='ipg',
retmode='text')
fetch_reader = csv.reader(ipg, delimiter="\t")
with open(ipgfilename, "w", newline='') as out_handle:
writer = csv.writer(out_handle)
writer.writerows(fetch_reader)
with open(ipgfilename) as ipgfile:
reader = csv.reader(ipgfile)
rows = list(reader)
rows = rows[1:]
dna = []
strand = []
start = []
stop = []
for col in rows:
if len(col[2]) > 3:
dna.append(col[2])
start.append(col[3])
stop.append(col[4])
if col[5] == "+":
strand.append(1)
else:
strand.append(2)
values = zip(start, stop, strand)
return dict(zip(dna, values))