# OverlapGraph.py
# Forked from TatyanaV/Genome_Sequencing_Bioinformatics_II.
from builtins import dict, open
import sys
def build_prefix_graph(kmers):
    """
    Group k-mers by their prefix (the k-mer without its last character).

    :param kmers: iterable of k-mer strings
    :return: a dict of prefix -> list of the k-mers that start with that
        prefix, in input order (duplicates are kept)
    """
    by_prefix = {}
    for sequence in kmers:
        # setdefault creates the bucket the first time a prefix is seen.
        by_prefix.setdefault(sequence[:-1], []).append(sequence)
    return by_prefix
def find_overlaps(kmers):
    """
    Find, for every k-mer, the k-mers whose prefix equals its suffix.

    :param kmers: list of kmers
    :return: a dict of kmer -> list-of-overlaps-for-kmer; k-mers without
        any overlap are omitted, and a k-mer that occurs several times in
        the input has its overlaps appended once per occurrence
    """
    # build dict of prefixes to kmers
    graph = build_prefix_graph(kmers)
    result = {}
    for kmer in kmers:
        overlaps = graph.get(kmer[1:])
        if overlaps is None:
            # No k-mer starts with this k-mer's suffix.
            continue
        if kmer in result:
            result[kmer].extend(overlaps)
        else:
            # Store a copy: keeping the graph's own list here would let the
            # extend() above mutate it, inflating the overlaps of every
            # other k-mer that shares the same suffix.
            result[kmer] = list(overlaps)
    return result
def main(argv=None):
    """
    Read k-mers (one per line) and write their overlap graph to a file.

    Each output line has the form "<kmer> -> <overlap> <overlap> ...".

    :param argv: the command line args; argv[1], if present, is the input
        path (default "kmers-2") and argv[2], if present, is the output
        path (default "overlaps.txt")
    :return: nothing
    """
    if argv is None:
        argv = sys.argv
    # Fall back to the historical hard-coded paths when no args are given.
    input_path = argv[1] if len(argv) > 1 else "kmers-2"
    output_path = argv[2] if len(argv) > 2 else "overlaps.txt"

    with open(input_path) as contents:
        # Strip the full line terminator ('\r\n' as well as '\n'); a stray
        # '\r' would become part of the k-mer and prevent any match.
        stripped = (line.rstrip("\r\n") for line in contents)
        # Skip blank lines: an empty k-mer would overlap with itself and
        # produce a bogus " -> " line in the output.
        kmers = [kmer for kmer in stripped if kmer]

    result = find_overlaps(kmers)

    with open(output_path, "w") as text_file:
        for key, value in result.items():
            text_file.write(key + " -> " + " ".join(value) + "\n")
if __name__ == "__main__":
    # Script entry point: pass main()'s return value to sys.exit() as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)