# 5Vb.HierarchicalClustering.py
# Forked from TatyanaV/Genome_Sequencing_Bioinformatics_II
'''
CODE CHALLENGE: Implement HierarchicalClustering.
Input: An integer n, followed by an n x n distance matrix.
Output: The result of applying HierarchicalClustering to this distance matrix (using Davg),
with each newly created cluster listed on each line.
Sample Input:
4
0.00 20.0 9.00 11.0
20.0 0.00 17.0 11.0
9.00 17.0 0.00 8.00
11.0 11.0 8.00 0.00
Second Sample Input (the Sample Output below corresponds to this 7 x 7 matrix):
7
0.00 0.74 0.85 0.54 0.83 0.92 0.89
0.74 0.00 1.59 1.35 1.20 1.48 1.55
0.85 1.59 0.00 0.63 1.13 0.69 0.73
0.54 1.35 0.63 0.00 0.66 0.43 0.88
0.83 1.20 1.13 0.66 0.00 0.72 0.55
0.92 1.48 0.69 0.43 0.72 0.00 0.80
0.89 1.55 0.73 0.88 0.55 0.80 0.00
https://github.com/chrisKingsley/Coursera_Bioinformatics_Algorithms2/blob/eb4c2e72e6b8e014baf5a213385edda089baf727/3_clustering/yeast.py
Sample Output:
4 6
5 7
3 4 6
1 2
5 7 3 4 6
1 2 5 7 3 4 6
https://github.com/chrisKingsley/Coursera_Bioinformatics_Algorithms2/blob/eb4c2e72e6b8e014baf5a213385edda089baf727/3_clustering/yeast.py
https://github.com/johnmerm/bioinfo/blob/e6a255a461ca83c58e56c83d4717bc13f8880c5d/src/main/java/bioinfo/yeast/hier_clustering.py
'''
from timeit import itertools
def dist(clust1, clust2, Data):
    """Return the average pairwise distance (Davg) between two clusters.

    Args:
        clust1: Sequence of row indices into ``Data``.
        clust2: Sequence of column indices into ``Data``.
        Data: Distance matrix, indexed as ``Data[i][j]``.

    Returns:
        The sum of ``Data[i][j]`` over every ``i`` in ``clust1`` and ``j``
        in ``clust2``, divided by ``len(clust1) * len(clust2)``.

    Raises:
        ZeroDivisionError: If either cluster is empty.
    """
    pair_count = len(clust1) * len(clust2)
    # Sum row by row (inner sum over clust2, outer sum over clust1).
    row_totals = (sum(Data[i][j] for j in clust2) for i in clust1)
    return sum(row_totals) / pair_count
def hier_clustering(Data, n):
    """Agglomerative clustering of ``n`` items using average distance.

    Starts with one singleton cluster per index ``0 .. n-1`` and repeatedly
    merges the pair of current clusters with the smallest ``dist`` value
    until one cluster remains. Each merge prints the members of the new
    cluster as 1-based indices separated by single spaces.

    Args:
        Data: Distance matrix, indexed as ``Data[i][j]``.
        n: Number of initial items (ids ``0 .. n-1``).

    Returns:
        A tuple ``(T, all_Clusters)``:
        ``T`` maps every cluster id to the ``[first, second]`` ids that were
        merged to create it (an empty list for the initial singletons);
        ``all_Clusters`` maps every cluster id ever created to the list of
        0-based member indices it contains.
    """
    next_id = n
    active = {leaf: [leaf] for leaf in range(n)}
    history = {leaf: [leaf] for leaf in range(n)}
    children = {leaf: [] for leaf in range(n)}

    def average_distance(pair):
        # Distance between the two currently active clusters named by `pair`.
        return dist(active[pair[0]], active[pair[1]], Data)

    while len(active) > 1:
        # Ordered pairs are scanned in key order; min keeps the first minimum.
        candidates = itertools.permutations(active.keys(), 2)
        first, second = min(candidates, key=average_distance)

        # Remove both parents, then register the merged cluster under a new id.
        merged = active.pop(first) + active.pop(second)
        active[next_id] = merged
        children[next_id] = [first, second]
        history[next_id] = list(merged)

        print(' '.join(str(member + 1) for member in merged))
        next_id += 1

    return children, history
def test_hier_clustering():
    """Run hier_clustering on the built-in 7 x 7 sample distance matrix."""
    sample_matrix = [
        [0.00, 0.74, 0.85, 0.54, 0.83, 0.92, 0.89],
        [0.74, 0.00, 1.59, 1.35, 1.20, 1.48, 1.55],
        [0.85, 1.59, 0.00, 0.63, 1.13, 0.69, 0.73],
        [0.54, 1.35, 0.63, 0.00, 0.66, 0.43, 0.88],
        [0.83, 1.20, 1.13, 0.66, 0.00, 0.72, 0.55],
        [0.92, 1.48, 0.69, 0.43, 0.72, 0.00, 0.80],
        [0.89, 1.55, 0.73, 0.88, 0.55, 0.80, 0.00],
    ]
    # The matrix is square, so its row count is the number of items (7).
    item_count = len(sample_matrix)
    hier_clustering(sample_matrix, item_count)
def exam_hier_clustering(path='distMat.txt'):
    """Read a distance matrix from a text file and cluster it.

    The first non-blank line of the file holds the integer ``n``; every
    following non-blank line holds one whitespace-separated row of the
    distance matrix. The parsed matrix is passed to ``hier_clustering``.

    Args:
        path: File to read. Defaults to ``'distMat.txt'``.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If the file contains no non-blank lines.
        ValueError: If the first line is not an integer or a matrix entry
            is not a number.
    """
    # The context manager closes the file even if parsing fails below.
    with open(path) as handle:
        stripped = [line.strip() for line in handle]

    # Ignore blank lines (e.g. a trailing newline) rather than failing on float('').
    lines = [line for line in stripped if line]

    n = int(lines[0])
    # split() with no argument tolerates tabs and runs of spaces between values.
    Data = [[float(value) for value in line.split()] for line in lines[1:]]
    hier_clustering(Data, n)
# Script entry point: run the file-based clustering only when executed directly.
if __name__ == "__main__":
    exam_hier_clustering()