-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster.py
103 lines (70 loc) · 2.43 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import numpy as np
import pandas as pd
import sklearn.cluster as sc
import seaborn as sns
import matplotlib.pyplot as plt
## project settings ===
from os.path import join
WORKDIR = '/home/sergio/Res_CIML/TLX3_project'
SCRIPTS = join(WORKDIR,'scripts')
DATADIR = join(WORKDIR,'data')
## =====================
def log2p1(x):
return np.log2(x + 1)
# test
# === Load expression table
tbl = pd.read_table(join(DATADIR,'tracks', 'TLX3vsRAG-results_genes.txt'), index_col=0)
# Filter genes (Note: this filter remove microRNA expression)
tbl = tbl[(tbl.padj < 0.05)].dropna()
# === Load gene names
names = pd.read_table(join(DATADIR,"tracks/annot_tracks/references/mm9/mm9_EnsemblTransc_GeneNames.txt"),
index_col=0,
header=0,
names=['GeneID', 'TransID', 'Gene_name'])
names = names.drop('TransID', axis=1).drop_duplicates()
names = names.loc[tbl.index]
assert names.shape[0] == tbl.shape[0]
tbl=names.join(tbl, how ='right')
tbl=tbl.sort_values('log2FoldChange', axis=0, ascending=False)
tbn = tbl[['Gene_name', 'R2.RAG1W.RAG1','RAGS.RAGZ','RAGZ','TLX3.1_1','TLX3.1_5','TLX3.1_P']]
tbn.set_index(keys=tbn.columns[0], inplace=True)
df = tbn.head(50)
mat = df.as_matrix()
#===Test========
#f, ax2 = plt.subplots(figsize=(9, 6))
#fig=plt.figure()
#fig = plt.figure(figsize=(6, 11)) #11
#ax2 = fig.add_axes()
#sns.clustermap(log2p1(df), cmap='RdBu_r', col_cluster=False, ax = ax2)
sns.clustermap(log2p1(df), cmap='RdBu_r', col_cluster=False, figsize=(6, 12))
fig = plt.gcf()
ax2 = fig.axes[2]
for txt in ax2.get_yticklabels():
txt.set_rotation(0)
for txt in ax2.get_xticklabels():
txt.set_rotation(90)
#===============
cl = 6
km = sc.KMeans(n_clusters=cl)
km.fit(mat)
labels = km.labels_
results = pd.DataFrame(data=labels, columns=['cluster'], index=df.index)
dfc = pd.concat([log2p1(df), results], axis=1)
dfs = dfc.sort_values('cluster', axis=0, ascending=True)
k = log2p1(mat.max())/(cl-1)
dfs['cluster'] = k*dfs['cluster']
# ==== Figures
ttl = 'Cluster test'
fig1 = plt.figure(figsize=(6, 11)) #11
#ax1 = fig1.add_subplot(111)
ax1 = sns.heatmap(dfs,
cmap='RdBu_r',
linewidths=0.000)
ax1.set_title('diffExpressed')
for txt in ax1.get_yticklabels():
txt.set_rotation(0)
for txt in ax1.get_xticklabels():
txt.set_rotation(90)
fig1.suptitle(ttl, size='x-large')
#sns.heatmap(dfs, cmap='RdBu_r')
plt.show()