-
Notifications
You must be signed in to change notification settings - Fork 3
/
hypergeometric_GO.py
114 lines (92 loc) · 3.84 KB
/
hypergeometric_GO.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# hypergeometric test for the top 10 hits gene sets and the true gene set
import pandas as pd
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
from tqdm import tqdm
import os
import numpy as np
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--input_file', type=str, help='input file path')
parser.add_argument('--topn', type=int, help='collect top n hits names and sim scores')
parser.add_argument('--output_file', type=str, help='output file path')
def calc_JI(set1, set2):
return len(set1.intersection(set2)) / len(set1.union(set2))
args = parser.parse_args()
input_file = args.input_file
output_file = args.output_file
df = pd.read_csv(input_file, sep='\t')
df.set_index('GO', inplace=True)
all_go = pd.read_csv('data/go_terms.csv', index_col=0)
# set the top n hits
n_hit = args.topn
# placeholders for p-values
df['pvals'] = None
df['adj_pvals'] = None
df['random_pvals'] = None
df['random_adj_pvals'] = None
df['JI'] = 0
df['random JI'] = 0
M = len(set(all_go['Genes'].str.split(' ').sum())) # total number of genes in all GO terms
all_pvals_dict = {}
pvals_index = {}
rand_pvals = []
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
current_gene_set = row['Genes'].split(' ')
# print(current_gene_set)
top_n_hits = row[f'top_{n_hit}_hits'].split('|')
# print(len(top_n_hits))
pvals_list = []
ji_list = []
for j, term in enumerate(top_n_hits):
genes = all_go[all_go['Term_Description'] == term]['Genes'].tolist()[0].split(' ')
# Calculate Jaccard Index
ji = calc_JI(set(current_gene_set), set(genes))
ji_list.append(ji)
# print(genes)
x = len(set(current_gene_set).intersection(set(genes))) # number of genes in the intersection
n = len(set(current_gene_set)) # total number of genes in the true gene set
N = len(set(genes)) # total number of genes in this query gene set
# hypergeometric test of the gene set
pval = hypergeom.sf(x-1, M, n, N)
pvals_list.append(pval)
# Store the p-values by j
if j in all_pvals_dict:
all_pvals_dict[j].append(pval)
else:
all_pvals_dict[j] = [pval]
# check nan in p
if np.isnan(pval):
print(f"NaN p-value for M={M}, n={n}, N={N}, x={x}")
print(i, j, term)
# Store the position of each p-value by j for later reference
if j in pvals_index:
pvals_index[j].append((i, j))
else:
pvals_index[j] = [(i, j)]
# hypergeomatric test for the random
random_go_term = row['random_GO_name']
# get the genes in the random GO term
random_row_genes = all_go.loc[all_go['Term_Description']==random_go_term, 'Genes'].values[0].split(' ')
random_JI = calc_JI(set(current_gene_set), set(random_row_genes))
x_rand = len(set(current_gene_set).intersection(set(random_row_genes)))
N_rand = len(random_row_genes)
p_val_rand = hypergeom.sf(x_rand-1, M, n, N_rand)
rand_pvals.append(p_val_rand)
df.loc[i, 'random_pvals'] = p_val_rand
df.loc[i, 'random JI'] = random_JI
df.loc[i, 'pvals'] = '|'.join(str(pval) for pval in pvals_list)
df.loc[i, 'JI'] = '|'.join(str(ji) for ji in ji_list)
# Adjust the p-values for each j separately
for j in all_pvals_dict:
adj_pvals = multipletests(np.array(all_pvals_dict[j]), method='fdr_bh')[1]
# print(adj_pvals)
# Assign adjusted p-values back to the data frame
for adj_pval, (i, _) in zip(adj_pvals, pvals_index[j]):
if df.loc[i, 'adj_pvals']:
df.loc[i, 'adj_pvals'] += '|' + str(adj_pval)
else:
df.loc[i, 'adj_pvals'] = str(adj_pval)
rand_adj_pvals = multipletests(np.array(rand_pvals), method='fdr_bh')[1]
df['random_adj_pvals'] = rand_adj_pvals
df.to_csv(output_file, sep='\t', index=True)