-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompute_total_score.R
495 lines (353 loc) · 22.3 KB
/
compute_total_score.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
# Adam Berman
# Princeton University
# 5 May 2017
setwd("/Users/adamberman/metabolite-scorer")
library(futile.matrix)
library(reshape2)
library(KEGGREST)
library(plyr)
library(ROCR)
#----------Compute mutational score----------------------------------------------------------------
#----------Compute mutational score----------------------------------------------------------------
#----------Compute mutational score----------------------------------------------------------------
# Read in metabolites and their associated genes
# The 'kegg_metabolite_table.txt' file is created by pipelining the output of
# make_hmdb_metabolites.py into a text file, where make_hmdb_metabolites.py
# (as well as kegg_metabolite_table.txt itself) as can be found at: https://github.com/agberman/metabolite-scorer.git
metabolites <- read.table('kegg_metabolite_table.txt', header = TRUE, sep = '\t', quote = "", comment.char = "")
# Read in genes and their gene lengths
# The amino_acid_gene_lengths file can be found in the Github repository at
# the following link: https://github.com/agberman/metabolite-scorer.git
gene_lengths <- read.table('amino_acid_gene_lengths', header = TRUE, sep = '\t', quote = "", comment.char = "")
# Read in raw TCGA data
# BRCA_US_DATA.csv can be downloaded in the Github repo above, or at the
# following link: https://dcc.icgc.org/api/v1/download?fn=/current/Projects/BRCA-US/donor.BRCA-US.tsv.gz
samples <- read.csv('BRCA_US_DATA.csv')
# Sum together total number of mutations to each gene
gene_scores <- aggregate(samples$X..mutations.at.gene, by=list(gene_symbol=samples$gene_symbol), FUN=sum)
# Remove empty string from top of gene_scores and rename columns
gene_scores = gene_scores[-1,]
colnames(gene_scores) <- c('gene_symbol', 'total_mutations')
# Match genes to gene lengths
gene_scores$gene_length <- gene_lengths$LENGTH[match(gene_scores$gene_symbol, gene_lengths$Hugo_Symbol)]
# Remove genes with no gene length
gene_scores <- na.omit(gene_scores)
# Compute mutational gene subscores by dividing total mutations by gene length
gene_scores$mutational_gene_subscore <- (gene_scores$total_mutations / gene_scores$gene_length)
# Sort mutational gene subscores
gene_scores_sorted <- gene_scores[with(gene_scores, order(-mutational_gene_subscore)), ]
# Plot a histogram of mutational gene subscores
hist(gene_scores$mutational_gene_subscore, xlab = 'Mutational Gene Subscore', main = 'Distribution of Mutational Gene Subscores')
# Match mutational gene subscores to genes in metabolite list
metabolites$mutational_gene_subscore <- gene_scores$mutational_gene_subscore[match(metabolites$Gene.Name,
gene_scores$gene_symbol)]
# Remove rows where gene has no mutational gene subscore
metabolites <- na.omit(metabolites)
# Sort metabolites by decreasing mutational gene subscore
#metabolites_sorted <- metabolites[with(metabolites, order(-mutational_gene_subscore)), ]
# Find average of mutational gene subscores associated with each metabolite
metabolites_scored <- aggregate(mutational_gene_subscore~Metabolite, metabolites, FUN=mean)
# CLEAN UP: Rename columns and return KEGG IDs to each metabolite after aggregation, then reorder columns
colnames(metabolites_scored) <- c('metabolite', 'mutational_score')
metabolites_scored$KEGG_ID <- metabolites$KEGG[match(metabolites_scored$metabolite, metabolites$Metabolite)]
metabolites_scored <- metabolites_scored[,c(1,3,2)]
# Normalize mutational scores to be between 0 and 1
metabolites_scored$normalized_mutational_score <- ((metabolites_scored$mutational_score - min(metabolites_scored$mutational_score)) / (max(metabolites_scored$mutational_score) - min(metabolites_scored$mutational_score)))
# Sort metabolites by descending mutational score
metabolites_scored_sorted <- metabolites_scored[with(metabolites_scored, order(-mutational_score)), ]
# Copy results into new data frames
mutational_results <- metabolites_scored
mutational_results_sorted <- metabolites_scored_sorted
# Change column names
colnames(mutational_results) <- c("Metabolite", "KEGG_ID", "Mutation_Score", "Normalized_Score")
colnames(mutational_results_sorted) <- c("Metabolite", "KEGG_ID", "Mutation_Score", "Normalized_Score")
# Generate a histogram of the mutation scores
hist(mutational_results$normalized_mutational_score, xlab = 'Normalized Mutational Score', main = 'Distribution of Normalized Mutational Scores')
#----------Compute expressional score----------------------------------------------------------------
#----------Compute expressional score----------------------------------------------------------------
#----------Compute expressional score----------------------------------------------------------------
# Read raw data into a data frame, storing only the gene name, the expression value,
# and the indicator of whether a sample is from cancer tissue or healthy tissue
# "exp_array.BRCA-US.tsv", the raw expressional data, can be downloaded at the
# following link: https://dcc.icgc.org/api/v1/download?fn=/current/Projects/BRCA-US/exp_array.BRCA-US.tsv.gz
df = read.delim("exp_array.BRCA-US.tsv", colClasses = c("NULL","NULL","NULL","NULL","NULL","NULL","NULL",NA,NA,
"NULL","NULL","NULL","NULL","NULL","NULL",NA,"NULL"))
# Convert raw data into a matrix with one sample per row and one gene per column, where the value
# at a row-column intersection is the expression value of the column's gene in the row's sample
matrix <- acast(df, raw_data_accession ~ gene_id, value.var="normalized_expression_value")
row_names <- rownames(matrix)
abbrev_row_names <- data.frame(strsplit(row_names, "-"))[4,]
abbrev_row_names <- t(abbrev_row_names)
abbrev_row_names <- as.character(as.vector(abbrev_row_names[,1]))
# 0 indicates a cancer sample (row), 1 indicates a healthy sample (row)
bin_row_names <- as.numeric(substring(abbrev_row_names, 1, 1))
matrix_copy <- matrix
matrix_copy <- cbind(matrix_copy, bin_row_names)
# 1. Split matrix into cancer and healthy submatrices
cancer_matrix <- matrix_copy[matrix_copy[,17815] == 0, c(1:17814)]
healthy_matrix <- matrix_copy[matrix_copy[,17815] == 1, c(1:17814)]
# 2. Run t-test between each corresponding pair of rows in the two submatrices
# Define a function to run your t.test, grab the p value, and put them in a data.frame
f <- function(x,y){
test <- t.test(x, y)
#out <- data.frame(pval = test$p.value)
out <- test$p.value
return(out)
}
# Iterate over columns in cancer_matrix and healthy_matrix via sapply
p_vals <- sapply(seq(ncol(cancer_matrix)), function(x) f(cancer_matrix[,x], healthy_matrix[,x]))
# Adjust p-values for multiple hypothesis testing
p_vals <- p.adjust(p_vals, method = "bonferroni")
# Binarize p-values according to the threshold of 0.01
binarized_p_vals <- as.integer(p_vals < 0.01)
# Histogram of p_vals
hist(p_vals)
hist(binarized_p_vals)
bpv <- data.frame(colnames(matrix), binarized_p_vals)
colnames(bpv) <- c("gene_symbol", "binarized_p_val")
# Import metabolite data
metabolites <- read.table('kegg_metabolite_table.txt', header = TRUE, sep = '\t', quote = "", comment.char = "")
# Append binarized p-value of each gene in metabolites to metabolites
metabolites$binarized_p_val <- bpv$binarized_p_val[match(metabolites$Gene.Name, bpv$gene_symbol)]
# Remove rows with no binarized p-value
metabolites <- na.omit(metabolites)
# Count how many significantly mutated genes are associated with each metabolite
metabolites_with_bpv_sum <- aggregate(metabolites$binarized_p_val, by=list(metabolites$KEGG, metabolites$Metabolite), FUN=sum)
colnames(metabolites_with_bpv_sum) <- c("KEGG", "Metabolite", "bpv_sum")
# Count the number of genes associated with each metabolite
metabolites_with_alphas <- aggregate(cbind(num_genes = Gene.Name) ~ Metabolite,
data = metabolites,
FUN = function(x){NROW(x)})
# nrow(metabolites_with_alphas[which(metabolites_with_alphas$num_genes == 1),])
# Create a metabolite-per-row data frame associating BPV totals with number of genes associated with each metabolite
expressional_scores <- data.frame(metabolites_with_bpv_sum$Metabolite, metabolites_with_bpv_sum$KEGG, metabolites_with_bpv_sum$bpv_sum, metabolites_with_alphas$num_genes)
colnames(expressional_scores) <- c("Metabolite", "KEGG", "bpv_sum", "num_genes")
# Compute expressional scores
expressional_scores$Expression_Score <- expressional_scores$bpv_sum / expressional_scores$num_genes
# Normalize expressional scores to be between 0 and 1
expressional_scores$Normalized_Score <- ((expressional_scores$Expression_Score - min(expressional_scores$Expression_Score)) / (max(expressional_scores$Expression_Score) - min(expressional_scores$Expression_Score)))
# Remove metabolites with fewer than 4 associated genes
expressional_scores_trimmed <- expressional_scores[expressional_scores$num_genes >= 4, ]
# Re-normalize
expressional_scores_trimmed$Normalized_Score <- ((expressional_scores_trimmed$Expression_Score - min(expressional_scores_trimmed$Expression_Score)) / (max(expressional_scores_trimmed$Expression_Score) - min(expressional_scores_trimmed$Expression_Score)))
# Sort metabolites by expressional scores
expressional_scores_sorted <- expressional_scores_trimmed[with(expressional_scores_trimmed, order(-Expression_Score, -bpv_sum)), ]
# Copy results into new data frames
expressional_results <- expressional_scores_trimmed
expressional_results_sorted <- expressional_scores_sorted
# Modify column names
colnames(expressional_results) <- c("Metabolite", "KEGG_ID", "bpv_sum", "num_genes", "Expression_Score", "Normalized_Score")
colnames(expressional_results_sorted) <- c("Metabolite", "KEGG_ID", "bpv_sum", "num_genes", "Expression_Score", "Normalized_Score")
# Generate a histogram of the mutation scores
hist(expressional_results$Normalized_Score, xlab = 'Normalized Expressional Score', main = 'Distribution of Normalized Expressional Scores')
#----------Retrieve KEGG data----------------------------------------------------------------
#----------Retrieve KEGG data----------------------------------------------------------------
#----------Retrieve KEGG data----------------------------------------------------------------
# Read in raw KEGG data: all metabolic pathways found in humans ("hsa")
paths <- names(keggList("pathway", "hsa"))
# Initialize vectors to hold pathways and their constituent
# metabolites (paths_with_metabolites) and metabolite names (names)
paths_with_metabolites <- vector(mode="list", length=length(paths))
names <- vector(mode="character", length=length(paths))
# Initialize lists to hold metabolites found in all pathways (met_pool)
# and metablites found only in cancer-linked pathways (cancer_pool)
met_pool <- character()
cancer_pool <- character()
# Iterate over all pathways
for (i in 1:length(paths)){
print(paste("i:", i))
# Parse out all metabolites in given pathway
dat <- keggGet(paths[i])
names[i] <- dat[[1]]["NAME"][[1]]
metabolites <- names(dat[[1]]["COMPOUND"][[1]])
# Add all metabolites to met_pool
met_pool <- c(met_pool, metabolites)
# Add metabolites to cancer_pool if pathway is cancer-linked
if ((i >= 279) && (i <= 301)) {
cancer_pool <- c(cancer_pool, metabolites)
}
# Update paths_with_metabolites with new pathway
paths_with_metabolites[[i]] <- metabolites
}
# Associate names with the stored pathways
pwp <- paths_with_metabolites
names(pwp) <- names
# Remove repeated metabolites in cancer_pool
cancer_pool <- unique(cancer_pool)
#----------Compute total scores at all alphas from 0 to 1 by 0.05----------------------------------------------------------------
#----------Compute total scores at all alphas from 0 to 1 by 0.05----------------------------------------------------------------
#----------Compute total scores at all alphas from 0 to 1 by 0.05----------------------------------------------------------------
#----------total_score = (mutation_score*alpha) + (expression_score*(1-alpha))---------------------------------------------------
# Remove all metabolites without a KEDD ID
mutation_scores <- mutational_results[mutational_results$KEGG_ID != "None", ]
expression_scores <- expressional_results[expressional_results$KEGG_ID != "None", ]
# Remove all metabolites that are not present in both the mutational and expressional results
# (we trimmed metabolites with fewer than four associated metabolites in the expressional list)
mutation_scores$expressional_score <- expression_scores$Expression_Score[match(mutation_scores$Metabolite,
expression_scores$Metabolite)]
mutation_scores <- na.omit(mutation_scores)
mutation_scores <- mutation_scores[, -5]
# Remove all metabolites not present in KEGG pathways
mutation_scores <- mutation_scores[mutation_scores$KEGG_ID %in% met_pool, ]
expression_scores <- expression_scores[expression_scores$KEGG_ID %in% met_pool, ]
# Remove last three rows of mutation and expression tables (513 rows -> 510 rows)
mutation_scores <- head(mutation_scores, -3)
expression_scores <- head(expression_scores, -3)
# Re-normalize scores
mutation_scores$Normalized_Score <- ((mutation_scores$Mutation_Score - min(mutation_scores$Mutation_Score)) / (max(mutation_scores$Mutation_Score) - min(mutation_scores$Mutation_Score)))
expression_scores$Normalized_Score <- ((expression_scores$Expression_Score - min(expression_scores$Expression_Score)) / (max(expression_scores$Expression_Score) - min(expression_scores$Expression_Score)))
# Verify that mutation_scores and expression_scores align perfectly
all(mutation_scores$Metabolite == expression_scores$Metabolite)
# Create matrix to store total scores
#alphas <- c(0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1)
# Create sequence of all alphas from 0 to 1 by 0.01
alphas <- seq(from = 0, to = 1, by = 0.01)
# Create matrix for storing overall scores at all alpha values
total_scores <- matrix(data = NA, nrow = nrow(mutation_scores), ncol = length(alphas))
rownames(total_scores) <- mutation_scores$KEGG_ID
colnames(total_scores) <- alphas
# Calculate overall scores based on (mutation_score*alpha) + (expression_score*(1-alpha))
# for all alphas 0-1 by 0.01 using normalized scores
for(column in 1:ncol(total_scores)){
alpha <- as.numeric(colnames(total_scores)[column])
total_scores[, column] <- ((mutation_scores$Normalized_Score * alpha)
+ (expression_scores$Normalized_Score * (1 - alpha)))
}
#----------Perform machine learning to identify best alpha value----------------------------------------------
#----------Perform machine learning to identify best alpha value----------------------------------------------
#----------Perform machine learning to identify best alpha value----------------------------------------------
# Initialize list to store, for each metabolite, a 1 if the metabolite
# is present in a KEGG cancer-linked pathway, or a 0 if not
in_cancer_pool <- c()
# Iterate through all metabolites in the overall score matrix
for (i in 1:length(rownames(total_scores))) {
if (rownames(total_scores)[i] %in% cancer_pool) {
# Mark a 1 in the list if the metabolite appears in a KEGG cancer-linked pathway
in_cancer_pool = c(in_cancer_pool, 1)
} else {
# Mark a 0 in the list if the metabolite does not appear in a KEGG cancer-linked pathway
in_cancer_pool = c(in_cancer_pool, 0)
}
}
# Add the list of 1's and 0's for each metabolite to the overall score matrix
total_scores <- cbind(total_scores, in_cancer_pool)
# Randomly permute data row-wise
total_scores_random <- total_scores[sample(nrow(total_scores)),]
# Split random data into 5 training sets and 5 hold out sets
set_size <- (nrow(total_scores_random) / 5)
set1 <- total_scores_random[(set_size+1):nrow(total_scores_random), ]
set1_holdout <- total_scores_random[1:set_size, ]
set2 <- total_scores_random[1:set_size, ]
set2 <- rbind(set2, total_scores_random[((2*set_size)+1):nrow(total_scores_random), ])
set2_holdout <- total_scores_random[(set_size+1):(2*set_size), ]
set3 <- total_scores_random[1:(2*set_size), ]
set3 <- rbind(set3, total_scores_random[((3*set_size)+1):nrow(total_scores_random), ])
set3_holdout <- total_scores_random[((2*set_size)+1):(3*set_size), ]
set4 <- total_scores_random[1:(3*set_size), ]
set4 <- rbind(set4, total_scores_random[((4*set_size)+1):nrow(total_scores_random), ])
set4_holdout <- total_scores_random[((3*set_size)+1):(4*set_size), ]
set5 <- total_scores_random[1:(4*set_size), ]
set5_holdout <- total_scores_random[((4*set_size)+1):nrow(total_scores_random), ]
sets <- list(set1, set2, set3, set4, set5)
holdout_sets <- list(set1_holdout, set2_holdout, set3_holdout, set4_holdout, set5_holdout)
# Run machine learning simulation on all training set-holdout set pairs,
# computing a maximum alpha and corresponding maximum holdout AUC for each
alpha_maxes <- c()
final_aucs <- c()
i = 0
for (set in sets) {
i = i + 1
auc_vals <- matrix(data = NA, nrow = 1, ncol = length(alphas))
rownames(auc_vals) <- c("AUC_Values")
colnames(auc_vals) <- alphas
for(column in 1:(ncol(total_scores_random)-1)) {
prediction.obj <- prediction(set[, column], set[, "in_cancer_pool"])
perf <- performance(prediction.obj, measure = 'auc')
auc <- perf@y.values[[1]]
auc_vals[1, column] <- auc
}
alpha_max <- colnames(auc_vals)[apply(auc_vals,1,which.max)]
alpha_maxes <- c(alpha_maxes, as.numeric(alpha_max))
holdout_pred <- prediction(holdout_sets[[i]][, alpha_max], holdout_sets[[i]][, "in_cancer_pool"])
holdout_perf <- performance(holdout_pred, measure = 'auc')
holdout_auc <- holdout_perf@y.values[[1]]
final_aucs <- c(final_aucs, holdout_auc)
}
# Set final alphas to be those found in my writeup (due to the random nature
# of the set divisions, alpha values will vary each time the code is run)
# final_alpha <- mean(c(1.0, 1.0, 0.13, 1.0, 1.0))
# Compute average of alpha maxes (i.e. the optimal alpha)
final_alpha <- mean(alpha_maxes)
# Compute the average of the final AUC values
final_auc <- mean(final_aucs)
# Create a box-and-whiskers plot of the final AUC values
boxplot(final_aucs, main="Final Area Under ROC Curve (AUC) Values")
#----------Compute total scores at best alpha for all metabolites (even those without a KEGG ID)---------------------------------
#----------Compute total scores at best alpha for all metabolites (even those without a KEGG ID)---------------------------------
#----------Compute total scores at best alpha for all metabolites (even those without a KEGG ID)---------------------------------
#----------total_score = (mutation_score*alpha) + (expression_score*(1-alpha))---------------------------------------------------
# Remove all metabolites that are not present in both the mutational and expressional results
# (we trimmed metabolites with fewer than four associated metabolites in the expressional list)
mutational_results$expressional_score <- expressional_results$Expression_Score[match(mutational_results$Metabolite, expressional_results$Metabolite)]
mutational_results_trimmed <- na.omit(mutational_results)
mutational_results_trimmed <- mutational_results_trimmed[, -5]
# Raw (unweighted) sum of normalized mutation and expression scores
total_results <- data.frame(Metabolite = mutational_results_trimmed$Metabolite,
KEGG_ID = mutational_results_trimmed$KEGG_ID,
Normalized_Mutation_Score = mutational_results_trimmed$Normalized_Score,
Normalized_Expression_Score = expressional_results$Normalized_Score,
Total_Score = (((mutational_results_trimmed$Normalized_Score)*final_alpha) + ((expressional_results$Normalized_Score)*(1 - final_alpha))))
# Compute normalized total score
total_results$Normalized_Total_Score <- ((total_results$Total_Score - min(total_results$Total_Score)) / (max(total_results$Total_Score) - min(total_results$Total_Score)))
# Sort by total scores
total_results_sorted <- total_results[with(total_results, order(-Total_Score)), ]
# Print the 20 metabolites with highest normalized
# overall scores at the optimal alpha value
head(total_results_sorted, 20)
# Generate a histogram of normalized total scores, computed using the
# best alpha value as computed by ROC machine learning techniques
hist(total_results$Normalized_Total_Score, xlab = 'Normalized Overall Score', main = 'Distribution of NormalizedOverall Scores at Alpha=0.01')
#----------Compute final p-values using Hypergeometric test----------------------------------------------------------------
#----------Compute final p-values using Hypergeometric test----------------------------------------------------------------
#----------Compute final p-values using Hypergeometric test----------------------------------------------------------------
# This test is optional
'''
# Random sampling function
sample_pval <- function(sample_size, threshold, num_samples, met_pool, cancer_pool) {
samples <- replicate(num_samples, sample(met_pool, sample_size, replace = TRUE))
num_ge_threshold = 0
for (i in 1:num_samples) {
ge_threshold_count <- length(intersect(cancer_pool, samples[,i]))
if (ge_threshold_count >= threshold) {
num_ge_threshold <- num_ge_threshold + 1
}
}
return(num_ge_threshold/num_samples)
}
# Define quantiles to be tested
quantiles <- c(1, 5, 10, 15, 20)
#alphas <- as.numeric(colnames(total_scores))
alphas <- seq(from = 0, to = 1, by = 0.01)
# Initialize data structure to hold all final p-values
final_pvals <- matrix(data = NA, nrow = length(alphas), ncol = length(quantiles))
rownames(final_pvals) <- alphas
colnames(final_pvals) <- quantiles
# Calculate all p-values (one per alpha per quantile)
for (a in 1:length(alphas)) {
alph <- total_scores[,a]
for (q in 1:length(quantiles)) {
# Get metabolites with score in desired quantile
cuttoff <- quantile(alph, prob=1-quantiles[q]/100)
in_quantile <- names(alph[alph >= cuttoff])
sample_size <- length(in_quantile)
# Find out how many metabolites in quantile are in KEGG cancer pool
threshold <- length(intersect(cancer_pool, in_quantile))
# Calculate p-value via sampling and add it to final table
num_samples <- 1000
final_pvals[a,q] <- sample_pval(sample_size, threshold, num_samples, met_pool, cancer_pool)
}
}
final_pvals_df <- data.frame(final_pvals)
final_pvals_df$avg <- rowMeans(final_pvals_df)
final_pvals_df[final_pvals_df$avg == min(final_pvals_df$avg)]
'''