generated from PNNL-CompBio/p3
-
Notifications
You must be signed in to change notification settings - Fork 3
/
predicting_mutation_helper.R
106 lines (84 loc) · 2.93 KB
/
predicting_mutation_helper.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
library(glmnet)
library(groupdata2)
library(dplyr)
source("../util/synapseUtil.R")
source("../util/loading_data.R")
source("../util/mutational_analysis_helper.R")
source("../util/make_plots_util.R")
## This script loads all the data needed for the modeling scripts. We also
## normalize the train and test matrices for global, phospho and RNA data.
## This way they are ready for the follow-up analysis scripts.
## Open a session via synapseLogin() (presumably provided by the sourced
## util scripts -- confirm) and keep the returned handle in `syn`.
syn <- synapseLogin()
## The saved workspace image below is restored in place of the commented-out
## load.combined.data() call. Presumably it supplies global.data,
## phospho.data and RNA.data used further down -- confirm against the file.
# load.combined.data()
load(file = "../Misc/load.combined.data 3-09-2022.RData")
## Build a sample x gene mutation table (`mutation_mat`).
##
## Every sample found in global.data gets one placeholder mutation in the
## pseudo-gene "dummy". That way each sample survives the pivot below even if
## it has no real mutation passing the filters; the placeholder column is
## dropped again at the end.
all_samples <- global.data %>%
  pull(Barcode.ID) %>%
  unique()
dummy_mutation <- data.frame(Barcode.ID = all_samples,
                             Gene = "dummy")
mutation_df <- load_mutational_sample_data()
mutation_mat <- mutation_df %>%
  rbind(dummy_mutation) %>%
  unique() %>%
  ## Count the (de-duplicated) rows per gene so rare genes can be dropped.
  group_by(Gene) %>%
  mutate(total = n()) %>%
  ungroup() %>%
  ## Keep genes seen in more than 3 rows; NPM1 is excluded explicitly.
  filter(total > 3,
         Gene != "NPM1") %>%
  ## Marker value spread into the wide table; note these are the strings
  ## "TRUE"/"FALSE", not logicals.
  mutate(dummy = "TRUE") %>%
  ## `id_cols` must be passed by name: since tidyr 1.3.0 `...` directly
  ## follows `data`, and an unnamed second argument is only tolerated with a
  ## deprecation warning. `-total` keeps the helper count out of the row key.
  pivot_wider(id_cols = -total,
              names_from = "Gene",
              values_from = "dummy",
              values_fill = "FALSE") %>%
  ## Drop the column created by the placeholder "dummy" gene.
  select(-dummy) %>%
  as.data.frame()
## Keep only features that occur in an exact number of rows of the
## long-format tables: 210 for global and phospho, 159 for RNA.
## NOTE(review): these constants presumably equal the number of samples in
## each data set (i.e. "no missing sample") -- confirm against the data.

## Global: genes with exactly 210 rows.
global_features <- global.data %>%
  group_by(Gene) %>%
  tally(name = "total") %>%
  filter(total == 210) %>%
  pull(Gene)

## Phospho: sites with exactly 210 rows.
phospho_features <- phospho.data %>%
  group_by(SiteID) %>%
  tally(name = "total") %>%
  filter(total == 210) %>%
  pull(SiteID)

## RNA: genes with exactly 159 rows.
RNA_features <- RNA.data %>%
  group_by(Gene) %>%
  tally(name = "total") %>%
  filter(total == 159) %>%
  pull(Gene)
## Reshape each long-format table into a numeric matrix with one row per
## retained feature and one column per Barcode.ID. The first column of the
## widened table (expected to be `feature`) becomes the row names and is then
## removed before conversion to a matrix.

## Global
global_mat <- global.data %>%
  filter(Gene %in% global_features) %>%
  dplyr::rename(feature = Gene) %>%
  pivot_wider(names_from = Barcode.ID, values_from = LogRatio) %>%
  as.data.frame()
rownames(global_mat) <- global_mat[["feature"]]
global_mat <- as.matrix(global_mat[, -1])

## Phospho (only the three columns needed for the pivot are kept)
phospho_mat <- phospho.data %>%
  select(Barcode.ID, feature = SiteID, LogRatio) %>%
  filter(feature %in% phospho_features) %>%
  pivot_wider(names_from = Barcode.ID, values_from = LogRatio) %>%
  as.data.frame()
rownames(phospho_mat) <- phospho_mat[["feature"]]
phospho_mat <- as.matrix(phospho_mat[, -1])

## RNA (the `RNA counts` column plays the role of LogRatio here)
RNA_mat <- RNA.data %>%
  filter(Gene %in% RNA_features) %>%
  dplyr::rename(feature = Gene, LogRatio = `RNA counts`) %>%
  pivot_wider(names_from = Barcode.ID, values_from = LogRatio) %>%
  as.data.frame()
rownames(RNA_mat) <- RNA_mat[["feature"]]
RNA_mat <- as.matrix(RNA_mat[, -1])
## Standardizing: each row (feature) is centred on its own mean and then
## divided by its own standard deviation. A vector of length nrow() recycles
## down the columns of a matrix, so element i is applied to row i.

## Global
global_mat <- global_mat - apply(global_mat, 1, mean)
global_mat <- global_mat / apply(global_mat, 1, sd)

## Phospho
phospho_mat <- phospho_mat - apply(phospho_mat, 1, mean)
phospho_mat <- phospho_mat / apply(phospho_mat, 1, sd)

## RNA
RNA_mat <- RNA_mat - apply(RNA_mat, 1, mean)
RNA_mat <- RNA_mat / apply(RNA_mat, 1, sd)