forked from gulermalaria/metabolomics
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhelper_functions_met_methods.R
201 lines (181 loc) · 9.65 KB
/
helper_functions_met_methods.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
library(tidyverse)
library(knitr)
library(reshape2)
library(pracma)
library(grid)
library(randomForest)
library(vegan)
library(gridExtra)
library(RSvgDevice)
library(extrafont)
base_directory = "/Users/maureencarey/local_documents/work/metabolomics_methods/data/" # REPLACE IF DATA SAVED ELSEWHERE
# preprocess dataset function to remove undetected mets
remove_NAs = function(raw) { # raw = raw file, unprocessed
# if entire row is nas, remove
raw_naR = as.matrix(raw[rowSums(is.na(raw))!=(dim(raw)[2]), ]) #dim = 375 17
return(raw_naR) }
# preprocess dataset function to imput missing values
imputing_missing = function(data_scaled, scaled) { # data = raw, raw_naR, or data_scaled file, scaled = 1/0 for if scaled or not
remove_NAs = data_scaled #replace NAs with minimum metabolite value
for (i in 1:nrow(remove_NAs)) {
min1 = min(remove_NAs[i,],na.rm= T)
if (scaled == 1) { min2 = min1 } else if (scaled == 0) { min2 = min1/2
} else {print('scaled must be a binary 1/0 for if data is already scaled')}
for (j in 1:ncol(remove_NAs)) {
if (is.na(remove_NAs[i,j])) { remove_NAs[i,j] = min2 } } }
return(remove_NAs) }
# Function for generating multipanel plots
multiplot = function(..., plotlist=NULL, file, cols=1, layout=NULL) {
# see http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
plots = c(list(...), plotlist)
numPlots = length(plots)
if (is.null(layout)) {
layout = matrix(seq(1, cols * ceiling(numPlots/cols)),ncol = cols,
nrow = ceiling(numPlots/cols)) }
if (numPlots==1) { print(plots[[1]]) }
else { # Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
matchidx = as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col)) } } }
# Change rows in raw dataset
prep_raw = function(raw) {
rownames(raw) = gsub(" ",'',raw[,1]) # add row names (metabolites), removing spaces from strings
raw = raw[,2:dim(raw)[2]] # remove column that was moved to row names
return(raw) }
# Filter sample identifiers by what is in dataset
prep_sample_identifiers = function(raw) {
# open file containing sample measures
identifiers = read.csv(paste(base_directory,"sample_data_2017.csv",sep = ""), sep = ",", header = F,
na.strings=c('',"NA"), stringsAsFactors = F)
rownames(identifiers) = identifiers$V1 # add row names
identifiers$V1 = NULL # remove column that was moved to row names
identifiers = identifiers[,which(identifiers[3,]%in%colnames(raw))] # keep samples in raw
return(identifiers) }
# Normalize metabolite abundances by sample measures (dna, protein abundance, or parasite number)
normalize = function(input,identifiers_file,type) {
# input = input file (matrix) with rows as mets and columns as samples
# identifiers_file = sample data with rows as sample measurements, columns as samples
# type = type of normalization (dna, protein, para)
if (strcmp(type, "dna")) {
x = 6
} else if (strcmp(type, "protein")) {
x = 5
} else if (strcmp(type, "para")) {
x = 9
} else { print("ERROR, WRONG TYPE INPUT")}
pre_norm1 = input
for (i in 1:dim(pre_norm1)[1]) { # divide each metabolite abundance by it's sample value (dna, protein, or parasite #)
for (j in 1:dim(pre_norm1)[2]) {
(pre_norm1[i,j] = (as.numeric(pre_norm1[i,j])/as.numeric(identifiers_file[x,j]))) }}
norm_file = pre_norm1
return(norm_file) }
## remove characters from colnames if they will throw errors in classifiers
# use always for consistency
remove_characters_from_colnames = function(dataframe) {
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[.]", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[(]", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[)]", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[']", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[)]", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[:]", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[;]", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[[]", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[]]", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[*]", "_predicted", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[/]", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("-", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("+", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[+]", "", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("[_][_]", "_", x) })
colnames(dataframe) = lapply(colnames(dataframe), function(x) { gsub("__", "_", x) })
for (i in 1:ncol(dataframe)) { # MAKE SURE STRINGS DONT START WITH NUMBERS
st = substr(colnames(dataframe)[i],0,1) # first character in string
n_st = as.numeric(st)
if (!is.na(n_st)) { colnames(dataframe)[i] = paste('N',colnames(dataframe)[i], sep = '')} }
return(dataframe)
}
custom_theme_pca = theme(
plot.title = element_text(colour="black",size=9, family = "Helvetica"),
axis.text.x = element_text(colour="black",size=6,angle=0,hjust=.5,vjust=.5,face="plain", family = "Helvetica"),
axis.text.y = element_text(colour="black",size=6,angle=0,hjust=1,vjust=0,face="plain",family = "Helvetica"),
axis.title.x = element_text(colour="black",size=9,angle=0,hjust=.5,vjust=0,face="plain", family = "Helvetica"),
axis.title.y = element_text(colour="black",size=9,angle=90,hjust=.5,vjust=.5,face="plain",family = "Helvetica"),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(fill = "NA", colour = "black"),
axis.line = element_blank(),
panel.spacing.x = unit(2, "lines"),
legend.title=element_blank())
custom_theme2 = theme(
axis.text.x = element_text(colour="black",size=6, family = "Helvetica"),
axis.text.y = element_blank(), axis.title.y = element_blank(),
axis.ticks.y = element_blank(), axis.ticks.x = element_blank(),
axis.title.x = element_text(colour="black",size=9, family = "Helvetica"),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_blank(),
panel.border = element_rect(fill = "NA", colour = "black"),
axis.line = element_blank(),
panel.spacing.x = unit(2, "lines"),
plot.margin = unit(c(5,5,5,5),"mm"))
custom_theme3 = theme(
plot.title = element_text(size = 9, family = "Helvetica"),
axis.text.x = element_text(colour="black",size=6,angle=0,hjust=.5,vjust=.5,face="plain", family = "Helvetica"),
axis.text.y = element_text(colour="black",size=6,angle=0,hjust=1,vjust=0,face="plain", family = "Helvetica"),
axis.title.x = element_text(colour="black",size=9,angle=0,hjust=.5,vjust=0,face="plain", family = "Helvetica"),
axis.title.y = element_text(colour="black",size=9,angle=90,hjust=.5,vjust=.5,face="plain", family = "Helvetica"),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_blank(),
panel.border = element_rect(fill = "NA", colour = "black"),
axis.line = element_blank(),
panel.spacing.x = unit(2, "lines"))
custom_theme4 = theme(
plot.title = element_text(colour="black",size=10, family = "Helvetica"),
axis.text.x = element_text(colour="black",size=8,angle=0,hjust=.5,vjust=.5,face="plain", family = "Helvetica"),
axis.text.y = element_text(colour="black",size=8,angle=0,hjust=1,vjust=0,face="plain",family = "Helvetica"),
axis.title.x = element_text(colour="black",size=10,angle=0,hjust=.5,vjust=0,face="plain", family = "Helvetica"),
axis.title.y = element_text(colour="black",size=10,angle=90,hjust=.5,vjust=.5,face="plain",family = "Helvetica"),
legend.text = element_text(colour="black",size=8),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(fill = "NA", colour = "black"),
axis.line = element_blank(),
panel.spacing.x = unit(2, "lines"),
legend.title=element_blank(),
plot.margin = unit(c(20,2,2,2),"mm"))
median_center_scale_data <- function(x) {
# x is matrix or dataframe with rows as samples and columns as mets
sd_x = apply(x,2,sd,na.rm=TRUE)
x_centered = apply(x, 2, function(y) (y - median(y,na.rm=TRUE)))
x_centered_scaled = sweep(x_centered, 2, sd_x, "/")
indx = setdiff((which(is.na(x_centered_scaled))),which(is.na(x_centered)))
if (any(is.na(x_centered_scaled))) {
x_centered_scaled[indx] = 0}
if (any(is.na(x_centered_scaled))) { warning('sd == 0')}
return(x_centered_scaled)}
# make df values numeric instead of character or factor
make_numeric = function(df) { # doesn't really matter what columns or rows are
indx = sapply(df, is.factor)
df[indx] = lapply(df[indx], function(x) as.character(x))
indx = sapply(df, is.character)
df[indx] = lapply(df[indx], function(x) as.numeric(x))
return(df) }
rM = base::rowMeans
## get percent variation summarized by pca
get_1st = function(input_pca) {
# class(input_pca) = "prcomp"
temp = summary(input_pca)
temp2 = temp$importance
return(round(temp2[2,1]*100, digits = 2))
}
get_2nd = function(input_pca) {
temp = summary(input_pca)
temp2 = temp$importance
return(round(temp2[2,2]*100, digits = 2))
}