-
Notifications
You must be signed in to change notification settings - Fork 0
/
randomforest_attempts.R
110 lines (92 loc) · 3.11 KB
/
randomforest_attempts.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Setup ----
library(tidyverse)
library(randomForest)

# Choose which extracted-feature dataset to load
# dataset_version <- "FT350"
dataset_version <- "FT2040"
output_folder <- paste0("made_data_", dataset_version, "/")

# Read the feature table; infinite signal-to-noise values are coerced to 0
# so the model fits below don't choke on Inf.
features_extracted <- read_csv(paste0(output_folder, "features_extracted.csv")) %>%
  mutate(sn = replace(sn, is.infinite(sn), 0))
# Using ALL features up to this point ----
# Drop the identifier and label columns before fitting. -any_of() tolerates
# whichever id column name is present ("feature" or "feat_id" — the rest of
# this script consistently uses "feat_id"), whereas the original
# select(-feature, ...) errors if no column named "feature" exists.
feature_subset <- features_extracted %>%
  select(-any_of(c("feature", "feat_id")), -feat_class) %>%
  data.matrix()
rfmodel <- randomForest(feature_subset, y = factor(features_extracted$feat_class),
                        importance = TRUE, proximity = TRUE, keep.forest = TRUE)
rfmodel
varImpPlot(rfmodel)

# 3-D multidimensional scaling of the RF proximity matrix, colored by class
rf.mds <- MDSplot(rfmodel, factor(features_extracted$feat_class), k = 3)
rf.mds$points %>%
  as.data.frame() %>%
  cbind(feat_class = features_extracted$feat_class) %>%
  plotly::plot_ly(x = ~`Dim 1`, y = ~`Dim 2`, z = ~`Dim 3`, color = ~feat_class)

# features_extracted %>%
#   select(-feat_id, -feat_class) %>%
#   data.matrix() %>%
#   partialPlot(x = rfmodel, x.var = "med_SNR")
# Using the xcms defaults ----
# Restrict the predictors to the metrics xcms itself reports by default.
xcms_feature_matrix <- features_extracted %>%
  select(mean_mz, sd_ppm, mean_rt, sd_rt, mean_pw, log_mean_height,
         sn, f, scale, lmin, feat_npeaks, sd_pw) %>%
  data.matrix()
rfmodel <- randomForest(xcms_feature_matrix,
                        y = factor(features_extracted$feat_class),
                        importance = TRUE)
rfmodel
varImpPlot(rfmodel)
# Using and plotting the best 3 ----
# Fit on just the three strongest predictors from the earlier importance plots.
top3_features <- select(features_extracted, med_cor, med_SNR, shape_cor)
rfmodel <- randomForest(top3_features, y = factor(features_extracted$feat_class),
                        importance = TRUE)
rfmodel
varImpPlot(rfmodel)

# Interactive 3-D view of the same three metrics, colored by class;
# med_cor is log-flipped so values near 1 spread apart visually.
features_extracted %>%
  plotly::plot_ly(x = ~log10(1 - med_cor), y = ~med_SNR, z = ~shape_cor,
                  color = ~feat_class, mode = "markers", type = "scatter3d")
# rpart?
library(rpart)
library(rpart.plot)

# Single decision tree on the xcms-default feature set. (The original also
# dropped feat_id first, but the explicit column list below already
# excludes it, so that step is redundant.)
fit <- features_extracted %>%
  select(mean_mz, sd_ppm, mean_rt, sd_rt, mean_pw, log_mean_height,
         sn, f, lmin, scale, feat_npeaks, sd_pw, feat_class) %>%
  rpart(formula = feat_class ~ .)
rpart.plot(fit)
# Normal dimensionality reductions
# PCA biplot on the scaled feature matrix (feat_id as rownames for labels)
features_extracted %>%
  column_to_rownames("feat_id") %>%
  select(-feat_class) %>%
  data.matrix() %>%
  scale() %>%
  prcomp() %>%
  biplot()

# Same PCA, but scatter the first two score columns colored by class
pca_scores <- features_extracted %>%
  select(-feat_id, -feat_class) %>%
  data.matrix() %>%
  scale() %>%
  prcomp() %>%
  .$x %>%
  as.data.frame()
pca_scores %>%
  cbind(feat_class = features_extracted$feat_class) %>%
  ggplot() +
  geom_point(aes(x = PC1, y = PC2, color = feat_class))
# plotly::plot_ly(x=~PC1, y=~PC2, z=~PC3, color=~feat_class,
#                 mode="markers", type="scatter3d")
# Proper test/train set ----
set.seed(123)
# Keep only definitively-labeled features and split ~80/20 into train/test.
# purrr::rbernoulli() is deprecated (purrr >= 1.0); its implementation is
# runif(n) > 1 - p, so runif(nrow(.)) > 0.8 draws the identical Bernoulli
# stream under the same seed. Splitting on an explicit two-level factor
# (instead of a bare logical) guarantees both list elements exist even if a
# draw happens to be one-sided, so setNames() cannot silently mislabel them.
traintestlist <- features_extracted %>%
  filter(feat_class %in% c("Good", "Bad")) %>%
  slice_sample(n = nrow(.)) %>%
  split(factor(runif(nrow(.)) > 0.8, levels = c(FALSE, TRUE))) %>%
  setNames(c("train", "test"))

# Fit on the ~80% training slice
rfmodel <- traintestlist$train %>%
  select(-feat_id, -feat_class) %>%
  data.matrix() %>%
  randomForest(y = factor(traintestlist$train$feat_class))
rfmodel
varImpPlot(rfmodel)

# Confusion matrix: predicted vs. true class on the held-out ~20%
# (the piped matrix binds to predict()'s newdata because object= is named)
traintestlist$test %>%
  select(-feat_id, -feat_class) %>%
  data.matrix() %>%
  predict(object = rfmodel) %>%
  as.data.frame() %>%
  setNames("pred_class") %>%
  cbind(traintestlist$test) %>%
  with(table(pred_class, feat_class))