## final_all_linear.R
load("DOC_baseFlow_weightedAvg_and_predictors.RData")
data <- DOC_baseFlow_weightedAvg_and_predictors
y <- data[,"meanDOC"]
set.seed(0)
## helper function to calculate r^2 for a set of predictions
rsqr <- function(prediction, observed) {
  residuals <- observed - prediction
  SSR <- sum(residuals^2)
  SST <- sum((observed - mean(observed))^2)
  1 - (SSR / SST)
}
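## quick sanity check on the helper: predicting the observed mean for every
## observation should give an r^2 of 0, and a perfect prediction should give 1
rsqr(rep(mean(y), length(y)), y)   # expect 0
rsqr(y, y)                         # expect 1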
## Make a testing/training set (random 50/50 split)
n <- nrow(data)
p <- sample(1:n)
train <- data[p[1:floor(n/2)], ]
test <- data[p[(floor(n/2) + 1):n], ]
## all vars linear model (probably the bad one)
allm <- lm(meanDOC ~ ., data=train)
summary(allm)
## r^2 of 0.6587 on training but, as expected, there are a lot of variables to
## remove based on significance alone
params <- colnames(data)
params <- params[
! params %in% c(
"newLocationID", "Year", "Longitude",
"MPB_greenFraction", "MPB_redFraction", "MPB_grayFraction",
"landCover_agriculturalFraction", "minTemp_baseFlow",
"minTemp_nonBaseFlow", "minTemp_yearAvg", "maxTemp_baseFlow",
"maxTemp_nonBaseFlow", "maxTemp_yearAvg", "precip_baseFlow",
"precip_yearAvg", "soil_Afraction", "soil_Cfraction",
"SWE_baseFlow", "SWE_nonBaseFlow", "SWE_yearAvg"
)
]
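## the exclusion list above was assembled by hand from summary(allm); roughly the
## same candidates could also be pulled from the coefficient p-values directly,
## e.g. with a 0.05 cutoff (note that factor terms expand to several coefficients)
pvals <- summary(allm)$coefficients[-1, "Pr(>|t|)"]
names(pvals)[pvals < 0.05]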
## now we rerun the model on only the initially significant variables
## we will validate it on the held-out testing set, then try to eliminate
## more variables using background information
signifm <- lm(meanDOC ~ ., data=train[, params])
summary(signifm)
## r^2 of 0.6417 on training
test_pred_sig <- predict(signifm, newdata = test[, params])
rsqr_test_sig <- rsqr(test_pred_sig, test$meanDOC)
## r^2 is 0.5375 on the testing data;
## the training-set r^2 is noticeably higher than the testing-set r^2, which
## suggests the model is overfitted
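## for a like-for-like comparison, compute the training-set r^2 with the same
## helper; this should agree with the multiple r^2 reported by summary(signifm)
(rsqr_train_sig <- rsqr(predict(signifm), train$meanDOC))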
## now we will try to remove variables that are both insignificant AND
## that the background information suggests are unnecessary
params <- colnames(data)
params <- params[
! params %in% c(
"newLocationID", "Year", "Longitude", "Latitude",
"fire_normRdNBR_5yDecay", "fire_normRdNBR_15yDecay",
"aspect_predominant",
"precip_nonBaseFlow", "landCover_forestFraction",
"MPB_greenFraction", "MPB_redFraction", "MPB_grayFraction",
"landCover_agriculturalFraction", "minTemp_baseFlow",
"minTemp_nonBaseFlow", "minTemp_yearAvg", "maxTemp_baseFlow",
"maxTemp_nonBaseFlow", "maxTemp_yearAvg", "precip_baseFlow",
"precip_yearAvg", "soil_Afraction", "soil_Cfraction",
"SWE_baseFlow", "SWE_nonBaseFlow", "SWE_yearAvg"
)
]
## What we removed this time:
##   aspect_predominant
##   two of the fire_normRdNBR decay terms (now only using the 10-year one)
##   landCover_forestFraction, keeping the land fraction (the fractions add to 1)
##   precip_nonBaseFlow and Latitude (seemingly unneeded)
## we also rescale the predictors at this point (dividing by the training-set
## standard deviation) to try to get a clearer model
cutdata <- data[, params]
cuttrain <- cutdata[p[1:floor(n/2)], ]
cuttest <- cutdata[p[(floor(n/2) + 1):n], ]
dep_ndx <- which(colnames(cutdata) == 'meanDOC')
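## divide each predictor (but not the response) by its training-set standard
## deviation; applying the training-set SD to the test set as well keeps
## information from the test rows out of the preprocessing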
for (v in 1:ncol(cutdata)) {
if (v == dep_ndx) { next }
s <- sd(cuttrain[,v])
cuttrain[,v] <- cuttrain[,v] / s
cuttest[,v] <- cuttest[,v] / s
}
cutm <- lm(meanDOC ~ ., data=cuttrain)
summary(cutm)
test_pred_cut <- predict(cutm, newdata = cuttest)
(rsqr_test_cut <- rsqr(test_pred_cut, cuttest$meanDOC))
## overall it seems fire_norm is inconsistent in whether or not it is useful.
## r^2 for this model is 0.6148 on training and 0.5685 on testing, so the model
## is still slightly overfitted, but less so than the all-variables model.