-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Module-4-Overall-Example.R
87 lines (70 loc) · 2.64 KB
/
Module-4-Overall-Example.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#
# The following code is implemented by Heather Shappell
#
# Set the working directory.
# setwd("C:\\Users\\hshap\\Documents\\CS 555 Assignments\\Teaching\\Week4")
#Read in data
library(xlsx)
# data<-read.xlsx("salary.xlsx",1)
data <- read.csv("Datasets/CEO_salary.csv")
data
#Correlation and Scatterplots - separately
cor(data$age,data$height)
cor(data$age,data$salary)
cor(data$height,data$salary)
cor(data)
#Scatterplot matrix - everything at once!
pairs(data)
##CODE TO MAKE THE PRETTY PLOT##
panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y, use="complete.obs"))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2)
}
panel.hist <- function(x, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks
nB <- length(breaks)
y <- h$counts
y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="white", ...)
}
panel.lm <- function (x, y, col = par("col"), bg = NA, pch = par("pch"),
cex = 1, col.smooth = "black", ...) {
points(x, y, pch = pch, col = col, bg = bg, cex = cex)
abline(stats::lm(y ~ x), col = col.smooth, ...)
}
pairs(data,upper.panel=panel.cor, diag.panel=panel.hist, lower.panel=panel.lm)
#Multiple Linear Regression
m<-lm(data$salary~data$age+data$height)
summary(m)
confint(m, level=.99)
#ANOVA table by hand (if you are curious about determine quantities,
#NOT NECESSARY to do as F test is in the summary output - this is ONLY
##for comparison with by hand calculations in module)
#DO NOT USE THIS CODE BELOW FOR YOUR HOMEWORK
totalss <-sum((data$salary -mean(data$salary))*(data$salary-mean(data$salary)))
regss <-sum((fitted(m) -mean(data$salary))*(fitted(m) -mean(data$salary)))
residss <-sum((data$salary-fitted(m))*(data$salary-fitted(m)))
fstatistic<- (regss/2)/(residss/97)
pvalue <-1-pf(fstatistic,df1=2,df2=97)
#Confidence Intervals
confint(m, level =.99)
#Assessing fit/Regression Diagnostics
#Residual plots - Checking constant variance and linearity
plot(fitted(m),resid(m), axes=TRUE, frame.plot=TRUE, xlab = "fitted values", ylab="residuals")
abline(h=0)
plot(data$age,resid(m), axes=TRUE, frame.plot=TRUE, xlab = "age", ylab="residuals")
abline(h=0)
plot(data$height,resid(m), axes=TRUE, frame.plot=TRUE, xlab = "height", ylab="residuals")
abline(h=0)
#Checking Normality of residuals
hist(resid(m), breaks=2)