-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLinearRegression.R
97 lines (77 loc) · 2.84 KB
/
LinearRegression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Importing packages
library(ISLR)
library(ggcorrplot)
# Load the Auto data set and take a first look at it
auto_data <- Auto
dim(auto_data)             # number of rows and columns
head(auto_data, n = 10)    # first ten observations
str(auto_data)             # column names and storage types
summary(auto_data)         # per-column summary statistics
# Exploratory data analysis (EDA): distinct values of the discrete columns
uniq_year <- unique(auto_data[["year"]])           # categorical: 70 to 82
uniq_origin <- unique(auto_data[["origin"]])       # categorical: 1 2 3
uniq_cylinders <- unique(auto_data[["cylinders"]]) # categorical: 3 4 5 6 8
# 392 rows by 9 columns: far more observations than variables ("skinny" data)
dim(auto_data)
# Distribution of mpg within each level of the three discrete predictors.
# Every plot's title and x-axis label name the grouping variable that is
# actually on the x axis (the year and cylinders plots previously reused the
# "Origin" / "each year" labels from the plot above them).
boxplot(
  mpg ~ origin,
  data = auto_data,
  main = "Boxplots for each origin",
  xlab = "Origin",
  ylab = "mpg",
  col = "red",
  border = "darkblue"
)
boxplot(
  mpg ~ year,
  data = auto_data,
  main = "Boxplots for each year",
  xlab = "Year",
  ylab = "mpg",
  col = "orange",
  border = "darkblue"
)
boxplot(
  mpg ~ cylinders,
  data = auto_data,
  main = "Boxplots for each number of cylinders",
  xlab = "Cylinders",
  ylab = "mpg",
  col = "orange",
  border = "darkblue"
)
# One-way ANOVA of mpg against each discrete predictor.
# year, origin and cylinders are treated as categorical in this analysis, but
# they are stored as numbers; passed as-is, aov() would fit a single linear
# slope (1 degree of freedom) rather than compare group means. Wrapping them
# in factor() gives one level per distinct value, i.e. a true one-way ANOVA.
# res.aov is reused, so after this section it holds the cylinders fit.
res.aov <- aov(mpg ~ factor(year), data = auto_data)
summary(res.aov)
res.aov <- aov(mpg ~ factor(origin), data = auto_data)
summary(res.aov)
res.aov <- aov(mpg ~ factor(cylinders), data = auto_data)
summary(res.aov)
# Scatter plots of mpg against each continuous predictor, to see how strongly
# each feature is associated with the output
ggplot(data = auto_data, mapping = aes(x = displacement, y = mpg)) +
  geom_point()
ggplot(data = auto_data, mapping = aes(x = horsepower, y = mpg)) +
  geom_point()
ggplot(data = auto_data, mapping = aes(x = weight, y = mpg)) +
  geom_point()
# acceleration shows much more randomness than the other attributes
ggplot(data = auto_data, mapping = aes(x = acceleration, y = mpg)) +
  geom_point()
# Pairwise correlations among the first six (numeric) columns, to understand
# the relationship between the independent variables and the dependent one
auto_cor <- cor(auto_data[, 1:6])
# Lower triangle only, drawn as circles, with variables reordered by
# hierarchical clustering (hc.order)
ggcorrplot(
  auto_cor,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  title = "Correlation between features in Auto dataset"
)
# Baseline model: linear regression of mpg on the seven listed predictors,
# fitted on the full data set (no train/test split)
linearMod1 <- lm(
  mpg ~ cylinders + displacement + horsepower + weight + acceleration + year +
    origin,
  data = auto_data
)
summary(linearMod1)
coef(linearMod1)    # estimated coefficients
fitted(linearMod1)  # fitted mpg for every observation
plot(linearMod1)    # standard lm diagnostic plots
# Second model: same predictors, but with log(mpg) as the response
mpg_log <- log(auto_data$mpg)
auto_log <- auto_data
auto_log$mpg_log <- mpg_log
auto_log$mpg <- NULL # drop the untransformed response from the copy
head(auto_log)
# Fit on auto_log, the data frame that actually contains mpg_log. With
# data = auto_data the response is not a column of the data and is only found
# because a global vector named mpg_log happens to exist, which makes the fit
# depend on workspace state.
linearMod2 <- lm(
  mpg_log ~ cylinders + displacement + horsepower + weight + acceleration +
    year + origin,
  data = auto_log
)
linearMod2$coefficients
summary(linearMod2)
# Re-check the predictor/response relationships on the log scale
ggplot(auto_log, aes(x = horsepower, y = mpg_log)) + geom_point()
ggplot(auto_log, aes(x = weight, y = mpg_log)) + geom_point()
ggplot(auto_log, aes(x = displacement, y = mpg_log)) + geom_point()
# Final model: log(mpg) on the same predictors as the previous model, with
# acceleration removed.
# Fitted on auto_log, the data frame that holds the mpg_log column, instead of
# auto_data, where mpg_log was only resolved through a same-named global vector.
linearMod3 <- lm(
  mpg_log ~ cylinders + displacement + horsepower + weight + year + origin,
  data = auto_log
)
linearMod3$coefficients
summary(linearMod3)
# Optional part: diagnostic plots for the final model
plot(linearMod3)