title | author | date | output | ||||
---|---|---|---|---|---|---|---|
Clustering and Factor Analysis - Microvan Research |
Nammn Joshii |
14/10/2020 |
|
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(gmodels)
library(REdaS)
library(haven)
mv <- read_dta("/Users/naman/Desktop/microvan.dta")
mv[1] <-NULL
cor <- cor(mv[,colnames(mv)[2:31]])
i = 1
par(mfrow = c(5, 6), mar = c(2,2,2,2))
for (col in mv[2:38]) {
hist(col, main = colnames(mv)[i],col=rainbow(i))
i = i + 1
}
model <- lm(mvliking ~ ., data=mv[1:31])
summary(model)
# Bartlett test of sphericity
bart_spher(mv[,colnames(mv)[2:31]])
# Kaiser-Meyer-Olkin Measure of sampling adequacy
KMOS(mv[,colnames(mv)[2:31]])
# Create a table of results for ease of interpretation
ev <- eigen(cor(mv[,colnames(mv)[2:31]]))$values
e <- data.frame(Eigenvalue = ev, PropOfVar = ev / length(ev), CumPropOfVar = cumsum(ev / length(ev)))
round(e, 4)
p <- ggplot()
p <- p + geom_line(aes(x = 1:length(ev), y = ev))
p <- p + geom_point(aes(x = 1:length(ev), y = ev))
p <- p + geom_hline(yintercept = 1, colour = "red")
p <- p + labs(x = "Number", y = "Eigenvalues", title = "Scree Plot of Eigenvalues")
p <- p + scale_x_continuous(breaks = 1:length(ev), minor_breaks = NULL)
p <- p + theme_bw()
p
n <- length(which(ev > 1))
library(psych)
pc <- principal(mv[,colnames(mv)[2:31]], nfactors = n, rotate="varimax")
fl <- cbind.data.frame(pc$loadings[,], Uniqueness = pc$uniquenesses)
round(fl[order(pc$uniquenesses),], 4)
# View(fl[fl$RC1 > 0.5 | fl$RC1 < -0.5,])
# View(fl[fl$RC2 > 0.5 | fl$RC2 < -0.5,])
# View(fl[fl$RC3 > 0.5 | fl$RC3 < -0.5,])
# View(fl[fl$RC4 > 0.5 | fl$RC4 < -0.5,])
# View(fl[fl$RC5 > 0.5 | fl$RC5 < -0.5,])
factor_scores <- as.data.frame(pc$scores)
final_data <- cbind.data.frame(mv[1],pc$scores)
new_model <- lm(mvliking ~ RC1 + RC2 + RC3 + RC4 + RC5, data=final_data)
summary(new_model)
reduced_model <- lm(mvliking ~ RC1 + RC2 + RC4, data=final_data)
summary(reduced_model)
anova(new_model,reduced_model)
factor_scores$RC3 <- NULL
factor_scores$RC5 <- NULL
d <- dist(pc$scores)
h <- hclust(d, method = "ward.D2")
plot(h, xlab = "Respondent")
z <- scale(factor_scores, center = TRUE, scale = TRUE)
Since the k-means algorithm starts with a random set of centers, setting the seed helps ensure the results are reproducible
set.seed(1)
k <- kmeans(z, centers = 3)
k
k$size
sapply(c("RC1", "RC2","RC4"), function(n) k$centers[, n]*sd(pc$scores[,n]) + mean(pc$scores[,n]))
mv <- cbind(mv, cluster = k$cluster)
mv$mvliking_group <- ifelse(mv$mvliking >= 0 & mv$mvliking <= 3, 'Low Liking ',
ifelse(mv$mvliking >=4 & mv$mvliking <=6, 'Medium Liking',
ifelse(mv$mvliking >=7, 'High Liking', 'something else')))
CrossTable(x = mv$cluster, y = mv$mvliking_group, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)
mv$age_group <- ifelse(mv$age >= 19 & mv$age <= 35, '19-35',
ifelse(mv$age >= 36 & mv$age <= 50, '36-50',
ifelse( mv$age > 50, '50+','something else')))
mv$income_group <- ifelse(mv$income >= 15 & mv$income <= 50, '0-50k',
ifelse(mv$income >= 51 & mv$income <= 100, '51-100k',
ifelse(mv$income >= 101 & mv$income <= 150, '101-150k',
ifelse( mv$income > 150, '150+k','something else'))))
mv$miles_group <- ifelse(mv$miles >= 0 & mv$miles <= 14, '< 14k',
ifelse(mv$miles >= 15 & mv$miles <= 18, '15-19k',
ifelse(mv$miles >= 19 & mv$miles <= 22, '19-22k',
ifelse( mv$miles > 22, '22+','something else'))))
mv$educ_group <- ifelse(mv$educ >= 1 & mv$educ <= 2, 'Less Formal Education',
ifelse(mv$educ == 3, 'Undergraduate',
ifelse( mv$educ == 4, 'Graduate','something else')))
CrossTable(x = mv$cluster, y = mv$age_group, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)
CrossTable(x = mv$cluster, y = mv$income_group, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)
CrossTable(x = mv$cluster, y = mv$miles_group, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)
CrossTable(x = mv$cluster, y = mv$female, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)
CrossTable(x = mv$cluster, y = mv$recycle, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)
CrossTable(x = mv$cluster, y = mv$numkids, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)
CrossTable(x = mv$cluster, y = mv$educ_group, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)