Skip to content

Commit

Permalink
Just kidding, packrat is too confusing
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewheiss committed Mar 16, 2017
1 parent 603853f commit 4971091
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 1,210 deletions.
3 changes: 0 additions & 3 deletions .Rprofile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,3 @@ rm(RPROJ)
# OR
# R <- find_rstudio_root_file
# R("Data", "data_raw")
#### -- Packrat Autoloader (version 0.4.8-1) -- ####
source("packrat/init.R")
#### -- End Packrat Autoloader -- ####
4 changes: 0 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,6 @@ Icon?
.Rapp.history
.Rproj.user

# Packrat
packrat/lib*/
packrat/src/

# --------
# Python
# --------
Expand Down
2 changes: 0 additions & 2 deletions Analysis/ingo_survey/.Rprofile
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# Resolve the directory two levels above the working directory to an absolute
# path and store it as PROJHOME in a temporary list.
# NOTE(review): "../.." is presumably the project root -- confirm.
RPROJ <- list(PROJHOME = normalizePath("../.."))
# Attach the list so PROJHOME can be referenced by name via the search path.
attach(RPROJ)
# Remove the temporary list from the workspace; the attached copy stays.
rm(RPROJ)

source("../../packrat/init.R", chdir = TRUE)
170 changes: 85 additions & 85 deletions Analysis/ingo_survey/sandbox.R
Original file line number Diff line number Diff line change
@@ -1,85 +1,85 @@
# # LDA
# library(topicmodels)
# thing_tm <- LDA(dtm.issues, k=5, method="Gibbs",
# control = list(seed = 1234, burnin = 1000,
# thin = 100, iter = 1000))
# thing_tm1 <- LDA(dtm.issues, k=3, method="VEM",
# control = list(seed=1234, estimate.alpha=FALSE))
# topics(thing_tm1, 1)
# terms(thing_tm1, 10)
#
# topics.imputed <- data_frame(topic = 1:4,
# title = c("Human rights, policy, and research",
# "Health and development",
# "Health and advocacy",
# "Advocacy"))
#
# issue.topics <- issue.corpus.df %>%
# mutate(topic = topics(thing_tm1, 1)) %>%
# left_join(topics.imputed, by="topic")
# document.topics <- topics(thing_tm1, 1)
#
# VEM_fixed = LDA(JSS_dtm, k = k,
# + control = list(estimate.alpha = FALSE, seed = SEED))
#
# # Latent semantic analysis
# # https://meefen.github.io/blog/2013/03/11/analyze-text-similarity-in-r-latent-semantic-analysis-and-multidimentional-scaling/
# library(lsa)
# tdm.lsa <- lw_bintf(tdm.issues.mat) * gw_idf(tdm.issues.mat)
# lsa.space <- lsa(tdm.lsa)
# dist.mat.lsa <- dist(t(as.textmatrix(lsa.space))) # compute distance matrix
#
# fit <- cmdscale(dist.mat.lsa, eig = TRUE, k = 2)
# points <- bind_cols(issue.corpus.df,
# data_frame(x = fit$points[, 1], y = fit$points[, 2]))
# ggplot(points, aes(x = x, y = y)) +
# geom_point()
#
# # K-means clustering
# # http://randyzwitch.com/rsitecatalyst-k-means-clustering/
# # Find optimal number of k clusters
# # http://stackoverflow.com/a/15376462/120898
# # Different cost values
# possible.clusters <- 1:25
#
# # Sum of squared error for kmeans + elbow method (look for where slope changes)
# kmeans.cost <- possible.clusters %>%
# map(function(i) kmeans(x=tdm.issues, centers=i, iter.max=100)$tot.withinss)
#
# # Largest average silhouette width for pam (look for max avg.width)
# pam.cost <- possible.clusters %>%
# map(function(i) pam(x=tdm.issues, k=i)$silinfo$avg.width)
#
# # Combine costs for each algorithm into one dataframe
# cost.df <- data_frame(cluster = possible.clusters,
# kmeans.cost = unlist(kmeans.cost),
# pam.cost = c(NA, unlist(pam.cost))) %>%
# mutate(possible.group = case_when(
# .$cluster < 5 ~ 1,
# .$cluster >=5 & .$cluster <= 12 ~ 2,
# .$cluster > 12 ~ 3
# )) %>%
# mutate(possible.group = factor(possible.group))
#
# # kmeans
# ggplot(cost.df, aes(x=cluster, y=kmeans.cost)) +
# geom_line() +
# geom_smooth(aes(colour=possible.group), method="lm", se=FALSE) +
# labs(x="Clusters", y="Within-cluster sum of squares",
# title="Optimal cluster size (kmeans)") +
# guides(colour=FALSE) +
# theme_ath()
#
# # pam
# ggplot(na.omit(cost.df), aes(x=cluster, y=pam.cost)) +
# geom_line() +
# coord_cartesian(xlim=c(0, 25)) +
# labs(x="Clusters", y="Average silouette width",
# title="Optimal cluster size (pam)") +
# theme_ath()
#
# # Create dataframe of clusters and term frequency
# final.cluster <- pam(tdm.issues, k=4)$clustering
# issues.clustered <- data_frame(cluster = final.cluster,
# term = names(final.cluster)) %>%
# right_join(term.frequency, by="term")
# LDA
# Exploratory topic models fit on the issues document-term matrix
# (`dtm.issues`, which is created outside this section).
library(topicmodels)

# Gibbs-sampled fit with 5 topics
thing_tm <- LDA(dtm.issues, k = 5, method = "Gibbs",
                control = list(seed = 1234, burnin = 1000,
                               thin = 100, iter = 1000))

# VEM fit with 3 topics and estimate.alpha switched off
thing_tm1 <- LDA(dtm.issues, k = 3, method = "VEM",
                 control = list(seed = 1234, estimate.alpha = FALSE))
topics(thing_tm1, 1)  # most likely topic for each document
terms(thing_tm1, 10)  # ten highest-ranked terms for each topic

# Hand-written titles keyed by topic number
# NOTE(review): four titles are listed but thing_tm1 is fit with k = 3, so
# topic 4 can never match in the join below -- confirm which is intended.
topics.imputed <- data_frame(topic = 1:4,
                             title = c("Human rights, policy, and research",
                                       "Health and development",
                                       "Health and advocacy",
                                       "Advocacy"))

# Attach each document's most likely topic and the matching title
issue.topics <- issue.corpus.df %>%
  mutate(topic = topics(thing_tm1, 1)) %>%
  left_join(topics.imputed, by = "topic")
document.topics <- topics(thing_tm1, 1)

# The original second line began with a stray "+" (an R console continuation
# prompt), which made this call a syntax error; it has been removed.
# NOTE(review): JSS_dtm, k, and SEED are not defined in the visible code --
# confirm they exist before running this call.
VEM_fixed <- LDA(JSS_dtm, k = k,
                 control = list(estimate.alpha = FALSE, seed = SEED))

# Latent semantic analysis
# https://meefen.github.io/blog/2013/03/11/analyze-text-similarity-in-r-latent-semantic-analysis-and-multidimentional-scaling/
library(lsa)

# Weight the term-document matrix (lw_bintf local weight times gw_idf global
# weight) and build the LSA space from it
tdm.lsa <- lw_bintf(tdm.issues.mat) * gw_idf(tdm.issues.mat)
lsa.space <- lsa(tdm.lsa)

# Distance matrix computed on the transposed text matrix of the LSA space
dist.mat.lsa <- lsa.space %>%
  as.textmatrix() %>%
  t() %>%
  dist()

# Classical multidimensional scaling down to two dimensions
fit <- cmdscale(dist.mat.lsa, eig = TRUE, k = 2)

# Put the two MDS coordinates next to the issue data and plot them
points <- issue.corpus.df %>%
  bind_cols(data_frame(x = fit$points[, 1], y = fit$points[, 2]))

ggplot(points, aes(x = x, y = y)) +
  geom_point()

# K-means clustering
# http://randyzwitch.com/rsitecatalyst-k-means-clustering/
# Find optimal number of k clusters
# http://stackoverflow.com/a/15376462/120898
# Candidate numbers of clusters to evaluate
possible.clusters <- 1:25

# kmeans cost: total within-cluster sum of squares for each candidate size
# (elbow method -- look for where the slope changes)
kmeans.cost <- map(
  possible.clusters,
  function(i) kmeans(x = tdm.issues, centers = i, iter.max = 100)$tot.withinss
)

# pam cost: average silhouette width for each candidate size
# (look for the largest avg.width)
pam.cost <- map(
  possible.clusters,
  function(i) pam(x = tdm.issues, k = i)$silinfo$avg.width
)

# One row per candidate size with both costs, plus a three-level grouping of
# the sizes (< 5, 5-12, > 12) stored as a factor.
# NOTE(review): the leading NA presumably stands in for k = 1, where pam
# yields no silhouette width and unlist() therefore returns one value fewer
# -- confirm.
cost.df <- data_frame(
  cluster = possible.clusters,
  kmeans.cost = unlist(kmeans.cost),
  pam.cost = c(NA, unlist(pam.cost))
) %>%
  mutate(possible.group = factor(case_when(
    .$cluster < 5 ~ 1,
    .$cluster >= 5 & .$cluster <= 12 ~ 2,
    .$cluster > 12 ~ 3
  )))

# kmeans
# Within-cluster sum of squares by cluster count, with a separate linear
# trend line for each possible.group. The colour legend is hidden with
# "none" because guides(colour = FALSE) is deprecated in ggplot2.
ggplot(cost.df, aes(x = cluster, y = kmeans.cost)) +
  geom_line() +
  geom_smooth(aes(colour = possible.group), method = "lm", se = FALSE) +
  labs(x = "Clusters", y = "Within-cluster sum of squares",
       title = "Optimal cluster size (kmeans)") +
  guides(colour = "none") +
  theme_ath()

# pam
# Average silhouette width by cluster count; rows containing NA are dropped
# before plotting, and the x-axis is fixed to 0-25.
ggplot(na.omit(cost.df), aes(x = cluster, y = pam.cost)) +
  geom_line() +
  coord_cartesian(xlim = c(0, 25)) +
  labs(x = "Clusters", y = "Average silhouette width",
       title = "Optimal cluster size (pam)") +
  theme_ath()

# Create dataframe of clusters and term frequency
# Cluster assignments come from a 4-cluster pam fit; they are labelled with
# the names of the assignment vector and right-joined onto term.frequency,
# so every row of term.frequency is kept.
final.cluster <- pam(tdm.issues, k = 4)[["clustering"]]
issues.clustered <- right_join(
  data_frame(cluster = final.cluster, term = names(final.cluster)),
  term.frequency,
  by = "term"
)
2 changes: 0 additions & 2 deletions Analysis/ngo_regs_regime_stability/.Rprofile
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# Resolve the directory two levels above the working directory to an absolute
# path and store it as PROJHOME in a temporary list.
# NOTE(review): "../.." is presumably the project root -- confirm.
RPROJ <- list(PROJHOME = normalizePath("../.."))
# Attach the list so PROJHOME can be referenced by name via the search path.
attach(RPROJ)
# Remove the temporary list from the workspace; the attached copy stays.
rm(RPROJ)

source("../../packrat/init.R", chdir = TRUE)
Loading

0 comments on commit 4971091

Please sign in to comment.