update data loading and posterior predictions

FredericBlum · Apr 11, 2024 · 6dd41b6 · 6dd41b6
1 parent 384b97a
commit 6dd41b6
Show file tree

Hide file tree

Showing 18 changed files with 200 additions and 115 deletions.
diff --git a/scripts/00_setup.R b/scripts/00_setup.R
@@ -1,20 +1,22 @@
-install.packages("groundhog", repos="https://ftp.fau.de/cran/")
-library("groundhog")
-set.groundhog.folder("./R_groundhog/")
-pkgs <- c("brms","viridis", "readr", "posterior", "dplyr", "ggplot2", "ggdist",
-          "gghalves", "patchwork", "bayesplot", "tidybayes", "xtable", "ggrepel",
-          "rnaturalearth", "rnaturalearthdata", "tidyr", "stringr", "ape")
-groundhog.library(pkgs, "2023-12-03", force.install=TRUE)
-groundhog.library('github::stan-dev/cmdstanr', "2023-08-01", force.install=TRUE)
+install.packages('groundhog', repos='https://ftp.fau.de/cran/')
+library('groundhog')
+set.groundhog.folder('./R_groundhog/')
+pkgs <- c('brms','viridis', 'readr', 'posterior', 'dplyr', 'ggplot2', 'ggdist',
+          'gghalves', 'patchwork', 'bayesplot', 'tidybayes', 'xtable', 'ggrepel',
+          'rnaturalearth', 'rnaturalearthdata', 'tidyr', 'stringr', 'ape',
+          'geostan', 'geodist', 'gridExtra')
+groundhog.library(pkgs, '2023-04-01', force.install=TRUE)
+groundhog.library('github::stan-dev/cmdstanr', '2023-08-01', force.install=TRUE)
 
-source("01_DataExplorations.R")
-source("02_PriorDistributions.R")
-source("03_PriorModel.R")
+source('01_DataExplorations.R')
+source('02_PriorDistributions.R')
+source('03_PriorModel.R')
 
-# The models will take a long time to compile, feel free to run, or to use
-# our provided models via OSF.
-# source("04_FinalModel.R")
+# The models and posterior predictions will take a long time to compile,
+# feel free to run, or to use our provided models via OSF.
+# source('04_FinalModel.R')
+# source('05_ModelConvergence.R')
+# source('test_moran.R)
 
-source("05_ModelConvergence.R")
-source("06_BayesViz.R")
-source("07_utils.R")
+source('06_BayesViz.R')
+source('07_utils.R')
diff --git a/scripts/01_DataExplorations.R b/scripts/01_DataExplorations.R
@@ -9,14 +9,11 @@ library(viridis)
 ###################################
 ###       Data                  ###
 ###################################
-data <- read_tsv('data.tsv') %>% 
-  mutate(sound_class=paste(voicing, sound_class),
-         word_initial=as.factor(word_initial),
-         utt_initial=as.factor(utt_initial),
-         initial=ifelse(
-           utt_initial==1, "utterance-initial", ifelse(
-             word_initial==1, "word-initial", "other"
-           )))
+data <- read_tsv('data.tsv') %>% mutate(
+  initial=ifelse(
+    utt_initial==1, "utterance-initial", ifelse(
+      word_initial==1, "word-initial", "other"
+      )))
 
 langs <- data %>% group_by(Language) %>% count() %>% arrange(Language)
 phons <- data %>% group_by(Language, Value) %>% count() %>% arrange(n)

diff --git a/scripts/02_PriorDistributions.R b/scripts/02_PriorDistributions.R
@@ -5,90 +5,90 @@ library(ggdist)
 library(patchwork)
 library(viridis)
 
-n = 1e5
+n=1e5
 set.seed(42)
 
 #########################################
 ###     influence of predictors       ###
 #########################################
 
-predictors <- tibble(x = c(rnorm(n, 0, 0.3))) %>%
-  mutate(group = 'beta%~% Normal(0, 0.3)') %>% 
-  ggplot(aes(x = x)) +
-  geom_density(aes(fill = group)) +
-  scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
-  scale_y_continuous(breaks = NULL) +
+predictors <- tibble(x=c(rnorm(n, 0, 0.3))) %>%
+  mutate(group='beta%~% Normal(0, 0.3)') %>% 
+  ggplot(aes(x=x)) +
+  geom_density(aes(fill=group)) +
+  scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
+  scale_y_continuous(breaks=NULL) +
   ylab("Density of values") +
-  scale_x_continuous(name = "Predictor values on log-scale", 
-                     limits = c(-1.25, 1.25), 
-                     breaks = seq(from = -1, to = 1, by = 0.5)) +
-  theme(legend.position = "none",
-        plot.title = element_text(size = 14)) +
-  labs(title = "β ~ Normal(0, 0.3)")
+  scale_x_continuous(name="Predictor values on log-scale", 
+                     limits=c(-1.25, 1.25), 
+                     breaks=seq(from=-1, to=1, by=0.5)) +
+  theme(legend.position="none",
+        plot.title=element_text(size=14)) +
+  labs(title="β ~ Normal(0, 0.3)")
 
 #########################################
 ###     Intercept priors              ###
 #########################################
 
 # values for intercept
-int_vals <- c(rnorm(n, mean = 4.5, sd = 0.1)) 
-int_vals %>% tibble() %>% ggplot(aes(x = .)) + geom_density()
+int_vals <- c(rnorm(n, mean=4.5, sd=0.1)) 
+int_vals %>% tibble() %>% ggplot(aes(x=.)) + geom_density()
 
 # sigma for intercept
-sigma1 <- rexp(n, rate = 12)
+sigma1 <- rexp(n, rate=12)
 sigma1 %>% tibble() %>% ggplot(aes(x=.))+ geom_density()
 
 # combination of both
-sample_ints <- tibble(x = c(rlnorm(n, 
-                                  meanlog = int_vals, 
-                                  sdlog = sigma1))) %>%
-  mutate(group = 'alpha%~% logn( Normal(4.5, 0.1), exp(12) )') %>% 
-  ggplot(aes(fill = group)) +
-  geom_density(aes(x = x)) +
+sample_ints <- tibble(x=c(rlnorm(n, 
+                                  meanlog=int_vals, 
+                                  sdlog=sigma1))) %>%
+  mutate(group='alpha%~% logn( Normal(4.5, 0.1), exp(12) )') %>% 
+  ggplot(aes(fill=group)) +
+  geom_density(aes(x=x)) +
   scale_x_log10(limits= c(15, 320),
-                breaks = c(10, 20, 30, 50, 100, 200, 300),
-                name = "Prior distribution for the intercept") +
-  scale_y_continuous(breaks = NULL,
-                     name = "Density of values") +
-  scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
-  theme(legend.position = "none",
-        plot.title = element_text(size = 14)) +
-  labs(title = "α ~ logn(Normal(4.5, 0.1), Exp(12))")
+                breaks=c(10, 20, 30, 50, 100, 200, 300),
+                name="Prior distribution for the intercept") +
+  scale_y_continuous(breaks=NULL,
+                     name="Density of values") +
+  scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
+  theme(legend.position="none",
+        plot.title=element_text(size=14)) +
+  labs(title="α ~ logn(Normal(4.5, 0.1), Exp(12))")
 
 #########################################
 ###         sigma2                    ###
 #########################################
-sigma2 <- rexp(n, rate = 12) %>% 
+sigma2 <- rexp(n, rate=12) %>% 
   tibble() %>% 
-  mutate(group = 'sigma%~% exp(12)') %>% 
+  mutate(group='sigma%~% exp(12)') %>% 
   ggplot(aes(x=.)) + 
-  geom_density(aes(fill = group)) +
-  scale_y_continuous(breaks = NULL,
-                     name = "Density of values") +
-  scale_x_continuous(breaks = seq(from = 0, to = 1.2, by = 0.2),
-                     limits = c(0, 1.2),
-                     name = "Standard deviation of varying intercepts on log-scale") +
-  scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
-  theme(legend.position = "none",
-        plot.title = element_text(size = 14)) +  
-  labs(title = "σ ~ Exp(12)")
+  geom_density(aes(fill=group)) +
+  scale_y_continuous(breaks=NULL,
+                     name="Density of values") +
+  scale_x_continuous(breaks=seq(from=0, to=1.2, by=0.2),
+                     limits=c(0, 1.2),
+                     name="Standard deviation of varying intercepts on log-scale") +
+  scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
+  theme(legend.position="none",
+        plot.title=element_text(size=14)) +  
+  labs(title="σ ~ Exp(12)")
 
 #########################################
 ###     varying slopes matrix         ###
 #########################################
 
-lkjcorr <- rlkjcorr_marginal(n, K = 2, eta = 5) %>% tibble(x = .) %>% 
-  mutate(group = 'R%~% LKJcorr(5)') %>% 
-  ggplot(aes(x = x, fill = group)) + 
+lkjcorr <- rlkjcorr_marginal(n, K=2, eta=5) %>% tibble(x=.) %>% 
+  mutate(group='R%~% LKJcorr(5)') %>% 
+  ggplot(aes(x=x, fill=group)) + 
   geom_density() +
-  scale_y_continuous(breaks = NULL) +
-  scale_x_continuous(name = "Correlation of varying intercepts and slopes",
-                     breaks = c(-1, -0.5, 0, 0.5, 1)) +
-  scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
-  theme(legend.position = "none",
-        plot.title = element_text(size = 14)) +
+  scale_y_continuous(breaks=NULL) +
+  scale_x_continuous(name="Correlation of varying intercepts and slopes",
+                     breaks=c(-1, -0.5, 0, 0.5, 1)) +
+  scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
+  theme(legend.position="none",
+        plot.title=element_text(size=14)) +
   ylab("Density of values") +
-  labs(title = "R ~ LKJcorr(5)")
+  labs(title="R ~ LKJcorr(5)")
 
 all_priors <- (sample_ints + predictors) / (sigma2 + lkjcorr)
-ggsave("images/prior_all.png", all_priors, scale = 1)
+ggsave("images/prior_all.png", all_priors, scale=1)
diff --git a/scripts/04_FinalModel.R b/scripts/04_FinalModel.R
@@ -9,17 +9,10 @@ library(ape)
 langs <- read_csv('languages.csv')
 data <- read_tsv('data.tsv') %>% 
   mutate(
-    word_initial = as.factor(word_initial),
-    utt_initial = as.factor(utt_initial),
     cluster = as.factor(ifelse(InCluster==0, 'single', ifelse(
       ClusterInitial==1, 'initial', 'nonInitial')
-    ))
-  ) %>% 
-  left_join(langs, by = join_by(Language==ID)) %>% 
-  rename(
-    IPA = CLTS,
-    Family = name_macro_family
-  )
+    ))) %>% 
+  left_join(langs, by = join_by(Language==ID))
 
 # Read in phylogenetic control
 df_phylo <- read_rds("df-phylo.rds")
@@ -36,19 +29,19 @@ model <-
       formula=Duration ~ 1 + utt_initial + word_initial + cluster +
         (1 + utt_initial + word_initial + cluster | Language) +
         (1 | Speaker) + (1 | gr(phylo, cov=phylo)) +
-        (1 + word_initial + utt_initial | IPA) +
+        (1 + word_initial + utt_initial | CLTS) +
         z_num_phones + z_word_freq + z_speech_rate,
       prior=c(prior(normal(4.5, 0.1), class=Intercept),
               prior(normal(6, 0.5), class=shape),
               prior(normal(0, 0.3), class=b),
               prior(exponential(12), class=sd),
               prior(lkj(5), class=cor)
       ),
-      iter=2000, warmup=1000, chains=4, cores=4,
+      iter=20, warmup=10, chains=4, cores=4,
       control=list(adapt_delta=0.80, max_treedepth=10),
       thread=threading(3),
       seed=1,
       silent=0,
-      file="models/cl_hierarchical",
+      file="models/cl_final",
       backend="cmdstanr"
   )