Skip to content

Commit

Permalink
Merge pull request #8 from FredericBlum/control
Browse files Browse the repository at this point in the history
v0.2rc1
  • Loading branch information
FredericBlum authored May 15, 2024
2 parents 804d7ae + 673e495 commit e0dcba8
Show file tree
Hide file tree
Showing 70 changed files with 1,103 additions and 26,487 deletions.
11 changes: 6 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
*Rproj
*.DS_Store
*Rproj
.Rproj.user
scripts/models/*.rds
doreco/
clts/
scripts/data.tsv
scripts/data_nofilter.tsv
scripts/models/*.rds
scripts/R_groundhog/
scripts/groundhog_libraries_2023-07-17/
doreco/
clts/
scripts/vowels.tsv
*.sh
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ You can now run the download and the conversion to CLDF. While downloading, inse

```bash
cldfbench download cldfbench_doreco.py
cldfbench makecldf cldfbench_doreco.py --glottolog-version v4.8
cldfbench makecldf cldfbench_doreco.py --glottolog-version v5.0
```

The last step created a SQLite database out of the CLDF data, to quickly access all the data. Once you have the SQLite database ready, you need to install the pre-written views. Run SQL:
Expand Down Expand Up @@ -60,6 +60,6 @@ Now you are in a folder full of R-Scripts. The first script `00_setup.R` include
Rscript 00_setup.R
```

Please note that the two scripts which run the brms-model are currently commented out. This has a simple reason: Running them takes several days. If you want to re-run the model, please go to the respective script and run it from there, or un-comment the line in `00_setup.R`. We provide the fitted model within our OSF-repository [here](https://osf.io/tc9zx/?view_only=a658d5da64a7431f87d993b71a8e184c) so that you do not need to re-run the model.
Please note that the two scripts which run the brms-model are currently commented out. This has a simple reason: Running them takes several days. If you want to re-run the model, please go to the respective script and run it from there, or un-comment the line in `00_setup.R`. We provide the fitted model within our OSF-repository [here](https://doi.org/10.17605/OSF.IO/TC9ZX) so that you do not need to re-run the model.

If you want to run code from within the individual R-files, please set the working directory to the `scripts`-folder so that all the code is run correctly.
6 changes: 5 additions & 1 deletion init_query.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@ SELECT
phone.cldf_name AS Value,
1000*phone.duration AS Duration,
word.cldf_languageReference AS Language,
language.family AS Family,
word.speaker_id AS Speaker,
CASE
WHEN phone.cldf_id in (select cldf_id FROM utterance_initials) THEN 1 ELSE 0
END utt_initial, -- whether or not the phone is in utterance initial position
CASE
WHEN phone.cldf_id in (select cldf_id FROM word_initials) THEN 1 ELSE 0
END word_initial, -- whether or not the phone is in word initial position
sound.cldf_cltsReference AS CLTS,
-- normalized word length:
ROUND(((phones_per_word.num_phones - sd_num_phones.avg_num_phones) / sd_num_phones.num_phones), 3) AS z_num_phones,
-- normalized speech rate of the utterance:
Expand All @@ -20,7 +22,8 @@ SELECT
FROM
"phones.csv" AS phone,
"words.csv" AS word, -- word-level metadata joined ON phone.wd_id = word.cldf_id
ParameterTable AS sound -- sound-level metadata joined ON phone.cldf_parameterReference = sound.cldf_id
ParameterTable AS sound, -- sound-level metadata joined ON phone.cldf_parameterReference = sound.cldf_id
LanguageTable AS language
LEFT JOIN
(
SELECT
Expand Down Expand Up @@ -117,6 +120,7 @@ ON
WHERE
phone.wd_id = word.cldf_id AND
phone.cldf_parameterReference = sound.cldf_id AND
word.cldf_languageReference = language.cldf_id AND
-- We only consider non-long, pulmonic consonants ...
sound.cldf_cltsReference LIKE '%_consonant' AND
sound.cldf_cltsReference NOT LIKE '%long%' AND
Expand Down
Binary file modified misc/model_chart.odg
Binary file not shown.
Binary file modified misc/model_chart.pdf
Binary file not shown.
Binary file modified misc/workflow.pdf
Binary file not shown.
650 changes: 489 additions & 161 deletions misc/workflow.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 8 additions & 8 deletions scripts/00_setup.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@ install.packages('groundhog', repos='https://ftp.fau.de/cran/')
library('groundhog')
set.groundhog.folder('./R_groundhog/')
pkgs <- c('brms','viridis', 'readr', 'posterior', 'dplyr', 'ggplot2', 'ggdist',
'gghalves', 'patchwork', 'bayesplot', 'tidybayes', 'xtable', 'ggrepel',
'rnaturalearth', 'rnaturalearthdata', 'tidyr', 'stringr', 'ape',
'geostan', 'geodist', 'gridExtra')
groundhog.library(pkgs, '2023-04-01', force.install=TRUE)
groundhog.library('github::stan-dev/cmdstanr', '2023-08-01', force.install=TRUE)
'gghalves', 'patchwork', 'bayesplot', 'tidybayes', 'xtable', 'tidyr',
'ggrepel', 'rnaturalearth', 'rnaturalearthdata', 'stringr',
'geostan', 'geodist', 'gridExtra', 'extraDistr')
groundhog.library(pkgs, '2024-05-01', force.install=TRUE)
groundhog.library('github::stan-dev/cmdstanr', '2024-05-01', force.install=TRUE)

source('01_DataExplorations.R')
source('02_PriorDistributions.R')
source('03_PriorModel.R')

# The models and posterior predictions will take a long time to compile,
# feel free to run, or to use our provided models via OSF.
# source('04_FinalModel.R')
# source('04_Model.R')
# source('05_ModelConvergence.R')
# source('test_moran.R')

source('06_BayesViz.R')
source('07_utils.R')
source('07_moran.R')
source('08_utils.R')
15 changes: 6 additions & 9 deletions scripts/01_DataExplorations.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,13 @@ library(viridis)
### Data ###
###################################
data <- read_tsv('data.tsv') %>% mutate(
initial=ifelse(
utt_initial==1, "utterance-initial", ifelse(
word_initial==1, "word-initial", "other"
)))
initial=ifelse(utt_initial==1, "utterance-initial", ifelse(word_initial==1, "word-initial", "other")))

langs <- data %>% group_by(Language) %>% count() %>% arrange(Language)
phons <- data %>% group_by(Language, Value) %>% count() %>% arrange(n)
cons <- data %>% group_by(sound_class, utt_initial) %>% count() %>% arrange(n)
cons <- data %>% group_by(CLTS, initial) %>% count() %>% arrange(n)
averages <- data %>% group_by(initial) %>% summarize(avg=mean(Duration))
grand_avg <- mean(data$Duration)

################################################
##### Distribution Plots #####
Expand Down Expand Up @@ -49,8 +48,7 @@ non_tens <- data %>%
ylab("Occurrences") + xlab("duration on log-axis")

distr <- (tens / non_tens) + plot_layout(guides="collect") & theme(legend.position="bottom")
ggsave("images/dataExpl_distr.png", distr, scale=1,
width=2000, height=2000, units="px")
ggsave("images/dataExpl_distr.png", distr, scale=1, width=2000, height=2000, units="px")

dens_all <- data %>%
ggplot(aes(x=initial, y=Duration, color=initial, fill=initial)) +
Expand All @@ -65,8 +63,7 @@ dens_all <- data %>%
scale_x_discrete(labels=c("non-initial", "utterance-initial", "word-initial"))+
xlab("") + theme(legend.position="none")

ggsave("images/dataExpl_dens.png", dens_all, scale=1,
width=2000, height=1450, units="px")
ggsave("images/dataExpl_dens.png", dens_all, scale=1, width=2000, height=1450, units="px")

################################################
##### Between-Languages Plots #####
Expand Down
72 changes: 27 additions & 45 deletions scripts/02_PriorDistributions.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ library(ggdist)
library(patchwork)
library(viridis)

n=1e5
set.seed(42)
n <- 1e5

#########################################
### influence of predictors ###
Expand All @@ -19,76 +19,58 @@ predictors <- tibble(x=c(rnorm(n, 0, 0.3))) %>%
scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
scale_y_continuous(breaks=NULL) +
ylab("Density of values") +
scale_x_continuous(name="Predictor values on log-scale",
limits=c(-1.25, 1.25),
breaks=seq(from=-1, to=1, by=0.5)) +
theme(legend.position="none",
plot.title=element_text(size=14)) +
scale_x_continuous(
name="Predictor values on log-scale",
limits=c(-1.25, 1.25),
breaks=seq(from=-1, to=1, by=0.5)
) +
theme(legend.position="none", plot.title=element_text(size=14)) +
labs(title="β ~ Normal(0, 0.3)")

#########################################
### Intercept priors ###
#########################################

# values for intercept
int_vals <- c(rnorm(n, mean=4.5, sd=0.1))
int_vals %>% tibble() %>% ggplot(aes(x=.)) + geom_density()

# sigma for intercept
sigma1 <- rexp(n, rate=12)
sigma1 %>% tibble() %>% ggplot(aes(x=.))+ geom_density()

# combination of both
sample_ints <- tibble(x=c(rlnorm(n,
meanlog=int_vals,
sdlog=sigma1))) %>%
sample_ints <- tibble(x=c(exp(rnorm(n, mean=4.4, sd=0.05)))) %>%
mutate(group='alpha%~% logn( Normal(4.5, 0.1), exp(12) )') %>%
ggplot(aes(fill=group)) +
geom_density(aes(x=x)) +
scale_x_log10(limits= c(15, 320),
breaks=c(10, 20, 30, 50, 100, 200, 300),
name="Prior distribution for the intercept") +
scale_y_continuous(breaks=NULL,
name="Density of values") +
scale_x_log10(
limits= c(15, 320),
breaks=c(10, 20, 30, 50, 100, 200, 300),
name="Prior distribution for the intercept"
) +
scale_y_continuous(breaks=NULL, name="Density of values") +
scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
theme(legend.position="none",
plot.title=element_text(size=14)) +
labs(title="α ~ logn(Normal(4.5, 0.1), Exp(12))")
theme(legend.position="none", plot.title=element_text(size=14)) +
labs(title="α ~ Normal(4.4, 0.05)")

#########################################
### sigma2 ###
### sd_var ###
#########################################
sigma2 <- rexp(n, rate=12) %>%
sd_var <- rgamma(n, 3, 30) %>%
tibble() %>%
mutate(group='sigma%~% exp(12)') %>%
mutate(group='Gamma') %>%
ggplot(aes(x=.)) +
geom_density(aes(fill=group)) +
scale_y_continuous(breaks=NULL,
name="Density of values") +
scale_x_continuous(breaks=seq(from=0, to=1.2, by=0.2),
limits=c(0, 1.2),
name="Standard deviation of varying intercepts on log-scale") +
scale_y_continuous(breaks=NULL, name="Density of values") +
scale_x_continuous(name="Standard deviation of varying intercepts on log-scale") +
scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
theme(legend.position="none",
plot.title=element_text(size=14)) +
labs(title="σ ~ Exp(12)")
theme(legend.position="none", plot.title=element_text(size=14)) +
labs(title="γ ~ Gamma(3, 30)")

#########################################
### varying slopes matrix ###
#########################################

lkjcorr <- rlkjcorr_marginal(n, K=2, eta=5) %>% tibble(x=.) %>%
mutate(group='R%~% LKJcorr(5)') %>%
ggplot(aes(x=x, fill=group)) +
geom_density() +
scale_y_continuous(breaks=NULL) +
scale_x_continuous(name="Correlation of varying intercepts and slopes",
breaks=c(-1, -0.5, 0, 0.5, 1)) +
scale_x_continuous(name="Correlation of varying intercepts and slopes", breaks=c(-1, -0.5, 0, 0.5, 1)) +
scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
theme(legend.position="none",
plot.title=element_text(size=14)) +
theme(legend.position="none", plot.title=element_text(size=14)) +
ylab("Density of values") +
labs(title="R ~ LKJcorr(5)")

all_priors <- (sample_ints + predictors) / (sigma2 + lkjcorr)
ggsave("images/prior_all.png", all_priors, scale=1)
all_priors <- (sample_ints + predictors) / (sd_var + lkjcorr)
ggsave("images/prior_all.png", all_priors, scale=1, width=2500, height=2500, units='px')
Loading

0 comments on commit e0dcba8

Please sign in to comment.