Skip to content

Commit

Permalink
Merge pull request #8 from FredericBlum/control
Browse files Browse the repository at this point in the history
v0.2rc1
  • Loading branch information
FredericBlum authored May 15, 2024
2 parents 804d7ae + 673e495 commit e0dcba8
Show file tree
Hide file tree
Showing 70 changed files with 1,103 additions and 26,487 deletions.
11 changes: 6 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
*Rproj
*.DS_Store
*Rproj
.Rproj.user
scripts/models/*.rds
doreco/
clts/
scripts/data.tsv
scripts/data_nofilter.tsv
scripts/models/*.rds
scripts/R_groundhog/
scripts/groundhog_libraries_2023-07-17/
doreco/
clts/
scripts/vowels.tsv
*.sh
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ You can now run the download and the conversion to CLDF. While downloading, inse

```bash
cldfbench download cldfbench_doreco.py
cldfbench makecldf cldfbench_doreco.py --glottolog-version v4.8
cldfbench makecldf cldfbench_doreco.py --glottolog-version v5.0
```

The last step created a SQLite database out of the CLDF data, to quickly access all the data. Once you have the SQLite database ready, you need to install the pre-written views. Run SQL:
Expand Down Expand Up @@ -60,6 +60,6 @@ Now you are in a folder full of R-Scripts. The first script `00_setup.R` include
Rscript 00_setup.R
```

Please note that the two scripts which run the brms-model are currently commented out. This has a simple reason: Running them takes several days. If you want to re-run the model, please go to the respective script and run it from there, or un-comment the line in `00_setup.R`. We provide the fitted model within our OSF-repository [here](https://osf.io/tc9zx/?view_only=a658d5da64a7431f87d993b71a8e184c) so that you do not need to re-run the model.
Please note that the two scripts which run the brms-model are currently commented out. This has a simple reason: Running them takes several days. If you want to re-run the model, please go to the respective script and run it from there, or un-comment the line in `00_setup.R`. We provide the fitted model within our OSF-repository [here](https://doi.org/10.17605/OSF.IO/TC9ZX) so that you do not need to re-run the model.

If you want to run code from within the individual R-files, please set the working directory to the `scripts`-folder so that all the code is run correctly.
6 changes: 5 additions & 1 deletion init_query.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@ SELECT
phone.cldf_name AS Value,
1000*phone.duration AS Duration,
word.cldf_languageReference AS Language,
language.family AS Family,
word.speaker_id AS Speaker,
CASE
WHEN phone.cldf_id in (select cldf_id FROM utterance_initials) THEN 1 ELSE 0
END utt_initial, -- whether or not the phone is in utterance initial position
CASE
WHEN phone.cldf_id in (select cldf_id FROM word_initials) THEN 1 ELSE 0
END word_initial, -- whether or not the phone is in word initial position
sound.cldf_cltsReference AS CLTS,
-- normalized word length:
ROUND(((phones_per_word.num_phones - sd_num_phones.avg_num_phones) / sd_num_phones.num_phones), 3) AS z_num_phones,
-- normalized speech rate of the utterance:
Expand All @@ -20,7 +22,8 @@ SELECT
FROM
"phones.csv" AS phone,
"words.csv" AS word, -- word-level metadata joined ON phone.wd_id = word.cldf_id
ParameterTable AS sound -- sound-level metadata joined ON phone.cldf_parameterReference = sound.cldf_id
ParameterTable AS sound, -- sound-level metadata joined ON phone.cldf_parameterReference = sound.cldf_id
LanguageTable AS language
LEFT JOIN
(
SELECT
Expand Down Expand Up @@ -117,6 +120,7 @@ ON
WHERE
phone.wd_id = word.cldf_id AND
phone.cldf_parameterReference = sound.cldf_id AND
word.cldf_languageReference = language.cldf_id AND
-- We only consider non-long, pulmonic consonants ...
sound.cldf_cltsReference LIKE '%_consonant' AND
sound.cldf_cltsReference NOT LIKE '%long%' AND
Expand Down
Binary file modified misc/model_chart.odg
Binary file not shown.
Binary file modified misc/model_chart.pdf
Binary file not shown.
Binary file modified misc/workflow.pdf
Binary file not shown.
650 changes: 489 additions & 161 deletions misc/workflow.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 8 additions & 8 deletions scripts/00_setup.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@ install.packages('groundhog', repos='https://ftp.fau.de/cran/')
library('groundhog')
set.groundhog.folder('./R_groundhog/')
pkgs <- c('brms','viridis', 'readr', 'posterior', 'dplyr', 'ggplot2', 'ggdist',
'gghalves', 'patchwork', 'bayesplot', 'tidybayes', 'xtable', 'ggrepel',
'rnaturalearth', 'rnaturalearthdata', 'tidyr', 'stringr', 'ape',
'geostan', 'geodist', 'gridExtra')
groundhog.library(pkgs, '2023-04-01', force.install=TRUE)
groundhog.library('github::stan-dev/cmdstanr', '2023-08-01', force.install=TRUE)
'gghalves', 'patchwork', 'bayesplot', 'tidybayes', 'xtable', 'tidyr',
'ggrepel', 'rnaturalearth', 'rnaturalearthdata', 'stringr',
'geostan', 'geodist', 'gridExtra', 'extraDistr')
groundhog.library(pkgs, '2024-05-01', force.install=TRUE)
groundhog.library('github::stan-dev/cmdstanr', '2024-05-01', force.install=TRUE)

source('01_DataExplorations.R')
source('02_PriorDistributions.R')
source('03_PriorModel.R')

# The models and posterior predictions will take a long time to compile,
# feel free to run, or to use our provided models via OSF.
# source('04_FinalModel.R')
# source('04_Model.R')
# source('05_ModelConvergence.R')
# source('test_moran.R')

source('06_BayesViz.R')
source('07_utils.R')
source('07_moran.R')
source('08_utils.R')
15 changes: 6 additions & 9 deletions scripts/01_DataExplorations.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,13 @@ library(viridis)
### Data ###
###################################
data <- read_tsv('data.tsv') %>% mutate(
initial=ifelse(
utt_initial==1, "utterance-initial", ifelse(
word_initial==1, "word-initial", "other"
)))
initial=ifelse(utt_initial==1, "utterance-initial", ifelse(word_initial==1, "word-initial", "other")))

langs <- data %>% group_by(Language) %>% count() %>% arrange(Language)
phons <- data %>% group_by(Language, Value) %>% count() %>% arrange(n)
cons <- data %>% group_by(sound_class, utt_initial) %>% count() %>% arrange(n)
cons <- data %>% group_by(CLTS, initial) %>% count() %>% arrange(n)
averages <- data %>% group_by(initial) %>% summarize(avg=mean(Duration))
grand_avg <- mean(data$Duration)

################################################
##### Distribution Plots #####
Expand Down Expand Up @@ -49,8 +48,7 @@ non_tens <- data %>%
ylab("Occurrences") + xlab("duration on log-axis")

distr <- (tens / non_tens) + plot_layout(guides="collect") & theme(legend.position="bottom")
ggsave("images/dataExpl_distr.png", distr, scale=1,
width=2000, height=2000, units="px")
ggsave("images/dataExpl_distr.png", distr, scale=1, width=2000, height=2000, units="px")

dens_all <- data %>%
ggplot(aes(x=initial, y=Duration, color=initial, fill=initial)) +
Expand All @@ -65,8 +63,7 @@ dens_all <- data %>%
scale_x_discrete(labels=c("non-initial", "utterance-initial", "word-initial"))+
xlab("") + theme(legend.position="none")

ggsave("images/dataExpl_dens.png", dens_all, scale=1,
width=2000, height=1450, units="px")
ggsave("images/dataExpl_dens.png", dens_all, scale=1, width=2000, height=1450, units="px")

################################################
##### Between-Languages Plots #####
Expand Down
72 changes: 27 additions & 45 deletions scripts/02_PriorDistributions.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ library(ggdist)
library(patchwork)
library(viridis)

n=1e5
set.seed(42)
n <- 1e5

#########################################
### influence of predictors ###
Expand All @@ -19,76 +19,58 @@ predictors <- tibble(x=c(rnorm(n, 0, 0.3))) %>%
scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
scale_y_continuous(breaks=NULL) +
ylab("Density of values") +
scale_x_continuous(name="Predictor values on log-scale",
limits=c(-1.25, 1.25),
breaks=seq(from=-1, to=1, by=0.5)) +
theme(legend.position="none",
plot.title=element_text(size=14)) +
scale_x_continuous(
name="Predictor values on log-scale",
limits=c(-1.25, 1.25),
breaks=seq(from=-1, to=1, by=0.5)
) +
theme(legend.position="none", plot.title=element_text(size=14)) +
labs(title="β ~ Normal(0, 0.3)")

#########################################
### Intercept priors ###
#########################################

# values for intercept
int_vals <- c(rnorm(n, mean=4.5, sd=0.1))
int_vals %>% tibble() %>% ggplot(aes(x=.)) + geom_density()

# sigma for intercept
sigma1 <- rexp(n, rate=12)
sigma1 %>% tibble() %>% ggplot(aes(x=.))+ geom_density()

# combination of both
sample_ints <- tibble(x=c(rlnorm(n,
meanlog=int_vals,
sdlog=sigma1))) %>%
sample_ints <- tibble(x=c(exp(rnorm(n, mean=4.4, sd=0.05)))) %>%
mutate(group='alpha%~% logn( Normal(4.5, 0.1), exp(12) )') %>%
ggplot(aes(fill=group)) +
geom_density(aes(x=x)) +
scale_x_log10(limits= c(15, 320),
breaks=c(10, 20, 30, 50, 100, 200, 300),
name="Prior distribution for the intercept") +
scale_y_continuous(breaks=NULL,
name="Density of values") +
scale_x_log10(
limits= c(15, 320),
breaks=c(10, 20, 30, 50, 100, 200, 300),
name="Prior distribution for the intercept"
) +
scale_y_continuous(breaks=NULL, name="Density of values") +
scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
theme(legend.position="none",
plot.title=element_text(size=14)) +
labs(title="α ~ logn(Normal(4.5, 0.1), Exp(12))")
theme(legend.position="none", plot.title=element_text(size=14)) +
labs(title="α ~ Normal(4.4, 0.05)")

#########################################
### sigma2 ###
### sd_var ###
#########################################
sigma2 <- rexp(n, rate=12) %>%
sd_var <- rgamma(n, 3, 30) %>%
tibble() %>%
mutate(group='sigma%~% exp(12)') %>%
mutate(group='Gamma') %>%
ggplot(aes(x=.)) +
geom_density(aes(fill=group)) +
scale_y_continuous(breaks=NULL,
name="Density of values") +
scale_x_continuous(breaks=seq(from=0, to=1.2, by=0.2),
limits=c(0, 1.2),
name="Standard deviation of varying intercepts on log-scale") +
scale_y_continuous(breaks=NULL, name="Density of values") +
scale_x_continuous(name="Standard deviation of varying intercepts on log-scale") +
scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
theme(legend.position="none",
plot.title=element_text(size=14)) +
labs(title="σ ~ Exp(12)")
theme(legend.position="none", plot.title=element_text(size=14)) +
labs(title="γ ~ Gamma(3, 30)")

#########################################
### varying slopes matrix ###
#########################################

lkjcorr <- rlkjcorr_marginal(n, K=2, eta=5) %>% tibble(x=.) %>%
mutate(group='R%~% LKJcorr(5)') %>%
ggplot(aes(x=x, fill=group)) +
geom_density() +
scale_y_continuous(breaks=NULL) +
scale_x_continuous(name="Correlation of varying intercepts and slopes",
breaks=c(-1, -0.5, 0, 0.5, 1)) +
scale_x_continuous(name="Correlation of varying intercepts and slopes", breaks=c(-1, -0.5, 0, 0.5, 1)) +
scale_fill_viridis(discrete=T, alpha=0.7, end=0.7) +
theme(legend.position="none",
plot.title=element_text(size=14)) +
theme(legend.position="none", plot.title=element_text(size=14)) +
ylab("Density of values") +
labs(title="R ~ LKJcorr(5)")

all_priors <- (sample_ints + predictors) / (sigma2 + lkjcorr)
ggsave("images/prior_all.png", all_priors, scale=1)
all_priors <- (sample_ints + predictors) / (sd_var + lkjcorr)
ggsave("images/prior_all.png", all_priors, scale=1, width=2500, height=2500, units='px')
Loading

0 comments on commit e0dcba8

Please sign in to comment.