From 5b2e2bd1d162378d053da232e02446ab1d87d67e Mon Sep 17 00:00:00 2001 From: Gede Primahadi Wijaya Rajeg Date: Sat, 6 Jul 2024 15:00:24 +0800 Subject: [PATCH] Re-generate CLDF with notes. --- .Rhistory | 608 +++++++++++----------- .zenodo.json | 14 +- CONTRIBUTORS.md | 3 +- LICENSE | 7 +- README.md | 16 +- cldf/.transcription-report.json | 16 +- cldf/README.md | 4 +- cldf/cldf-metadata.json | 4 +- cldf/lingpy-rcParams.json | 2 +- metadata.json | 2 +- tutorial-step-notes-to-create-the-cldf.sh | 2 +- 11 files changed, 343 insertions(+), 335 deletions(-) diff --git a/.Rhistory b/.Rhistory index 80b29a4..333f7e9 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,15 +1,34 @@ -ordering = NULL) -ipa$strings <- ipa$strings |> -rename(Mentawai = originals, -IPA = transliterated) |> -as_tibble() -ipa$strings -ortho$strings -mentawai1853 +# join the English translation of the Dutch gloss with the main word list table +mentawai1853 <- mentawai1853 |> +left_join(dutch) |> +mutate(English_DeepL2 = if_else(untranslated, English_DeepL, English_DeepL2)) |> +mutate(eng_diff = English_DeepL != English_DeepL2) +# this is a testing code to check which original translation is different from the DeepL mentawai1853 |> -left_join(ortho$strings |> -select(-tokenized)) -ortho$strings +mutate(eng_diff = English_DeepL != English_DeepL2) |> +filter(eng_diff) |> +select(English_DeepL, English_DeepL2, Dutch) |> +as.data.frame() +# read the Concepticon mapping file +concepticon <- read_tsv("data/mentawai-gloss-mapped-to-edit_1853-270.tsv") |> +filter(GLOSS != "#") |> +select(-NUMBER) |> +distinct() |> +mutate(across(where(is.character), ~replace_na(., ""))) +# join the Concepticon mapping with the main word list table +mentawai1853 <- mentawai1853 |> +left_join(concepticon |> +rename(English_DeepL2 = GLOSS)) +# exclude the irrelevant columns +mentawai1853 <- mentawai1853 |> +select(-untranslated, -eng_diff, -English_DeepL) |> +select(ID, Mentawai, Dutch, English = English_DeepL2, CONCEPTICON_GLOSS, CONCEPTICON_ID, everything()) +# save the main word list data containing the English translation and Concepticon Mapping +## the saved files still combine the three categories of word list +mentawai1853 |> +write_tsv("data/mentawai1853.tsv") +mentawai1853 |> +write_rds("data/mentawai1853.rds") ortho <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", transliterate = "Replacement", @@ -21,44 +40,43 @@ rename(Mentawai = originals, Commons = transliterated) |> as_tibble() |> mutate(ID = mentawai1853$ID) -ipa <- qlcData::tokenize(mentawai1853$Mentawai, -profile = "data/ortho-profile-mentawai1853.tsv", -transliterate = "Phoneme", -sep.replace = "_", -regex = TRUE, -ordering = NULL) -ipa$strings <- ipa$strings |> +ortho$strings +ortho$strings |> rename(Mentawai = originals, -IPA = transliterated) |> +Commons = transliterated) |> as_tibble() |> -mutate(ID = mentawai1853$ID) -ipa$strings -mentawai1853 |> -left_join(ortho$strings |> -select(-tokenized, -Mentawai)) |> -left_join(ipa$strings |> -select(-tokenized, -Mentawai)) -mentawai1853 |> -left_join(ortho$strings |> -select(-tokenized, -Mentawai), -by = join_by(ID)) |> -left_join(ipa$strings |> -select(-tokenized, -Mentawai), -by = join_by(ID)) -mentawai1853 |> -left_join(ortho$strings |> -select(-tokenized, -Mentawai), -by = join_by(ID)) |> -left_join(ipa$strings |> -select(-tokenized, -Mentawai), -by = join_by(ID)) |> -select(ID, Mentawai, Commons, IPA, Dutch, English, everything()) +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised common transcription +mutate(CommonsNotSegmented = str_replace_all(Commons, " ", "")) ortho <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", transliterate = "Replacement", sep.replace = "_", regex = TRUE, ordering = NULL) +ortho$strings |> +rename(Mentawai = originals, +Commons = transliterated) |> +as_tibble() |> +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised common transcription +mutate(CommonsNotSegmented = str_replace_all(Commons, " ", "")) +ortho$strings |> +rename(Mentawai = originals, +Commons = transliterated) |> +as_tibble() |> +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised common transcription +mutate(CommonsNotSegmented = str_replace_all(Commons, " ", ""), +CommonsNotSegmented = str_replace_all(CommonsNonSegmented, "_", " ")) +ortho$strings |> +rename(Mentawai = originals, +Commons = transliterated) |> +as_tibble() |> +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised common transcription +mutate(CommonsNotSegmented = str_replace_all(Commons, " ", ""), +CommonsNotSegmented = str_replace_all(CommonsNotSegmented, "_", " ")) ortho <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", transliterate = "Replacement", @@ -69,100 +87,60 @@ ortho$strings <- ortho$strings |> rename(Mentawai = originals, Commons = transliterated) |> as_tibble() |> -mutate(ID = mentawai1853$ID) -ortho$strings -ortho$strings |> filter(str_detect(Mentawai, "ng")) -ortho <- qlcData::tokenize(mentawai1853$Mentawai, +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised common transcription +mutate(CommonsNotSegmented = str_replace_all(Commons, " ", ""), +CommonsNotSegmented = str_replace_all(CommonsNotSegmented, "_", " ")) +ipa <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", -transliterate = "Replacement", +transliterate = "Phoneme", sep.replace = "_", regex = TRUE, ordering = NULL) -ortho$strings <- ortho$strings |> +ipa$strings |> rename(Mentawai = originals, -Commons = transliterated) |> +IPA = transliterated) |> as_tibble() |> -mutate(ID = mentawai1853$ID) -ortho$strings |> filter(str_detect(Mentawai, "ng")) +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised IPA transcription +mutate(IPAnotSegmented = str_replace_all(IPA, " ", ""), +IPAnotSegmented = str_replace_all(IPAnotSegmented, "_", " ")) ortho <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", transliterate = "Replacement", sep.replace = "_", regex = TRUE, ordering = NULL) -ortho$strings <- ortho$strings |> +ortho$strings |> rename(Mentawai = originals, Commons = transliterated) |> as_tibble() |> mutate(ID = mentawai1853$ID) -ortho$strings |> filter(str_detect(Mentawai, "ng")) ipa <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", transliterate = "Phoneme", sep.replace = "_", regex = TRUE, ordering = NULL) -ipa$strings <- ipa$strings |> +ipa$strings |> rename(Mentawai = originals, IPA = transliterated) |> as_tibble() |> mutate(ID = mentawai1853$ID) -ipa$strings |> filter(str_detect(Mentawai, "ng")) -source("codes/mentawai1853_00-gsheet.R") -# read the Google Sheet file -mentawai1853 <- read_sheet(mentawai1853_sheet) |> -mutate(ID = row_number()) # give entry ID -# translate the Dutch gloss with deeplr <- CODE to translate the Dutch into English using DeepL API -# dutch <- mentawai1853 |> -# select(ID, Dutch) |> -# filter(!is.na(Dutch)) -# dutch <- dutch |> -# mutate(English_DeepL2 = deeplr::translate2(text = Dutch, target_lang = "EN", source_lang = "NL", auth_key = deeplauthkey)) -# write_tsv(dutch, "data/dutch-translated-to-english-with-deepl.tsv") <- This file is manually edited together with the edited Concepticon mapping file -# read the English translation of the Dutch gloss -dutch <- read_tsv("data/dutch-translated-to-english-with-deepl.tsv") |> -mutate(untranslated = Dutch == English_DeepL2) -# join the English translation of the Dutch gloss with the main word list table -mentawai1853 <- mentawai1853 |> -left_join(dutch) |> -mutate(English_DeepL2 = if_else(untranslated, English_DeepL, English_DeepL2)) |> -mutate(eng_diff = English_DeepL != English_DeepL2) -# this is a testing code to check which original translation is different from the DeepL -mentawai1853 |> -mutate(eng_diff = English_DeepL != English_DeepL2) |> -filter(eng_diff) |> -select(English_DeepL, English_DeepL2, Dutch) |> -as.data.frame() -# read the Concepticon mapping file -concepticon <- read_tsv("data/mentawai-gloss-mapped-to-edit_1853-270.tsv") |> -filter(GLOSS != "#") |> -select(-NUMBER) |> -distinct() |> -mutate(across(where(is.character), ~replace_na(., ""))) -# join the Concepticon mapping with the main word list table -mentawai1853 <- mentawai1853 |> -left_join(concepticon |> -rename(English_DeepL2 = GLOSS)) -# exclude the irrelevant columns -mentawai1853 <- mentawai1853 |> -select(-untranslated, -eng_diff, -English_DeepL) |> -select(ID, Mentawai, Dutch, English = English_DeepL2, CONCEPTICON_GLOSS, CONCEPTICON_ID, everything()) -# save the main word list data containing the English translation and Concepticon Mapping -## the saved files still combine the three categories of word list -mentawai1853 |> -write_tsv("data/mentawai1853.tsv") -mentawai1853 |> -write_rds("data/mentawai1853.rds") -# save the comparison table in p. 434 into .tsv -# read_tsv("data/vrosenberg1853p434.csv") |> -# write_tsv("data/vrosenberg1853p434.tsv") -# run orthography profile ===== -# write the profile and manually edit it. -# mentawai1853$Mentawai |> -# qlcData::write.profile(normalize = "NFC", -# editing = TRUE, -# info = TRUE, -# file.out = "data/ortho-profile-mentawai1853.tsv") +ipa <- qlcData::tokenize(mentawai1853$Mentawai, +profile = "data/ortho-profile-mentawai1853.tsv", +transliterate = "Phoneme", +sep.replace = "_", +regex = TRUE, +ordering = NULL) +ipa$strings |> +rename(Mentawai = originals, +IPA = transliterated) |> +as_tibble() |> +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised IPA transcription +mutate(IPAnotSegmented = str_replace_all(IPA, " ", ""), +IPAnotSegmented = str_replace_all(IPAnotSegmented, "_", " ")) ortho <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", transliterate = "Replacement", @@ -173,7 +151,10 @@ ortho$strings <- ortho$strings |> rename(Mentawai = originals, Commons = transliterated) |> as_tibble() |> -mutate(ID = mentawai1853$ID) +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised common transcription +mutate(CommonsNotSegmented = str_replace_all(Commons, " ", ""), +CommonsNotSegmented = str_replace_all(CommonsNotSegmented, "_", " ")) ipa <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", transliterate = "Phoneme", @@ -184,72 +165,46 @@ ipa$strings <- ipa$strings |> rename(Mentawai = originals, IPA = transliterated) |> as_tibble() |> -mutate(ID = mentawai1853$ID) -mentawai1853 <- mentawai1853 |> -left_join(ortho$strings |> -select(-tokenized, -Mentawai), -by = join_by(ID)) |> -left_join(ipa$strings |> -select(-tokenized, -Mentawai), -by = join_by(ID)) |> -select(ID, Mentawai, Commons, IPA, Dutch, English, everything()) +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised IPA transcription +mutate(IPAnotSegmented = str_replace_all(IPA, " ", ""), +IPAnotSegmented = str_replace_all(IPAnotSegmented, "_", " ")) ipa$strings -ortho$strings -mentawai1853 -# split the comparison dialect word list and men's names and save the respective files -# mentawai1853 <- read_rds("data/mentawai1853.rds") -mentawai1853 |> -filter(Category == "word list") |> -write_tsv("data/mentawai1853.tsv") -mentawai1853 |> -filter(Category == "word list") |> -write_tsv("data/mentawai1853.rds") -mentawai1853 |> -filter(Category == "dialect-comparison") |> -write_tsv("data/mentawai1853-dialect-comparison.tsv") -mentawai1853 |> -filter(Category == "men's names") |> -write_tsv("data/mentawai1853-men-names.tsv") -# get the Google Sheet link -source("codes/mentawai1853_00-gsheet.R") -# read the Google Sheet file -mentawai1853 <- read_sheet(mentawai1853_sheet) |> -mutate(ID = row_number()) # give entry ID -# read the English translation of the Dutch gloss -dutch <- read_tsv("data/dutch-translated-to-english-with-deepl.tsv") |> -mutate(untranslated = Dutch == English_DeepL2) -# join the English translation of the Dutch gloss with the main word list table -mentawai1853 <- mentawai1853 |> -left_join(dutch) |> -mutate(English_DeepL2 = if_else(untranslated, English_DeepL, English_DeepL2)) |> -mutate(eng_diff = English_DeepL != English_DeepL2) -# this is a testing code to check which original translation is different from the DeepL -mentawai1853 |> -mutate(eng_diff = English_DeepL != English_DeepL2) |> -filter(eng_diff) |> -select(English_DeepL, English_DeepL2, Dutch) |> -as.data.frame() -# read the Concepticon mapping file -concepticon <- read_tsv("data/mentawai-gloss-mapped-to-edit_1853-270.tsv") |> -filter(GLOSS != "#") |> -select(-NUMBER) |> -distinct() |> -mutate(across(where(is.character), ~replace_na(., ""))) -# join the Concepticon mapping with the main word list table -mentawai1853 <- mentawai1853 |> -left_join(concepticon |> -rename(English_DeepL2 = GLOSS)) -# exclude the irrelevant columns -mentawai1853 <- mentawai1853 |> -select(-untranslated, -eng_diff, -English_DeepL) |> -select(ID, Mentawai, Dutch, English = English_DeepL2, CONCEPTICON_GLOSS, CONCEPTICON_ID, everything()) -# save the main word list data containing the English translation and Concepticon Mapping -## the saved files still combine the three categories of word list -mentawai1853 |> -write_tsv("data/mentawai1853.tsv") -mentawai1853 |> -write_rds("data/mentawai1853.rds") -mentawai1853 +ipa <- qlcData::tokenize(mentawai1853$Mentawai, +profile = "data/ortho-profile-mentawai1853.tsv", +transliterate = "Phoneme", +sep.replace = "_", +regex = TRUE, +ordering = NULL) +ipa$strings |> +rename(Mentawai = originals, +IPA = transliterated) |> +as_tibble() |> +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised IPA transcription +mutate(IPAnotSegmented = str_replace_all(IPA, " ", "")) +ipa <- qlcData::tokenize(mentawai1853$Mentawai, +profile = "data/ortho-profile-mentawai1853.tsv", +transliterate = "Phoneme", +sep.replace = "_", +regex = TRUE, +ordering = NULL) +ipa$strings |> +rename(Mentawai = originals, +IPA = transliterated) |> +as_tibble() +qlcData::tokenize(mentawai1853$Mentawai, +profile = "data/ortho-profile-mentawai1853.tsv", +transliterate = "Phoneme", +sep.replace = "_", +regex = TRUE, +ordering = NULL) +qlcData::tokenize(mentawai1853$Mentawai, +profile = "data/ortho-profile-mentawai1853.tsv", +transliterate = "Replacement", +sep.replace = "_", +regex = TRUE, +ordering = NULL) ortho <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", transliterate = "Replacement", @@ -260,129 +215,37 @@ ortho$strings <- ortho$strings |> rename(Mentawai = originals, Commons = transliterated) |> as_tibble() |> -mutate(ID = mentawai1853$ID) +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised common transcription +mutate(CommonsNotSegmented = str_replace_all(Commons, " ", ""), +CommonsNotSegmented = str_replace_all(CommonsNotSegmented, "_", " ")) ipa <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", transliterate = "Phoneme", sep.replace = "_", regex = TRUE, ordering = NULL) -ipa$strings <- ipa$strings |> +ipa$strings +ipa <- qlcData::tokenize(mentawai1853$Mentawai, +profile = "data/ortho-profile-mentawai1853.tsv", +transliterate = "Phoneme", +sep.replace = "#", +regex = TRUE, +ordering = NULL) +ipa$strings |> rename(Mentawai = originals, IPA = transliterated) |> as_tibble() |> mutate(ID = mentawai1853$ID) -mentawai1853 <- mentawai1853 |> -left_join(ortho$strings |> -select(-tokenized, -Mentawai), -by = join_by(ID)) |> -left_join(ipa$strings |> -select(-tokenized, -Mentawai), -by = join_by(ID)) |> -select(ID, Mentawai, Commons, IPA, Dutch, English, everything()) -# split the comparison dialect word list and men's names and save the respective files -# mentawai1853 <- read_rds("data/mentawai1853.rds") -mentawai1853 |> -filter(Category == "word list") |> -write_tsv("data/mentawai1853.tsv") -mentawai1853 |> -filter(Category == "word list") |> -write_tsv("data/mentawai1853.rds") -mentawai1853 |> -filter(Category == "dialect-comparison") |> -write_tsv("data/mentawai1853-dialect-comparison.tsv") -mentawai1853 |> -filter(Category == "men's names") |> -write_tsv("data/mentawai1853-men-names.tsv") -library(tidyverse) -library(googlesheets4) -library(deeplr) -library(googleLanguageR) -# get the Google Sheet link -source("codes/mentawai1853_00-gsheet.R") -# read the Google Sheet file -mentawai1853 <- read_sheet(mentawai1853_sheet) |> -mutate(ID = row_number()) # give entry ID -library(tidyverse) -library(googlesheets4) -library(deeplr) -library(googleLanguageR) -# get the Google Sheet link -source("codes/mentawai1853_00-gsheet.R") -# read the Google Sheet file -mentawai1853 <- read_sheet(mentawai1853_sheet) |> -mutate(ID = row_number()) # give entry ID -read_sheet(mentawai1853_sheet) |> -mutate(ID = row_number()) -library(tidyverse) -library(googlesheets4) -library(deeplr) -library(googleLanguageR) -# get the Google Sheet link -source("codes/mentawai1853_00-gsheet.R") -# read the Google Sheet file -mentawai1853 <- read_sheet(mentawai1853_sheet) |> -mutate(ID = row_number()) # give entry ID -# translate the Dutch gloss with deeplr <- CODE to translate the Dutch into English using DeepL API -# dutch <- mentawai1853 |> -# select(ID, Dutch) |> -# filter(!is.na(Dutch)) -# dutch <- dutch |> -# mutate(English_DeepL2 = deeplr::translate2(text = Dutch, target_lang = "EN", source_lang = "NL", auth_key = deeplauthkey)) -# write_tsv(dutch, "data/dutch-translated-to-english-with-deepl.tsv") <- This file is manually edited together with the edited Concepticon mapping file -# read the English translation of the Dutch gloss -dutch <- read_tsv("data/dutch-translated-to-english-with-deepl.tsv") |> -mutate(untranslated = Dutch == English_DeepL2) -# join the English translation of the Dutch gloss with the main word list table -mentawai1853 <- mentawai1853 |> -left_join(dutch) |> -mutate(English_DeepL2 = if_else(untranslated, English_DeepL, English_DeepL2)) |> -mutate(eng_diff = English_DeepL != English_DeepL2) -# this is a testing code to check which original translation is different from the DeepL -mentawai1853 |> -mutate(eng_diff = English_DeepL != English_DeepL2) |> -filter(eng_diff) |> -select(English_DeepL, English_DeepL2, Dutch) |> -as.data.frame() -# read the Concepticon mapping file -concepticon <- read_tsv("data/mentawai-gloss-mapped-to-edit_1853-270.tsv") |> -filter(GLOSS != "#") |> -select(-NUMBER) |> -distinct() |> -mutate(across(where(is.character), ~replace_na(., ""))) -# join the Concepticon mapping with the main word list table -mentawai1853 <- mentawai1853 |> -left_join(concepticon |> -rename(English_DeepL2 = GLOSS)) -# exclude the irrelevant columns -mentawai1853 <- mentawai1853 |> -select(-untranslated, -eng_diff, -English_DeepL) |> -select(ID, Mentawai, Dutch, English = English_DeepL2, CONCEPTICON_GLOSS, CONCEPTICON_ID, everything()) -# save the main word list data containing the English translation and Concepticon Mapping -## the saved files still combine the three categories of word list -mentawai1853 |> -write_tsv("data/mentawai1853.tsv") -mentawai1853 |> -write_rds("data/mentawai1853.rds") -# save the comparison table in p. 434 into .tsv -# read_tsv("data/vrosenberg1853p434.csv") |> -# write_tsv("data/vrosenberg1853p434.tsv") -# run orthography profile ===== -# write the profile and manually edit it. -# mentawai1853$Mentawai |> -# qlcData::write.profile(normalize = "NFC", -# editing = TRUE, -# info = TRUE, -# file.out = "data/ortho-profile-mentawai1853.tsv") -ortho <- qlcData::tokenize(mentawai1853$Mentawai, +ipa <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", -transliterate = "Replacement", +transliterate = "Phoneme", sep.replace = "_", regex = TRUE, ordering = NULL) -ortho$strings <- ortho$strings |> +ipa$strings |> rename(Mentawai = originals, -Commons = transliterated) |> +IPA = transliterated) |> as_tibble() |> mutate(ID = mentawai1853$ID) ipa <- qlcData::tokenize(mentawai1853$Mentawai, @@ -391,27 +254,20 @@ transliterate = "Phoneme", sep.replace = "_", regex = TRUE, ordering = NULL) -ipa$strings <- ipa$strings |> +ipa$strings |> rename(Mentawai = originals, IPA = transliterated) |> as_tibble() |> mutate(ID = mentawai1853$ID) -mentawai1853 <- mentawai1853 |> -left_join(ortho$strings |> -select(-tokenized, -Mentawai), -by = join_by(ID)) |> -left_join(ipa$strings |> -select(-tokenized, -Mentawai), -by = join_by(ID)) |> -select(ID, Mentawai, Commons, IPA, Dutch, English, everything()) -mentawai1853 -library(tidyverse) -library(googlesheets4) -library(deeplr) -library(googleLanguageR) -# get the Google Sheet link -source("codes/mentawai1853_00-gsheet.R") -# read the Google Sheet file +ipa$strings <- ipa$strings |> +rename(Mentawai = originals, +IPA = transliterated) |> +as_tibble() |> +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised IPA transcription +mutate(IPAnotSegmented = str_replace_all(IPA, " ", ""), +IPAnotSegmented = str_replace_all(IPAnotSegmented, "_", " ")) +ipa$strings mentawai1853 <- read_sheet(mentawai1853_sheet) |> mutate(ID = row_number()) # give entry ID # translate the Dutch gloss with deeplr <- CODE to translate the Dutch into English using DeepL API @@ -475,7 +331,10 @@ ortho$strings <- ortho$strings |> rename(Mentawai = originals, Commons = transliterated) |> as_tibble() |> -mutate(ID = mentawai1853$ID) +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised common transcription +mutate(CommonsNotSegmented = str_replace_all(Commons, " ", ""), +CommonsNotSegmented = str_replace_all(CommonsNotSegmented, "_", " ")) ipa <- qlcData::tokenize(mentawai1853$Mentawai, profile = "data/ortho-profile-mentawai1853.tsv", transliterate = "Phoneme", @@ -486,7 +345,10 @@ ipa$strings <- ipa$strings |> rename(Mentawai = originals, IPA = transliterated) |> as_tibble() |> -mutate(ID = mentawai1853$ID) +mutate(ID = mentawai1853$ID) |> +# add the non-tokenised IPA transcription +mutate(IPAnotSegmented = str_replace_all(IPA, " ", ""), +IPAnotSegmented = str_replace_all(IPAnotSegmented, "_", " ")) mentawai1853 <- mentawai1853 |> left_join(ortho$strings |> select(-tokenized, -Mentawai), @@ -494,14 +356,20 @@ by = join_by(ID)) |> left_join(ipa$strings |> select(-tokenized, -Mentawai), by = join_by(ID)) |> -select(ID, Mentawai, Commons, IPA, Dutch, English, everything()) |> +select(ID, Mentawai, Commons, CommonsNotSegmented, IPA, IPAnotSegmented, Dutch, English, everything()) |> # add the doculect for the CLDF purpose -mutate(Doculect = "mentawai") +mutate(Doculect = "Mentawai") +mentawai1853 +# split the comparison dialect word list and men's names and save the respective files +# mentawai1853 <- read_rds("data/mentawai1853.rds") mentawai1853 |> filter(Category == "word list") |> write_tsv("data/mentawai1853.tsv") mentawai1853 |> filter(Category == "word list") |> +write_tsv("raw/mentawai1853.tsv") +mentawai1853 |> +filter(Category == "word list") |> write_tsv("data/mentawai1853.rds") mentawai1853 |> filter(Category == "dialect-comparison") |> @@ -510,3 +378,135 @@ mentawai1853 |> filter(Category == "men's names") |> write_tsv("data/mentawai1853-men-names.tsv") mentawai1853 +mentawai1853 +mentawai1853 |> filter(CONCEPTICON_GLOSS == NA) +mentawai1853 |> filter(is.na(CONCEPTICON_GLOSS)) +mentawai1853 |> +filter(Category == "word list") |> filter(is.na(CONCEPTICON_GLOSS)) +## for CLDF raw directory +mentawai1853 |> +filter(Category == "word list") |> +mutate(CONCEPTICON_ID = replace(CONCEPTICON_ID, CONCEPTICON_ID == 0, "")) |> +write_tsv("raw/mentawai1853.tsv") +df <- read_csv("cldf/forms.csv") +df +df |> select(Value) +df |> select(Value) +df +df |> select(Value, Form, Segments) +df <- read_csv("cldf/forms.csv") +df |> select(Value, Form, Segments) +df <- read_csv("cldf/forms.csv") +df |> select(Value, Form) +df <- read_csv("cldf/forms.csv") +df +read_tsv("data/ortho-profile-mentawai1853.tsv") +read_tsv("data/ortho-profile-mentawai1853.tsv") |> select(Grapheme = Replacement, IPA = Phoneme) +read_tsv("data/ortho-profile-mentawai1853.tsv") |> select(Grapheme = Replacement, IPA = Phoneme) |> as.data.frame() +read_tsv("data/ortho-profile-mentawai1853.tsv", na = " ") |> select(Grapheme = Replacement, IPA = Phoneme) |> as.data.frame() +read_tsv("data/ortho-profile-mentawai1853.tsv", na = "") |> select(Grapheme = Replacement, IPA = Phoneme) |> as.data.frame() +read_tsv("data/ortho-profile-mentawai1853.tsv", na = c("NA", " ")) |> select(Grapheme = Replacement, IPA = Phoneme) |> as.data.frame() +read_tsv("data/ortho-profile-mentawai1853.tsv") |> +select(Grapheme = Replacement, IPA = Phoneme) |> +mutate(across(where(is.character), ~replace_na(., " "))) +read_tsv("data/ortho-profile-mentawai1853.tsv") |> +select(Grapheme = Replacement, IPA = Phoneme) |> +mutate(across(where(is.character), ~replace_na(., " "))) |> as.data.frame() +dir.exists("etc") +# for CLDF etc folder to include the orthography profile +## assumming we already have the IPA match of the grapheme as well! AND the `etc` folder exists +if (dir.exists("etc")) { +message("The `etc` directory exists.") +read_tsv("data/ortho-profile-mentawai1853.tsv") |> +select(Grapheme = Replacement, IPA = Phoneme) |> +mutate(across(where(is.character), ~replace_na(., " "))) |> +write_tsv("etc/orthography.tsv", na = "") +} else { +warning("No `etc` directory is detected! Create one.") +} +# for CLDF etc folder to include the orthography profile +## assumming we already have the IPA match of the grapheme as well! AND the `etc` folder exists +if (dir.exists("etc")) { +message("The `etc` directory exists.") +read_tsv("data/ortho-profile-mentawai1853.tsv") |> +select(Grapheme = Replacement, IPA = Phoneme) |> +mutate(across(where(is.character), ~replace_na(., " "))) |> +write_tsv("etc/orthography.tsv", na = "") +message("Save the orthography profile into `etc`.") +} else { +warning("No `etc` directory is detected! Create one.") +} +read_tsv("data/ortho-profile-mentawai1853.tsv") |> +select(Grapheme = Replacement, IPA = Phoneme) |> +mutate(across(where(is.character), ~replace_na(., " "))) |> +distinct() +# for CLDF etc folder to include the orthography profile +## assumming we already have the IPA match of the grapheme as well! AND the `etc` folder exists +if (dir.exists("etc")) { +message("The `etc` directory exists.") +read_tsv("data/ortho-profile-mentawai1853.tsv") |> +select(Grapheme = Replacement, IPA = Phoneme) |> +mutate(across(where(is.character), ~replace_na(., " "))) |> +distinct() |> +write_tsv("etc/orthography.tsv", na = "") +message("Save the orthography profile into `etc`.") +} else { +warning("No `etc` directory is detected! Create one.") +} +read_tsv("data/ortho-profile-mentawai1853.tsv") +read_tsv("data/ortho-profile-mentawai1853.tsv") |> +# select(Grapheme = Replacement, IPA = Phoneme) |> +# mutate(across(where(is.character), ~replace_na(., " "))) |> +# distinct() |> +mutate(Grapheme = if_else(!is.na(Right), +paste(Grapheme, Right, sep = ""), +Grapheme), +Grapheme = if_else(!is.na(Left), +paste(Left, Grapheme, sep = ""), +Grapheme)) +read_tsv("data/ortho-profile-mentawai1853.tsv") |> +# select(Grapheme = Replacement, IPA = Phoneme) |> +# mutate(across(where(is.character), ~replace_na(., " "))) |> +# distinct() |> +mutate(Grapheme = if_else(!is.na(Right), +paste(Grapheme, Right, sep = ""), +Grapheme), +Grapheme = if_else(!is.na(Left), +paste(Left, Grapheme, sep = ""), +Grapheme)) |> +select(Grapheme, Phoneme) |> +as.data.frame() +read_tsv("data/ortho-profile-mentawai1853.tsv") |> +# select(Grapheme = Replacement, IPA = Phoneme) |> +# mutate(across(where(is.character), ~replace_na(., " "))) |> +# distinct() |> +mutate(Grapheme = if_else(!is.na(Right), +paste(Grapheme, Right, sep = ""), +Grapheme), +Grapheme = if_else(!is.na(Left), +paste(Left, Grapheme, sep = ""), +Grapheme)) |> +select(Grapheme, Phoneme) |> +mutate(across(where(is.character), ~replace_na(., " "))) |> +write_tsv("etc/orthography.tsv", na = "") +# for CLDF etc folder to include the orthography profile +## assumming we already have the IPA match of the grapheme as well! AND the `etc` folder exists +if (dir.exists("etc")) { +message("The `etc` directory exists.") +read_tsv("data/ortho-profile-mentawai1853.tsv") |> +# select(Grapheme = Replacement, IPA = Phoneme) |> +# mutate(across(where(is.character), ~replace_na(., " "))) |> +# distinct() |> +mutate(Grapheme = if_else(!is.na(Right), +paste(Grapheme, Right, sep = ""), +Grapheme), +Grapheme = if_else(!is.na(Left), +paste(Left, Grapheme, sep = ""), +Grapheme)) |> +select(Grapheme, Phoneme) |> +mutate(across(where(is.character), ~replace_na(., " "))) |> +write_tsv("etc/orthography.tsv", na = "") +message("Save the orthography profile into `etc`.") +} else { +warning("No `etc` directory is detected! Create one.") +} diff --git a/.zenodo.json b/.zenodo.json index 0912330..993a81d 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -7,20 +7,18 @@ ], "creators": [ { - "name": "Carl Benjamin Hermann von Rosenberg" - } - ], - "contributors": [ - { - "name": "Gede Primahadi W. Rajeg", - "type": "Other" + "name": "Gede Primahadi W. Rajeg" } ], + "contributors": [], "communities": [ { "identifier": "lexibank" } ], "upload_type": "dataset", - "description": "

Cite the source of the dataset as:

\n\n
\n

Rosenberg, Carl Benjamin Hermann von. 1853. De Mentawei-Eilanden en Hunne Bewoners. Tijdschrift voor Indische Taal-, Land- en Volkenkunde 1. 403\u2013440.

\n
" + "description": "

Cite the source of the dataset as:

\n\n
\n

Rosenberg, Carl Benjamin Hermann von. 1853. De Mentawei-Eilanden en Hunne Bewoners. Tijdschrift voor Indische Taal-, Land- en Volkenkunde 1. 403\u2013440.

\n
", + "license": { + "id": "CC-BY-NC-SA-4.0" + } } \ No newline at end of file diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 2ace559..30b55ee 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -2,5 +2,4 @@ Name | GitHub user | Description | Role --- | --- | --- | --- -Carl Benjamin Hermann von Rosenberg | | | Author -Gede Primahadi W. Rajeg | @gederajeg | maintainer, CLDF conversion, Concepticon mapping, Orthography profiling | Other \ No newline at end of file +Gede Primahadi W. Rajeg | @gederajeg | digitisation, code, CLDF conversion, Concepticon mapping, Orthography profiling | Maintainer \ No newline at end of file diff --git a/LICENSE b/LICENSE index cbe5ad1..e30f336 100644 --- a/LICENSE +++ b/LICENSE @@ -33,7 +33,7 @@ exhaustive, and do not form part of our licenses. material not subject to the license. This includes other CC- licensed material, or material used under an exception or limitation to copyright. More considerations for licensors: - wiki.creativecommons.org/Considerations_for_licensors + wiki.creativecommons.org/Considerations_for_licensors Considerations for the public: By using one of our public licenses, a licensor grants the public permission to use the @@ -49,8 +49,8 @@ exhaustive, and do not form part of our licenses. such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. More considerations - for the public: - wiki.creativecommons.org/Considerations_for_licensees + for the public: + wiki.creativecommons.org/Considerations_for_licensees ======================================================================= @@ -435,3 +435,4 @@ the avoidance of doubt, this paragraph does not form part of the public licenses. Creative Commons may be contacted at creativecommons.org. + diff --git a/README.md b/README.md index 891082a..3d4bd89 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # CLDF dataset derived from von Rosenberg's "De Mentawei-Eilanden en Hunne Bewoners" from 1853 + +[![CLDF validation](https://github.com/complexico/mentawai-word-list-1853/workflows/CLDF-validation/badge.svg)](https://github.com/complexico/mentawai-word-list-1853/actions?query=workflow%3ACLDF-validation) + + ## How to cite If you use these data please cite @@ -10,10 +14,17 @@ If you use these data please cite ## Description -This dataset is licensed under a CC-BY-NC-SA 4.0 license +This dataset is licensed under a https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en license Available online at https://www.digitale-sammlungen.de/en/view/bsb10433845?page=450,451 +## Notes + +Based on the [Rights Statement](https://www.digitale-sammlungen.de/en/details/bsb10433845) (presented down below in that page), this digitised journal has a [No Copyright-Non-commercial use only](https://rightsstatements.org/page/NoC-NC/1.0/?language=en) condition. + +Before the CLDF conversion using Python, the materials in this repository (inside the [data](https://github.com/complexico/mentawai-word-list-1853/tree/main/data) directory) were processed using R as an RStudio project (the R scripts are in the [codes](https://github.com/complexico/mentawai-word-list-1853/tree/main/codes) directory). The English gloss of the Dutch was generated via the DeepL translator using the [`deeplr` R package](https://cran.r-project.org/package=deeplr). + + ## Statistics @@ -37,8 +48,7 @@ Available online at https://www.digitale-sammlungen.de/en/view/bsb10433845?page= Name | GitHub user | Description | Role --- | --- | --- | --- -Carl Benjamin Hermann von Rosenberg | | | Author -Gede Primahadi W. Rajeg | @gederajeg | maintainer, CLDF conversion, Concepticon mapping, Orthography profiling | Other +Gede Primahadi W. Rajeg | @gederajeg | digitisation, code, CLDF conversion, Concepticon mapping, Orthography profiling | Maintainer diff --git a/cldf/.transcription-report.json b/cldf/.transcription-report.json index c3a27e0..e57c7b1 100644 --- a/cldf/.transcription-report.json +++ b/cldf/.transcription-report.json @@ -2,9 +2,9 @@ "by_language": { "Mentawai": { "bipa_errors": [ + "t\u0361\u0292", "<<\u00eb>>", - "<<->>", - "t\u0361\u0292" + "<<->>" ], "general_errors": 18, "replacements": { @@ -100,9 +100,9 @@ ] }, "sclass_errors": [ + "t\u0361\u0292", "<<\u00eb>>", - "<<->>", - "t\u0361\u0292" + "<<->>" ], "segments": { "+": 1, @@ -251,9 +251,9 @@ ], "bad_words_count": 15, "bipa_errors": [ + "t\u0361\u0292", "<<\u00eb>>", - "<<->>", - "t\u0361\u0292" + "<<->>" ], "general_errors": 18, "invalid_words": [], @@ -352,9 +352,9 @@ ] }, "sclass_errors": [ + "t\u0361\u0292", "<<\u00eb>>", - "<<->>", - "t\u0361\u0292" + "<<->>" ], "segments": { "+": 1, diff --git a/cldf/README.md b/cldf/README.md index 9b1e86d..f7ff110 100644 --- a/cldf/README.md +++ b/cldf/README.md @@ -11,9 +11,9 @@ property | value [dc:bibliographicCitation](http://purl.org/dc/terms/bibliographicCitation) | Rosenberg, Carl Benjamin Hermann von. 1853. De Mentawei-Eilanden en Hunne Bewoners. Tijdschrift voor Indische Taal-, Land- en Volkenkunde 1. 403–440. [dc:conformsTo](http://purl.org/dc/terms/conformsTo) | [CLDF Wordlist](http://cldf.clld.org/v1.0/terms.rdf#Wordlist) [dc:identifier](http://purl.org/dc/terms/identifier) | https://www.digitale-sammlungen.de/en/view/bsb10433845?page=450,451 -[dc:license](http://purl.org/dc/terms/license) | CC-BY-NC-SA 4.0 +[dc:license](http://purl.org/dc/terms/license) | https://creativecommons.org/licenses/by-nc-sa/4.0/ [dcat:accessURL](http://www.w3.org/ns/dcat#accessURL) | git@github.com:complexico/mentawai-word-list-1853 -[prov:wasDerivedFrom](http://www.w3.org/ns/prov#wasDerivedFrom) |
  1. git@github.com:complexico/mentawai-word-list-1853 774ff3f
  2. Glottolog glottolog-glottolog-d9da5e2
  3. Concepticon v3.1.0-19-g7c0b6ae3
  4. CLTS cldf-clts-clts-6dc73af
+[prov:wasDerivedFrom](http://www.w3.org/ns/prov#wasDerivedFrom) |
  1. git@github.com:complexico/mentawai-word-list-1853 1e036fa
  2. Glottolog glottolog-glottolog-d9da5e2
  3. Concepticon v3.1.0-19-g7c0b6ae3
  4. CLTS cldf-clts-clts-6dc73af
[prov:wasGeneratedBy](http://www.w3.org/ns/prov#wasGeneratedBy) |
  1. lingpy-rcParams: lingpy-rcParams.json
  2. python: 3.9.6
  3. python-packages: requirements.txt
[rdf:ID](http://www.w3.org/1999/02/22-rdf-syntax-ns#ID) | barrier-islands-mentawai-wlist1853 [rdf:type](http://www.w3.org/1999/02/22-rdf-syntax-ns#type) | http://www.w3.org/ns/dcat#Distribution diff --git a/cldf/cldf-metadata.json b/cldf/cldf-metadata.json index 7152ee1..20bbe46 100644 --- a/cldf/cldf-metadata.json +++ b/cldf/cldf-metadata.json @@ -5,7 +5,7 @@ "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#Wordlist", "dc:identifier": "https://www.digitale-sammlungen.de/en/view/bsb10433845?page=450,451", "dc:isVersionOf": null, - "dc:license": "CC-BY-NC-SA 4.0", + "dc:license": "https://creativecommons.org/licenses/by-nc-sa/4.0/", "dc:related": null, "dc:source": "sources.bib", "dc:title": "CLDF dataset derived from von Rosenberg's \"De Mentawei-Eilanden en Hunne Bewoners\" from 1853", @@ -14,7 +14,7 @@ { "rdf:about": "git@github.com:complexico/mentawai-word-list-1853", "rdf:type": "prov:Entity", - "dc:created": "774ff3f", + "dc:created": "1e036fa", "dc:title": "Repository" }, { diff --git a/cldf/lingpy-rcParams.json b/cldf/lingpy-rcParams.json index 45b34df..5c1e7ff 100644 --- a/cldf/lingpy-rcParams.json +++ b/cldf/lingpy-rcParams.json @@ -123,7 +123,7 @@ "scorer": {}, "sonar": true, "stress": "\u02c8\u02cc'", - "timestamp": "2024-07-06 11:25", + "timestamp": "2024-07-06 14:58", "tones": "\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079\u2070\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089\u20800123456789\u02e5\u02e6\u02e7\u02e8\u02e9\u02ea\u02eb-\ua708-\ua709-\ua70a-\ua70b-\ua70c-\ua70d-\ua70e-\ua70f-\ua710-\ua711-\ua712-\ua713-\ua714-\ua715-\ua716-\ua717-\ua718-\ua719-\ua71a-\ua700-\ua701-\ua702-\ua703-\ua704-\ua705-\ua706-\ua707", "tree_calc": "neighbor", "unique_sequences": true, diff --git a/metadata.json b/metadata.json index 0626063..38a1c0e 100644 --- a/metadata.json +++ b/metadata.json @@ -2,7 +2,7 @@ "id": "barrier-islands-mentawai-wlist1853", "title": "CLDF dataset derived from von Rosenberg's \"De Mentawei-Eilanden en Hunne Bewoners\" from 1853", "description": null, - "license": "CC-BY-NC-SA 4.0", + "license": "https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en", "url": "https://www.digitale-sammlungen.de/en/view/bsb10433845?page=450,451", "citation": "Rosenberg, Carl Benjamin Hermann von. 1853. De Mentawei-Eilanden en Hunne Bewoners. Tijdschrift voor Indische Taal-, Land- en Volkenkunde 1. 403\u2013440." } \ No newline at end of file diff --git a/tutorial-step-notes-to-create-the-cldf.sh b/tutorial-step-notes-to-create-the-cldf.sh index 0b816a7..b296c0d 100644 --- a/tutorial-step-notes-to-create-the-cldf.sh +++ b/tutorial-step-notes-to-create-the-cldf.sh @@ -59,7 +59,7 @@ tree -v --charset utf-8 cldfbench lexibank.makecldf cldfbench_barrier-islands-mentawai-wlist1853.py --glottolog "/Users/Primahadi/Documents/cldf_project/glottolog-glottolog-d9da5e2" --concepticon "/Users/Primahadi/Documents/cldf_project/concepticon/concepticon-data" --clts "/Users/Primahadi/Documents/cldf_project/cldf-clts-clts-6dc73af" # to create an orthography profile (with a guess to possible IPA form/phoneme) from the Form col. in cldf/forms.csv using pylexibank (cf. List (2021: section 6)): https://calc.hypotheses.org/2954 -cldfbench lexibank.init_profile cldfbench_barrier-islands-mentawai-wlist1853.py --clts "/Users/Primahadi/Documents/cldf_project/cldf-clts-clts-6dc73af" +# cldfbench lexibank.init_profile cldfbench_barrier-islands-mentawai-wlist1853.py --clts "/Users/Primahadi/Documents/cldf_project/cldf-clts-clts-6dc73af" ## note on orthography workflow - # we could add an orthography profile file (orthography.tsv) in `etc` directory that we previously created using qlcData and manually edited (## ensure we already have the IPA match of the grapheme as well!)