Marius Bottin 2023-04-19
- 1 Probar la taxonomía en los datos DarwinCore
- 2 Buscar los errores de taxonomía
- 4 Analyse of the taxonomic database
- 5 Dar codigos taxonomicos a cada fila de la tabla taxonomica total
- 6 Manejo de las morfo especies
- 7 CREATING sql taxonomic function
# Global knitr chunk options: tidy the echoed code to 70 columns and set the
# default `connection` chunk option to the name of the connection object
# created below (presumably for SQL chunks later in the document).
knitr::opts_chunk$set(
  tidy.opts = list(width.cutoff = 70),
  tidy = TRUE,
  connection = "fracking_db"
)
# library() stops immediately if a package is missing; require() would only
# warn and return FALSE, deferring the failure to the first use of the package.
library(openxlsx)
library(RPostgreSQL)
# Connection to the PostgreSQL database named "fracking".
fracking_db <- dbConnect(PostgreSQL(), dbname = "fracking")
# Load the saved workspace that provides `dataGrupos`, used below as a
# two-level list: biological group -> sheet name -> table.
load("dataGrupos.RData")
# Sheet names available in each group.
names_gp_sheets <- lapply(dataGrupos, names)
# One row per (group, sheet) pair. lengths()/unlist() keep a stable type even
# when a group has no sheet.
DF_gp_sheets <- data.frame(
  gp_biol = rep(names(names_gp_sheets), lengths(names_gp_sheets)),
  sheet = unlist(names_gp_sheets, use.names = FALSE)
)
# Flag the sheets whose name contains "registro".
DF_gp_sheets$registro <- grepl("registro", DF_gp_sheets$sheet)
# Column names of every "registro" sheet. lapply() always returns a list,
# whereas apply() would collapse the result to a matrix if all sheets happened
# to have the same number of columns. Elements are named after the row names of
# the selected rows of DF_gp_sheets.
registro_sheets <- DF_gp_sheets[DF_gp_sheets$registro, ]
column_registros <- lapply(seq_len(nrow(registro_sheets)), function(k) {
  gp <- as.character(registro_sheets$gp_biol[k])
  sheet <- as.character(registro_sheets$sheet[k])
  colnames(dataGrupos[[gp]][[sheet]])
})
names(column_registros) <- rownames(registro_sheets)
# sort(table(Reduce(c,column_registros)),decreasing = T)
# Taxonomy-related column names that are looked up in every "registro" sheet.
taxonomic_col <- c(
  "kingdom",
  "phylum",
  "class",
  "order",
  "family",
  "genus",
  "scientificName",
  "specificEpithet",
  "taxonRank",
  "vernacularName",
  "scientificNameAuthorship",
  "identificationQualifier",
  "identificationRemarks",
  "higherClassification"
)
# lapply(column_registros,function(x,c)c[!c%in%x],c=taxonomic_col)
# The phytoplankton sheet spells the family column with a capital "F";
# rename it so that it matches the lower-case name used everywhere else.
colnames(dataGrupos$Hidrobiologico$registros_fitoplancton) <- replace(
  colnames(dataGrupos$Hidrobiologico$registros_fitoplancton),
  colnames(dataGrupos$Hidrobiologico$registros_fitoplancton) == "Family",
  "family"
)
Extraer una tabla total de taxonomía
# Stack the taxonomic columns of every "registro" sheet into one table, with
# two leading columns (`gp`, `sheet`) recording where each row comes from.
# The per-sheet tables are collected in a list and bound in a single rbind()
# call instead of growing the result pairwise with Reduce().
registro_sheets <- DF_gp_sheets[DF_gp_sheets$registro, ]
taxonomicTotal <- do.call(rbind, lapply(seq_len(nrow(registro_sheets)), function(k) {
  gp <- as.character(registro_sheets$gp_biol[k])
  sheet <- as.character(registro_sheets$sheet[k])
  tabReg <- dataGrupos[[gp]][[sheet]]
  # Taxonomic columns absent from this sheet are added as all-NA columns so
  # that every sheet contributes the same set of columns.
  MissingVar <- setdiff(taxonomic_col, colnames(tabReg))
  if (length(MissingVar) > 0) {
    MissingTab <- as.data.frame(matrix(NA, nrow = nrow(tabReg), ncol = length(MissingVar)))
    colnames(MissingTab) <- MissingVar
    tabReg <- data.frame(tabReg, MissingTab, row.names = NULL)
  }
  # rep() keeps the row counts consistent even for a sheet without any row.
  data.frame(
    gp = rep(gp, nrow(tabReg)),
    sheet = rep(sheet, nrow(tabReg)),
    tabReg[, taxonomic_col],
    row.names = NULL
  )
}))
Reemplazar vacíos por NA
# Turn every empty-string cell into NA; cells that are already NA are excluded
# from the mask so that it contains only TRUE/FALSE.
taxonomicTotal[!is.na(taxonomicTotal) & taxonomicTotal == ""] <- NA
Suprimir los “trailing whitespace”
# Strip leading and trailing whitespace from every column (each column is
# returned by trimws() as character).
taxonomicTotal[] <- lapply(taxonomicTotal, trimws)
# Cells that contained only whitespace are empty strings after trimming;
# treat them as missing, like the cells that were empty from the start.
taxonomicTotal[!is.na(taxonomicTotal) & taxonomicTotal == ""] <- NA
Los nombres taxonómicos deben empezar por una mayúscula (al menos en la primera palabra)
# Columns whose value must start with an upper-case letter.
mayu1_col <- c(
  "kingdom", "phylum", "class", "order", "family", "genus", "scientificName"
)
# Upper-case the first character when it is an ASCII lower-case letter; the
# rest of the string is left untouched.
taxonomicTotal[mayu1_col] <- lapply(taxonomicTotal[mayu1_col], function(valores) {
  gsub("^([a-z])", "\\U\\1", valores, perl = TRUE)
})
# Size (rows, columns) of the full table, one row per record.
dim(taxonomicTotal)
## [1] 129654 16
# Keep only the first occurrence of each distinct row.
taxonomicTotal_un <- taxonomicTotal[!duplicated(taxonomicTotal), , drop = FALSE]
# Size of the de-duplicated table.
dim(taxonomicTotal_un)
## [1] 5490 16
Lo que hacemos acá es, para cada taxón de cada nivel taxonómico, averiguar que el nivel superior sea siempre el mismo:
# Taxonomic levels, from the highest to the lowest.
toTest <- c("kingdom", "phylum", "class", "order", "family", "genus", "scientificName")
# For each level (except the first), the taxa that appear with more than one
# distinct higher-level taxon: a list, one element per level, each holding the
# distinct superiors found for every offending taxon.
error_diffSup <- list()
for (i in seq_along(toTest)[-1]) {
  # Superior of each row: the level directly above and, from the third level
  # on, the level two steps above when the direct one is NA.
  sup_values <- taxonomicTotal_un[, toTest[i - 1]]
  if (i > 2) {
    sup_values <- dplyr::coalesce(sup_values, taxonomicTotal_un[, toTest[i - 2]])
  }
  # Distinct superiors per taxon of the current level (computed once).
  sup <- tapply(sup_values, taxonomicTotal_un[toTest[i]], unique, simplify = FALSE)
  nb_sup <- vapply(sup, length, integer(1))
  error_diffSup[[toTest[i]]] <- sup[nb_sup > 1]
}
Mostrar los resultados:
# Print one message per taxon that was found with several higher-level taxa.
# seq_along() makes both loops no-ops on empty input (1:length(x) would
# iterate over 1 and 0 and fail when no inconsistency was detected).
for (i in seq_along(error_diffSup)) {
  nivel <- names(error_diffSup)[i]
  taxones_nivel <- error_diffSup[[i]]
  for (j in seq_along(taxones_nivel)) {
    taxon_inf <- names(taxones_nivel)[j]
    # %in% never yields NA, so rows where the level is missing are not
    # selected (== would return them as all-NA rows).
    tabConcerned <- taxonomicTotal[taxonomicTotal[[nivel]] %in% taxon_inf, ]
    cat("El taxon \"", taxon_inf, "\" (nivel:", nivel, ", presente en los grupos: ",
      paste(unique(na.omit(tabConcerned$gp)), collapse = ", "), ")\n se encuentra con los taxones superiores siguientes:",
      paste(taxones_nivel[[j]], collapse = ", "), "\n\n")
  }
}
El taxon ” Sphyrotheca ” (nivel: scientificName , presente en los grupos: Collembolos ) se encuentra con los taxones superiores siguientes: Sphyrotheca , Sphyrotheca