diff --git a/R/clean_names.R b/R/clean_names.R index 3f112c07..6c286dd6 100644 --- a/R/clean_names.R +++ b/R/clean_names.R @@ -4,6 +4,7 @@ + #' Find taxonomic alignments for a list of names to a version of the Australian Plant Census (APC) through standardizing formatting and checking for spelling issues #' #' This function uses Australian Plant Census (APC) & the Australian Plant Name Index (APNI) to find taxonomic alignments for a list of names. @@ -76,7 +77,7 @@ align_taxa <- function(original_name, dplyr::bind_rows( taxa_raw, tibble::tibble( - original_name = subset(original_name,!original_name %in% taxa_raw$original_name) %>% unique(), + original_name = subset(original_name, !original_name %in% taxa_raw$original_name) %>% unique(), cleaned_name = NA_character_, stripped_name = NA_character_, stripped_name2 = NA_character_, @@ -486,10 +487,11 @@ standardise_names <- function(taxon_names) { #' #' @param taxa A list of Australian plant species that needs to be reconciled with current taxonomy. #' @param stable_or_current_data either "stable" for a consistent version, or "current" for the leading edge version. -#' @param version The version number of the dataset to use. -#' @param one_to_many How to handle one_to_many taxonomic matches. Default is "return_all". The other options are "collapse_to_higher_taxon" and "most likely species". Most likely species defaults to the original_name if that name is accepted by the APC. +#' @param version The version number of the dataset to use. +#' @param one_to_many How to handle one_to_many taxonomic matches. Default is "return_all". The other options are "collapse_to_higher_taxon" and "most_likely_species". most_likely_species defaults to the original_name if that name is accepted by the APC; this will be right for certain species subsets, but make errors in other cases, use with caution. #' @param full logical for whether the full lookup table is returned or just the two key columns -#' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer. If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} seperately and pass the data in. +#' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer. If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in. +#' @param output file path to save the intermediate output to #' @return A lookup table containing the original species names, the aligned species names, and additional taxonomic information such as taxon IDs and genera. #' @export #' @@ -502,136 +504,135 @@ standardise_names <- function(taxon_names) { #' "Not a species"), #' resources=resources) #' -create_taxonomic_update_lookup <- - function(taxa, - stable_or_current_data = "stable", - version = default_version(), - one_to_many = "return_all", - full = FALSE, - resources = load_taxonomic_resources(stable_or_current_data = - stable_or_current_data, version = version)) { - valid_inputs <- - c("return_all", - "collapse_to_higher_taxon", - "most_likely_species") - if (!one_to_many %in% valid_inputs) - stop( - paste( - "Invalid input:", - input_char, - ". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'." - ) - ) - - aligned_data <- - unique(taxa) %>% - align_taxa(resources = resources) - - aligned_species_list_tmp <- - aligned_data$aligned_name %>% update_taxonomy(resources = resources) - - #should really be a function, but i'm not smart enough to see how to handle the outputs being different and the early return - if (one_to_many %in% c("return_all", "collapse_to_higher_taxon")) { - aligned_species_list <- - aligned_data %>% dplyr::select(original_name, aligned_name, aligned_reason) %>% - dplyr::left_join(aligned_species_list_tmp, - by = c("aligned_name"), - multiple = "all") %>% - dplyr::filter(!is.na(taxonIDClean)) %>% - dplyr::mutate(genus = stringr::word(canonicalName, 1, 1)) %>% - dplyr::rename(canonical_name = canonicalName) - } - - if (one_to_many == c("most_likely_species")) { - aligned_species_list <- - aligned_data %>% dplyr::select(original_name, aligned_name, aligned_reason) %>% - dplyr::left_join(aligned_species_list_tmp, - by = c("aligned_name"), - multiple = "first") %>% - dplyr::filter(!is.na(taxonIDClean)) %>% - dplyr::mutate(genus = stringr::word(canonicalName, 1, 1)) %>% - dplyr::rename(canonical_name = canonicalName) - } - - if (one_to_many == "collapse_to_higher_taxon") { - aligned_species_list %>% - group_by(original_name, aligned_name) %>% - summarise( - apc_names = find_mrct(canonical_name, resources = resources), - aligned_reason = paste(unique(aligned_reason), collapse = " and "), - taxonomicStatus = paste(unique(taxonomicStatusClean), collapse = " and "), - source = paste(unique(source), collapse = " and "), - number_of_collapsed_taxa = n() - ) -> test - return(test) - } - - if (full == TRUE) { - return(aligned_species_list) - } - if (full == FALSE) { - return( - dplyr::select( - aligned_species_list, - original_name, - aligned_name, - apc_name = canonical_name, - aligned_reason, - taxonomic_status_of_aligned_name = taxonomicStatusClean - ) %>% - distinct() #may not be necessary - ) - } +create_taxonomic_update_lookup <- function(taxa, + stable_or_current_data = "stable", + version = default_version(), + one_to_many = "return_all", + full = FALSE, + resources = load_taxonomic_resources(stable_or_current_data = + stable_or_current_data, + version = version), + output = NULL) { + validate_one_to_many_input(one_to_many) + aligned_data <- get_aligned_data(taxa, resources) + updated_species_list <- + get_updated_species_list(aligned_data, resources, one_to_many, output) + + if (one_to_many == "collapse_to_higher_taxon") { + return(collapse_to_higher_taxon(updated_species_list, resources)) } + + if (full == TRUE) { + return(updated_species_list) + } else { + return( + dplyr::select( + updated_species_list, + original_name, + aligned_name, + apc_name = canonical_name, + aligned_reason, + taxonomic_status_of_aligned_name = taxonomicStatusClean + ) %>% + distinct() + ) + } +} - - -test_input <- function(input_char) { +#' @noRd +validate_one_to_many_input <- function(one_to_many) { valid_inputs <- c("return_all", "collapse_to_higher_taxon", "most_likely_species") - if (!input_char %in% valid_inputs) { + if (!one_to_many %in% valid_inputs) stop( paste( "Invalid input:", - input_char, + one_to_many, ". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'." ) ) - } else { - return(TRUE) - } } +#' @noRd +get_aligned_data <- function(taxa, resources) { + unique(taxa) %>% align_taxa(resources = resources) +} + + +#' Wrapper for update_taxonomy that either summarizes to one species or returns all matches +#' @noRd +get_updated_species_list <- + function(aligned_data, resources, one_to_many, output) { + aligned_species_list_tmp <- + aligned_data$aligned_name %>% update_taxonomy(resources = resources, output = output) + + if (one_to_many %in% c("return_all", "collapse_to_higher_taxon")) { + multiple = "all" + } else { + multiple = "first" + } + + aligned_data %>% + dplyr::select(original_name, aligned_name, aligned_reason) %>% + dplyr::left_join(aligned_species_list_tmp, + by = c("aligned_name"), + multiple = multiple) %>% + dplyr::filter(!is.na(taxonIDClean)) %>% + dplyr::mutate(genus = stringr::word(canonicalName, 1, 1)) %>% + dplyr::rename(canonical_name = canonicalName) + } + +#' Currently only collapses to genus or all the way to plants +#' @noRd +collapse_to_higher_taxon <- + function(aligned_species_list, resources) { + aligned_species_list %>% + group_by(original_name, aligned_name) %>% + summarise( + apc_names = find_mrct(canonical_name, resources = resources), + aligned_reason = paste(unique(aligned_reason), collapse = " and "), + taxonomicStatus = paste(unique(taxonomicStatusClean), collapse = " and "), + source = paste(unique(source), collapse = " and "), + number_of_collapsed_taxa = n() + ) + } -#not working yet +#' @noRd find_mrct <- function(taxa, stable_or_current_data = "stable", version = default_version(), - resources = load_taxonomic_resources(stable_or_current_data = - stable_or_current_data, version = version)) { - only_taxa_of_interest <- + resources = load_taxonomic_resources(stable_or_current_data = + stable_or_current_data, + version = version)) { + # Filter the resources data to only include the taxa of interest + relevant_taxa <- dplyr::filter(resources$APC, resources$APC$canonicalName %in% taxa) - if (length(unique(only_taxa_of_interest$canonicalName)) == 1) - #all the same - return(only_taxa_of_interest$canonicalName[1]) - if (length(unique(stringr::word( - only_taxa_of_interest$canonicalName, 1, 2 - ))) == 1) - #all species the same; different supspecific taxa - return(stringr::word(only_taxa_of_interest$canonicalName[1], 1, 2)) - if (length(unique(stringr::word( - only_taxa_of_interest$canonicalName, 1, 1 - ))) == 1) - #all genera the same but different species - return(paste0( - stringr::word(only_taxa_of_interest$canonicalName[1], 1, 1), - " sp." - )) - if (length(unique(only_taxa_of_interest$family)) == 1) - #all family the same but different genera - return(only_taxa_of_interest$family[1]) - return("plants") + + # Check different scenarios to find the most recent common taxon + unique_canonical_names <- unique(relevant_taxa$canonicalName) + unique_genus_species <- + unique(stringr::word(unique_canonical_names, 1, 2)) + unique_genus <- + unique(stringr::word(unique_canonical_names, 1, 1)) + unique_family <- unique(relevant_taxa$family) + + if (length(unique_canonical_names) == 1) { + # All taxa are the same + return(unique_canonical_names[1]) + } else if (length(unique_genus_species) == 1) { + # All species are the same, but different subspecific taxa + return(stringr::word(unique_canonical_names[1], 1, 2)) + } else if (length(unique_genus) == 1) { + # All genera are the same, but different species + return(paste0(unique_genus, " sp.")) + } else if (length(unique_family) == 1) { + # All families are the same, but different genera + return(unique_family[1]) + } else { + # Return "plants" for other cases + return("plants") + } } diff --git a/R/state_diversity.R b/R/state_diversity.R index a0c9c02e..7ccb153a 100644 --- a/R/state_diversity.R +++ b/R/state_diversity.R @@ -23,87 +23,69 @@ #' @examples #' create_species_state_origin_matrix() #' -create_species_state_origin_matrix <- - function(resources = load_taxonomic_resources()) { - apc_species <- - dplyr::filter(resources$APC, - taxonRank == "Species" & - taxonomicStatus == "accepted") - #seperate the states - sep_state_data <- - stringr::str_split(unique(apc_species$taxonDistribution), ",") - - #get unique places - all_codes <- unique(stringr::str_trim(unlist(sep_state_data))) - apc_places <- - unique(stringr::word(all_codes[!is.na(all_codes)], 1, 1)) - - #make a table to fill in - data.frame(col.names = apc_places) - species_df <- - dplyr::tibble(species = apc_species$scientificName) - for (i in 1:length(apc_places)) { - species_df <- - dplyr::bind_cols(species_df, NA, .name_repair = "minimal") - } - names(species_df) <- c("species", apc_places) - - #look for all possible entries after each state - state_parse_and_add_column <- - function(species_df, state, apc_species) { - # print(all_codes[grepl(state,all_codes)]) # checking for weird ones - species_df[, state] <- dplyr::case_when( - grepl( - paste0("\\b", state, " \\(uncertain origin\\)"), - apc_species$taxonDistribution - ) ~ "uncertain origin", - grepl( - paste0("\\b", state, " \\(naturalised\\)"), - apc_species$taxonDistribution - ) ~ "naturalised", - grepl( - paste0("\\b", state, " \\(doubtfully naturalised\\)"), - apc_species$taxonDistribution - ) ~ "doubtfully naturalised", - grepl( - paste0("\\b", state, " \\(native and naturalised\\)"), - apc_species$taxonDistribution - ) ~ "native and naturalised", - grepl( - paste0("\\b", state, " \\(formerly naturalised\\)"), - apc_species$taxonDistribution - ) ~ "formerly naturalised", - grepl( - paste0("\\b", state, " \\(presumed extinct\\)"), - apc_species$taxonDistribution - ) ~ "presumed extinct", - grepl( - paste0("\\b", state, " \\(native and doubtfully naturalised\\)"), - apc_species$taxonDistribution - ) ~ "native and doubtfully naturalised", - grepl( - paste0("\\b", state, " \\(native and uncertain origin\\)"), - apc_species$taxonDistribution - ) ~ "native and uncertain origin", - grepl(paste0("\\b", state), apc_species$taxonDistribution) ~ "native", - #no entry = native, it's important this is last in the list - TRUE ~ "not present" - ) - return(species_df) - } - - #bug checking - #species_df<-state_parse_and_add_column(species_df,"LHI",apc_species) - #species_df<-state_parse_and_add_column(species_df,"HI",apc_species) - - #go through the states one by one - for (i in 1:length(apc_places)) { - species_df <- - state_parse_and_add_column(species_df, apc_places[i], apc_species) - } - return(species_df) +#' +#' +create_species_state_origin_matrix <- function(resources = load_taxonomic_resources()) { + apc_species <- filter_data_to_accepted_species(resources) + sep_state_data <- separate_states(apc_species) + apc_places <- identify_places(sep_state_data) + species_df <- create_species_df(apc_places, apc_species) + result_df <- parse_states(species_df, apc_places, apc_species) + return(result_df) +} + +#' @noRd +filter_data_to_accepted_species <- function(resources) { + dplyr::filter(resources$APC, + taxonRank == "Species" & + taxonomicStatus == "accepted") +} + +#' @noRd +separate_states <- function(data) { + stringr::str_split(unique(data$taxonDistribution), ",") +} + +#' @noRd +identify_places <- function(sep_state_data) { + all_codes <- unique(stringr::str_trim(unlist(sep_state_data))) + unique(stringr::word(all_codes[!is.na(all_codes)], 1, 1)) +} + +#' @noRd +create_species_df <- function(apc_places, apc_species) { + species_df <- dplyr::tibble(species = apc_species$scientificName) + for (i in 1:length(apc_places)) { + species_df <- dplyr::bind_cols(species_df, NA, .name_repair = "minimal") } + names(species_df) <- c("species", apc_places) + return(species_df) +} +#' @noRd +state_parse_and_add_column <- function(species_df, state, apc_species) { + species_df[, state] <- dplyr::case_when( + grepl(paste0("\\b", state, " \\(uncertain origin\\)"), apc_species$taxonDistribution) ~ "uncertain origin", + grepl(paste0("\\b", state, " \\(naturalised\\)"), apc_species$taxonDistribution) ~ "naturalised", + grepl(paste0("\\b", state, " \\(doubtfully naturalised\\)"), apc_species$taxonDistribution) ~ "doubtfully naturalised", + grepl(paste0("\\b", state, " \\(native and naturalised\\)"), apc_species$taxonDistribution) ~ "native and naturalised", + grepl(paste0("\\b", state, " \\(formerly naturalised\\)"), apc_species$taxonDistribution) ~ "formerly naturalised", + grepl(paste0("\\b", state, " \\(presumed extinct\\)"), apc_species$taxonDistribution) ~ "presumed extinct", + grepl(paste0("\\b", state, " \\(native and doubtfully naturalised\\)"), apc_species$taxonDistribution) ~ "native and doubtfully naturalised", + grepl(paste0("\\b", state, " \\(native and uncertain origin\\)"), apc_species$taxonDistribution) ~ "native and uncertain origin", + grepl(paste0("\\b", state), apc_species$taxonDistribution) ~ "native", + TRUE ~ "not present" + ) + return(species_df) +} + +#' @noRd +parse_states <- function(species_df, apc_places, apc_species) { + for (i in 1:length(apc_places)) { + species_df <- state_parse_and_add_column(species_df, apc_places[i], apc_species) + } + return(species_df) +} diff --git a/man/create_species_state_origin_matrix.Rd b/man/create_species_state_origin_matrix.Rd index 60750158..f0895853 100644 --- a/man/create_species_state_origin_matrix.Rd +++ b/man/create_species_state_origin_matrix.Rd @@ -18,6 +18,8 @@ This function processes the geographic data available in the current or any vers \examples{ create_species_state_origin_matrix() + + } \seealso{ \code{\link{load_taxonomic_resources}} diff --git a/man/create_taxonomic_update_lookup.Rd b/man/create_taxonomic_update_lookup.Rd index b0bed753..c2dd5906 100644 --- a/man/create_taxonomic_update_lookup.Rd +++ b/man/create_taxonomic_update_lookup.Rd @@ -11,7 +11,8 @@ create_taxonomic_update_lookup( one_to_many = "return_all", full = FALSE, resources = load_taxonomic_resources(stable_or_current_data = stable_or_current_data, - version = version) + version = version), + output = NULL ) } \arguments{ @@ -21,11 +22,13 @@ create_taxonomic_update_lookup( \item{version}{The version number of the dataset to use.} -\item{one_to_many}{How to handle one_to_many taxonomic matches. Default is "return_all". The other options are "collapse_to_higher_taxon" and "most likely species". Most likely species defaults to the original_name if that name is accepted by the APC.} +\item{one_to_many}{How to handle one_to_many taxonomic matches. Default is "return_all". The other options are "collapse_to_higher_taxon" and "most_likely_species". most_likely_species defaults to the original_name if that name is accepted by the APC; this will be right for certain species subsets, but make errors in other cases, use with caution.} \item{full}{logical for whether the full lookup table is returned or just the two key columns} -\item{resources}{These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer. If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} seperately and pass the data in.} +\item{resources}{These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer. If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in.} + +\item{output}{file path to save the intermediate output to} } \value{ A lookup table containing the original species names, the aligned species names, and additional taxonomic information such as taxon IDs and genera.