Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chat gpt for elegance on Will's functions and carrying the output file input through #63

Merged
merged 5 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 115 additions & 109 deletions R/clean_names.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@




#' Find taxonomic alignments for a list of names to a version of the Australian Plant Census (APC) through standardizing formatting and checking for spelling issues
#'
#' This function uses Australian Plant Census (APC) & the Australian Plant Name Index (APNI) to find taxonomic alignments for a list of names.
Expand Down Expand Up @@ -76,7 +77,7 @@ align_taxa <- function(original_name,
dplyr::bind_rows(
taxa_raw,
tibble::tibble(
original_name = subset(original_name,!original_name %in% taxa_raw$original_name) %>% unique(),
original_name = subset(original_name, !original_name %in% taxa_raw$original_name) %>% unique(),
cleaned_name = NA_character_,
stripped_name = NA_character_,
stripped_name2 = NA_character_,
Expand Down Expand Up @@ -486,10 +487,11 @@ standardise_names <- function(taxon_names) {
#'
#' @param taxa A list of Australian plant species that needs to be reconciled with current taxonomy.
#' @param stable_or_current_data either "stable" for a consistent version, or "current" for the leading edge version.
#' @param version The version number of the dataset to use.
#' @param one_to_many How to handle one_to_many taxonomic matches. Default is "return_all". The other options are "collapse_to_higher_taxon" and "most likely species". Most likely species defaults to the original_name if that name is accepted by the APC.
#' @param version The version number of the dataset to use.
#' @param one_to_many How to handle one_to_many taxonomic matches. Default is "return_all". The other options are "collapse_to_higher_taxon" and "most_likely_species". most_likely_species defaults to the original_name if that name is accepted by the APC; this will be right for certain species subsets but will make errors in other cases, so use with caution.
#' @param full logical for whether the full lookup table is returned or just the two key columns
#' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer. If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} seperately and pass the data in.
#' @param resources The taxonomic resources used for cleaning. By default they are loaded from a local place on your computer. If this function is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in.
#' @param output file path to save the intermediate output to
#' @return A lookup table containing the original species names, the aligned species names, and additional taxonomic information such as taxon IDs and genera.
#' @export
#'
Expand All @@ -502,136 +504,140 @@ standardise_names <- function(taxon_names) {
#' "Not a species"),
#' resources=resources)
#'
create_taxonomic_update_lookup <-
function(taxa,
stable_or_current_data = "stable",
version = default_version(),
one_to_many = "return_all",
full = FALSE,
resources = load_taxonomic_resources(stable_or_current_data =
stable_or_current_data, version = version)) {
valid_inputs <-
c("return_all",
"collapse_to_higher_taxon",
"most_likely_species")
if (!one_to_many %in% valid_inputs)
stop(
paste(
"Invalid input:",
input_char,
". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'."
)
create_taxonomic_update_lookup <- function(taxa,
                                           stable_or_current_data = "stable",
                                           version = default_version(),
                                           one_to_many = "return_all",
                                           full = FALSE,
                                           resources = load_taxonomic_resources(
                                             stable_or_current_data = stable_or_current_data,
                                             version = version
                                           ),
                                           output = NULL) {
  # Reject an unsupported `one_to_many` option before doing any work.
  validate_one_to_many_input(one_to_many)

  # Step 1: align the supplied names.
  # Step 2: look up the taxonomy of the aligned names; `one_to_many` and
  # `output` are passed straight through to get_updated_species_list().
  aligned_data <- get_aligned_data(taxa, resources)
  updated_species_list <-
    get_updated_species_list(aligned_data, resources, one_to_many, output)

  # The collapsed result is returned as-is; `full` is not consulted on this
  # path.
  if (one_to_many == "collapse_to_higher_taxon") {
    return(collapse_to_higher_taxon(updated_species_list, resources))
  }

  if (full == TRUE) {
    return(updated_species_list)
  }

  # Short form: keep only the key columns (renaming two of them) and drop
  # duplicate rows. `distinct()` is namespaced like every other dplyr verb
  # here so the call does not depend on dplyr being attached.
  updated_species_list %>%
    dplyr::select(
      original_name,
      aligned_name,
      apc_name = canonical_name,
      aligned_reason,
      taxonomic_status_of_aligned_name = taxonomicStatusClean
    ) %>%
    dplyr::distinct()
}

#' @noRd
validate_one_to_many_input <- function(one_to_many) {
valid_inputs <-
c("return_all",
"collapse_to_higher_taxon",
"most_likely_species")
if (!one_to_many %in% valid_inputs)
stop(
paste(
"Invalid input:",
one_to_many,
". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'."
)

aligned_data <-
unique(taxa) %>%
align_taxa(resources = resources)

)
}

#' Align the de-duplicated input names against the taxonomic resources
#'
#' @param taxa names to align; repeated entries are dropped first.
#' @param resources taxonomic resources forwarded to align_taxa().
#' @return the value returned by align_taxa() for the unique names.
#' @noRd
get_aligned_data <- function(taxa, resources) {
  distinct_taxa <- unique(taxa)
  align_taxa(distinct_taxa, resources = resources)
}


#' Wrapper for update_taxonomy that either summarizes to one species or returns all matches
#' @noRd
get_updated_species_list <-
function(aligned_data, resources, one_to_many, output) {
aligned_species_list_tmp <-
aligned_data$aligned_name %>% update_taxonomy(resources = resources)
aligned_data$aligned_name %>% update_taxonomy(resources = resources, output = output)

#should really be a function, but i'm not smart enough to see how to handle the outputs being different and the early return
if (one_to_many %in% c("return_all", "collapse_to_higher_taxon")) {
aligned_species_list <-
aligned_data %>% dplyr::select(original_name, aligned_name, aligned_reason) %>%
aligned_data %>%
dplyr::select(original_name, aligned_name, aligned_reason) %>%
dplyr::left_join(aligned_species_list_tmp,
by = c("aligned_name"),
multiple = "all") %>%
dplyr::filter(!is.na(taxonIDClean)) %>%
dplyr::mutate(genus = stringr::word(canonicalName, 1, 1)) %>%
dplyr::rename(canonical_name = canonicalName)
}

if (one_to_many == c("most_likely_species")) {
aligned_species_list <-
aligned_data %>% dplyr::select(original_name, aligned_name, aligned_reason) %>%
} else {
aligned_data %>%
dplyr::select(original_name, aligned_name, aligned_reason) %>%
dplyr::left_join(aligned_species_list_tmp,
by = c("aligned_name"),
multiple = "first") %>%
dplyr::filter(!is.na(taxonIDClean)) %>%
dplyr::mutate(genus = stringr::word(canonicalName, 1, 1)) %>%
dplyr::rename(canonical_name = canonicalName)
}

if (one_to_many == "collapse_to_higher_taxon") {
aligned_species_list %>%
group_by(original_name, aligned_name) %>%
summarise(
apc_names = find_mrct(canonical_name, resources = resources),
aligned_reason = paste(unique(aligned_reason), collapse = " and "),
taxonomicStatus = paste(unique(taxonomicStatusClean), collapse = " and "),
source = paste(unique(source), collapse = " and "),
number_of_collapsed_taxa = n()
) -> test
return(test)
}

if (full == TRUE) {
return(aligned_species_list)
}
if (full == FALSE) {
return(
dplyr::select(
aligned_species_list,
original_name,
aligned_name,
apc_name = canonical_name,
aligned_reason,
taxonomic_status_of_aligned_name = taxonomicStatusClean
) %>%
distinct() #may not be necessary
)
}
}



test_input <- function(input_char) {
valid_inputs <-
c("return_all",
"collapse_to_higher_taxon",
"most_likely_species")
if (!input_char %in% valid_inputs) {
stop(
paste(
"Invalid input:",
input_char,
". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'."
#' Currently only collapses to genus or all the way to plants
#' @noRd
collapse_to_higher_taxon <-
function(aligned_species_list, resources) {
aligned_species_list %>%
group_by(original_name, aligned_name) %>%
summarise(
apc_names = find_mrct(canonical_name, resources = resources),
aligned_reason = paste(unique(aligned_reason), collapse = " and "),
taxonomicStatus = paste(unique(taxonomicStatusClean), collapse = " and "),
source = paste(unique(source), collapse = " and "),
number_of_collapsed_taxa = n()
)
)
} else {
return(TRUE)
}
}



#not working yet
#' @noRd
find_mrct <- function(taxa,
wcornwell marked this conversation as resolved.
Show resolved Hide resolved
stable_or_current_data = "stable",
version = default_version(),
resources = load_taxonomic_resources(stable_or_current_data =
stable_or_current_data, version = version)) {
only_taxa_of_interest <-
resources = load_taxonomic_resources(stable_or_current_data =
stable_or_current_data,
version = version)) {
# Filter the resources data to only include the taxa of interest
relevant_taxa <-
dplyr::filter(resources$APC, resources$APC$canonicalName %in% taxa)
if (length(unique(only_taxa_of_interest$canonicalName)) == 1)
#all the same
return(only_taxa_of_interest$canonicalName[1])
if (length(unique(stringr::word(
only_taxa_of_interest$canonicalName, 1, 2
))) == 1)
#all species the same; different supspecific taxa
return(stringr::word(only_taxa_of_interest$canonicalName[1], 1, 2))
if (length(unique(stringr::word(
only_taxa_of_interest$canonicalName, 1, 1
))) == 1)
#all genera the same but different species
return(paste0(
stringr::word(only_taxa_of_interest$canonicalName[1], 1, 1),
" sp."
))
if (length(unique(only_taxa_of_interest$family)) == 1)
#all family the same but different genera
return(only_taxa_of_interest$family[1])
return("plants")

# Check different scenarios to find the most recent common taxon
unique_canonical_names <- unique(relevant_taxa$canonicalName)
unique_genus_species <-
unique(stringr::word(unique_canonical_names, 1, 2))
unique_genus <-
unique(stringr::word(unique_canonical_names, 1, 1))
unique_family <- unique(relevant_taxa$family)

if (length(unique_canonical_names) == 1) {
# All taxa are the same
return(unique_canonical_names[1])
} else if (length(unique_genus_species) == 1) {
# All species are the same, but different subspecific taxa
return(stringr::word(unique_canonical_names[1], 1, 2))
} else if (length(unique_genus) == 1) {
# All genera are the same, but different species
return(paste0(unique_genus, " sp."))
} else if (length(unique_family) == 1) {
# All families are the same, but different genera
return(unique_family[1])
} else {
# Return "plants" for other cases
return("plants")
}
}
Loading
Loading