traitecoevo · wcornwell · Jul 27, 2023 · Jul 25, 2023 · Jul 25, 2023 · Jul 26, 2023
diff --git a/R/clean_names.R b/R/clean_names.R
@@ -4,6 +4,7 @@
 
 
 
+
 #' Find taxonomic alignments for a list of names to a version of the Australian Plant Census (APC) through standardizing formatting and checking for spelling issues
 #'
 #' This function uses Australian Plant Census (APC) & the Australian Plant Name Index (APNI) to find taxonomic alignments for a list of names.
@@ -76,7 +77,7 @@ align_taxa <- function(original_name,
     dplyr::bind_rows(
       taxa_raw,
       tibble::tibble(
-        original_name = subset(original_name,!original_name %in% taxa_raw$original_name) %>% unique(),
+        original_name = subset(original_name, !original_name %in% taxa_raw$original_name) %>% unique(),
         cleaned_name = NA_character_,
         stripped_name = NA_character_,
         stripped_name2 = NA_character_,
@@ -486,10 +487,11 @@ standardise_names <- function(taxon_names) {
 #'
 #' @param taxa A list of Australian plant species that needs to be reconciled with current taxonomy.
 #' @param stable_or_current_data either "stable" for a consistent version, or "current" for the leading edge version.
-#' @param version The version number of the dataset to use. 
-#' @param one_to_many How to handle one_to_many taxonomic matches.  Default is "return_all".  The other options are "collapse_to_higher_taxon" and "most likely species". Most likely species defaults to the original_name if that name is accepted by the APC.
+#' @param version The version number of the dataset to use.
+#' @param one_to_many How to handle one_to_many taxonomic matches.  Default is "return_all".  The other options are "collapse_to_higher_taxon" and "most_likely_species". most_likely_species defaults to the original_name if that name is accepted by the APC; this will be right for certain species subsets, but make errors in other cases, use with caution.
 #' @param full logical for whether the full lookup table is returned or just the two key columns
-#' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer.  If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} seperately and pass the data in.
+#' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer.  If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in.
+#' @param output file path to save the intermediate output to
 #' @return A lookup table containing the original species names, the aligned species names, and additional taxonomic information such as taxon IDs and genera.
 #' @export
 #'
@@ -502,136 +504,140 @@ standardise_names <- function(taxon_names) {
 #'                                  "Not a species"),
 #'                                  resources=resources)
 #'
-create_taxonomic_update_lookup <-
-  function(taxa,
-           stable_or_current_data = "stable",
-           version = default_version(),
-           one_to_many = "return_all",
-           full = FALSE,
-           resources = load_taxonomic_resources(stable_or_current_data =
-                                                  stable_or_current_data, version = version)) {
-    valid_inputs <-
-      c("return_all",
-        "collapse_to_higher_taxon",
-        "most_likely_species")
-    if (!one_to_many %in% valid_inputs)
-      stop(
-        paste(
-          "Invalid input:",
-          input_char,
-          ". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'."
-        )
+create_taxonomic_update_lookup <- function(taxa,
+                                           stable_or_current_data = "stable",
+                                           version = default_version(),
+                                           one_to_many = "return_all",
+                                           full = FALSE,
+                                           resources = load_taxonomic_resources(stable_or_current_data = 
+                                                                                  stable_or_current_data, 
+                                                                                version = version),
+                                           output = NULL) {
+  validate_one_to_many_input(one_to_many)
+  aligned_data <- get_aligned_data(taxa, resources)
+  updated_species_list <-
+    get_updated_species_list(aligned_data, resources, one_to_many, output)
+
+  if (one_to_many == "collapse_to_higher_taxon") {
+    return(collapse_to_higher_taxon(updated_species_list, resources))
+  }
+
+  if (full == TRUE) {
+    return(updated_species_list)
+  } else {
+    return(
+      dplyr::select(
+        updated_species_list,
+        original_name,
+        aligned_name,
+        apc_name = canonical_name,
+        aligned_reason,
+        taxonomic_status_of_aligned_name = taxonomicStatusClean
+      ) %>%
+        distinct()
+    )
+  }
+}
+
+#' @noRd
+validate_one_to_many_input <- function(one_to_many) {
+  valid_inputs <-
+    c("return_all",
+      "collapse_to_higher_taxon",
+      "most_likely_species")
+  if (!one_to_many %in% valid_inputs)
+    stop(
+      paste(
+        "Invalid input:",
+        one_to_many,
+        ". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'."
       )
-
-    aligned_data <-
-      unique(taxa) %>%
-      align_taxa(resources = resources)
-
+    )
+}
+
+#' @noRd
+get_aligned_data <- function(taxa, resources) {
+  unique(taxa) %>% align_taxa(resources = resources)
+}
+
+
+#' Wrapper for update_taxonomy that either summarizes to one species or returns all matches
+#' @noRd
+get_updated_species_list <-
+  function(aligned_data, resources, one_to_many, output) {
     aligned_species_list_tmp <-
-      aligned_data$aligned_name %>% update_taxonomy(resources = resources)
+      aligned_data$aligned_name %>% update_taxonomy(resources = resources, output = output)
 
-    #should really be a function, but i'm not smart enough to see how to handle the outputs being different and the early return
     if (one_to_many %in% c("return_all", "collapse_to_higher_taxon")) {
-      aligned_species_list <-
-        aligned_data %>% dplyr::select(original_name, aligned_name, aligned_reason) %>%
+      aligned_data %>%
+        dplyr::select(original_name, aligned_name, aligned_reason) %>%
         dplyr::left_join(aligned_species_list_tmp,
                          by = c("aligned_name"),
                          multiple = "all") %>%
         dplyr::filter(!is.na(taxonIDClean)) %>%
         dplyr::mutate(genus = stringr::word(canonicalName, 1, 1)) %>%
         dplyr::rename(canonical_name = canonicalName)
-    }
-
-    if (one_to_many == c("most_likely_species")) {
-      aligned_species_list <-
-        aligned_data %>% dplyr::select(original_name, aligned_name, aligned_reason) %>%
+    } else {
+      aligned_data %>%
+        dplyr::select(original_name, aligned_name, aligned_reason) %>%
         dplyr::left_join(aligned_species_list_tmp,
                          by = c("aligned_name"),
                          multiple = "first") %>%
         dplyr::filter(!is.na(taxonIDClean)) %>%
         dplyr::mutate(genus = stringr::word(canonicalName, 1, 1)) %>%
         dplyr::rename(canonical_name = canonicalName)
     }
-
-    if (one_to_many == "collapse_to_higher_taxon") {
-      aligned_species_list %>%
-        group_by(original_name, aligned_name) %>%
-        summarise(
-          apc_names = find_mrct(canonical_name, resources = resources),
-          aligned_reason = paste(unique(aligned_reason), collapse = " and "),
-          taxonomicStatus = paste(unique(taxonomicStatusClean), collapse = " and "),
-          source = paste(unique(source), collapse = " and "),
-          number_of_collapsed_taxa = n()
-        ) -> test
-      return(test)
-    }
-
-    if (full == TRUE) {
-      return(aligned_species_list)
-    }
-    if (full == FALSE) {
-      return(
-        dplyr::select(
-          aligned_species_list,
-          original_name,
-          aligned_name,
-          apc_name = canonical_name,
-          aligned_reason,
-          taxonomic_status_of_aligned_name = taxonomicStatusClean
-        ) %>%
-          distinct() #may not be necessary
-      )
-    }
   }
 
 
-
-test_input <- function(input_char) {
-  valid_inputs <-
-    c("return_all",
-      "collapse_to_higher_taxon",
-      "most_likely_species")
-  if (!input_char %in% valid_inputs) {
-    stop(
-      paste(
-        "Invalid input:",
-        input_char,
-        ". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'."
+#' Currently only collapses to genus or all the way to plants
+#' @noRd
+collapse_to_higher_taxon <-
+  function(aligned_species_list, resources) {
+    aligned_species_list %>%
+      group_by(original_name, aligned_name) %>%
+      summarise(
+        apc_names = find_mrct(canonical_name, resources = resources),
+        aligned_reason = paste(unique(aligned_reason), collapse = " and "),
+        taxonomicStatus = paste(unique(taxonomicStatusClean), collapse = " and "),
+        source = paste(unique(source), collapse = " and "),
+        number_of_collapsed_taxa = n()
       )
-    )
-  } else {
-    return(TRUE)
   }
-}
 
-
-
-#not working yet
+#' @noRd
 find_mrct <- function(taxa,
                       stable_or_current_data = "stable",
                       version = default_version(),
-                      resources = load_taxonomic_resources(stable_or_current_data =
-                                                             stable_or_current_data, version = version)) {
-  only_taxa_of_interest <-
+                      resources = load_taxonomic_resources(stable_or_current_data = 
+                                                             stable_or_current_data, 
+                                                           version = version)) {
+  # Filter the resources data to only include the taxa of interest
+  relevant_taxa <-
     dplyr::filter(resources$APC, resources$APC$canonicalName %in% taxa)
-  if (length(unique(only_taxa_of_interest$canonicalName)) == 1)
-    #all the same
-    return(only_taxa_of_interest$canonicalName[1])
-  if (length(unique(stringr::word(
-    only_taxa_of_interest$canonicalName, 1, 2
-  ))) == 1)
-    #all species the same; different supspecific taxa
-    return(stringr::word(only_taxa_of_interest$canonicalName[1], 1, 2))
-  if (length(unique(stringr::word(
-    only_taxa_of_interest$canonicalName, 1, 1
-  ))) == 1)
-    #all genera the same but different species
-    return(paste0(
-      stringr::word(only_taxa_of_interest$canonicalName[1], 1, 1),
-      " sp."
-    ))
-  if (length(unique(only_taxa_of_interest$family)) == 1)
-    #all family the same but different genera
-    return(only_taxa_of_interest$family[1])
-  return("plants")
+
+  # Check different scenarios to find the most recent common taxon
+  unique_canonical_names <- unique(relevant_taxa$canonicalName)
+  unique_genus_species <-
+    unique(stringr::word(unique_canonical_names, 1, 2))
+  unique_genus <-
+    unique(stringr::word(unique_canonical_names, 1, 1))
+  unique_family <- unique(relevant_taxa$family)
+
+  if (length(unique_canonical_names) == 1) {
+    # All taxa are the same
+    return(unique_canonical_names[1])
+  } else if (length(unique_genus_species) == 1) {
+    # All species are the same, but different subspecific taxa
+    return(stringr::word(unique_canonical_names[1], 1, 2))
+  } else if (length(unique_genus) == 1) {
+    # All genera are the same, but different species
+    return(paste0(unique_genus, " sp."))
+  } else if (length(unique_family) == 1) {
+    # All families are the same, but different genera
+    return(unique_family[1])
+  } else {
+    # Return "plants" for other cases
+    return("plants")
+  }
 }