diff --git a/R/clean_names.R b/R/clean_names.R
index 3f112c07..6c286dd6 100644
--- a/R/clean_names.R
+++ b/R/clean_names.R
@@ -4,6 +4,7 @@
 
 
 
+
 #' Find taxonomic alignments for a list of names to a version of the Australian Plant Census (APC) through standardizing formatting and checking for spelling issues
 #'
 #' This function uses Australian Plant Census (APC) & the Australian Plant Name Index (APNI) to find taxonomic alignments for a list of names.
@@ -76,7 +77,7 @@ align_taxa <- function(original_name,
     dplyr::bind_rows(
       taxa_raw,
       tibble::tibble(
-        original_name = subset(original_name,!original_name %in% taxa_raw$original_name) %>% unique(),
+        original_name = subset(original_name, !original_name %in% taxa_raw$original_name) %>% unique(),
         cleaned_name = NA_character_,
         stripped_name = NA_character_,
         stripped_name2 = NA_character_,
@@ -486,10 +487,11 @@ standardise_names <- function(taxon_names) {
 #'
 #' @param taxa A list of Australian plant species that needs to be reconciled with current taxonomy.
 #' @param stable_or_current_data either "stable" for a consistent version, or "current" for the leading edge version.
-#' @param version The version number of the dataset to use. 
-#' @param one_to_many How to handle one_to_many taxonomic matches.  Default is "return_all".  The other options are "collapse_to_higher_taxon" and "most likely species". Most likely species defaults to the original_name if that name is accepted by the APC.
+#' @param version The version number of the dataset to use.
+#' @param one_to_many How to handle one_to_many taxonomic matches.  Default is "return_all".  The other options are "collapse_to_higher_taxon" and "most_likely_species". most_likely_species defaults to the original_name if that name is accepted by the APC; this will be right for certain species subsets, but make errors in other cases, use with caution.
 #' @param full logical for whether the full lookup table is returned or just the two key columns
-#' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer.  If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} seperately and pass the data in.
+#' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer.  If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in.
+#' @param output file path to save the intermediate output to
 #' @return A lookup table containing the original species names, the aligned species names, and additional taxonomic information such as taxon IDs and genera.
 #' @export
 #'
@@ -502,136 +504,135 @@ standardise_names <- function(taxon_names) {
 #'                                  "Not a species"),
 #'                                  resources=resources)
 #'
-create_taxonomic_update_lookup <-
-  function(taxa,
-           stable_or_current_data = "stable",
-           version = default_version(),
-           one_to_many = "return_all",
-           full = FALSE,
-           resources = load_taxonomic_resources(stable_or_current_data =
-                                                  stable_or_current_data, version = version)) {
-    valid_inputs <-
-      c("return_all",
-        "collapse_to_higher_taxon",
-        "most_likely_species")
-    if (!one_to_many %in% valid_inputs)
-      stop(
-        paste(
-          "Invalid input:",
-          input_char,
-          ". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'."
-        )
-      )
-    
-    aligned_data <-
-      unique(taxa) %>%
-      align_taxa(resources = resources)
-    
-    aligned_species_list_tmp <-
-      aligned_data$aligned_name %>% update_taxonomy(resources = resources)
-    
-    #should really be a function, but i'm not smart enough to see how to handle the outputs being different and the early return
-    if (one_to_many %in% c("return_all", "collapse_to_higher_taxon")) {
-      aligned_species_list <-
-        aligned_data %>% dplyr::select(original_name, aligned_name, aligned_reason) %>%
-        dplyr::left_join(aligned_species_list_tmp,
-                         by = c("aligned_name"),
-                         multiple = "all") %>%
-        dplyr::filter(!is.na(taxonIDClean)) %>%
-        dplyr::mutate(genus = stringr::word(canonicalName, 1, 1)) %>%
-        dplyr::rename(canonical_name = canonicalName)
-    }
-    
-    if (one_to_many == c("most_likely_species")) {
-      aligned_species_list <-
-        aligned_data %>% dplyr::select(original_name, aligned_name, aligned_reason) %>%
-        dplyr::left_join(aligned_species_list_tmp,
-                         by = c("aligned_name"),
-                         multiple = "first") %>%
-        dplyr::filter(!is.na(taxonIDClean)) %>%
-        dplyr::mutate(genus = stringr::word(canonicalName, 1, 1)) %>%
-        dplyr::rename(canonical_name = canonicalName)
-    }
-    
-    if (one_to_many == "collapse_to_higher_taxon") {
-      aligned_species_list %>%
-        group_by(original_name, aligned_name) %>%
-        summarise(
-          apc_names = find_mrct(canonical_name, resources = resources),
-          aligned_reason = paste(unique(aligned_reason), collapse = " and "),
-          taxonomicStatus = paste(unique(taxonomicStatusClean), collapse = " and "),
-          source = paste(unique(source), collapse = " and "),
-          number_of_collapsed_taxa = n()
-        ) -> test
-      return(test)
-    }
-    
-    if (full == TRUE) {
-      return(aligned_species_list)
-    }
-    if (full == FALSE) {
-      return(
-        dplyr::select(
-          aligned_species_list,
-          original_name,
-          aligned_name,
-          apc_name = canonical_name,
-          aligned_reason,
-          taxonomic_status_of_aligned_name = taxonomicStatusClean
-        ) %>%
-          distinct() #may not be necessary
-      )
-    }
+create_taxonomic_update_lookup <- function(taxa,
+                                           stable_or_current_data = "stable",
+                                           version = default_version(),
+                                           one_to_many = "return_all",
+                                           full = FALSE,
+                                           resources = load_taxonomic_resources(stable_or_current_data = 
+                                                                                  stable_or_current_data, 
+                                                                                version = version),
+                                           output = NULL) {
+  validate_one_to_many_input(one_to_many)
+  aligned_data <- get_aligned_data(taxa, resources)
+  updated_species_list <-
+    get_updated_species_list(aligned_data, resources, one_to_many, output)
+  
+  if (one_to_many == "collapse_to_higher_taxon") {
+    return(collapse_to_higher_taxon(updated_species_list, resources))
   }
+  
+  if (full == TRUE) {
+    return(updated_species_list)
+  } else {
+    return(
+      dplyr::select(
+        updated_species_list,
+        original_name,
+        aligned_name,
+        apc_name = canonical_name,
+        aligned_reason,
+        taxonomic_status_of_aligned_name = taxonomicStatusClean
+      ) %>%
+        distinct()
+    )
+  }
+}
 
-
-
-test_input <- function(input_char) {
+#' @noRd
+validate_one_to_many_input <- function(one_to_many) {
   valid_inputs <-
     c("return_all",
       "collapse_to_higher_taxon",
       "most_likely_species")
-  if (!input_char %in% valid_inputs) {
+  if (!one_to_many %in% valid_inputs)
     stop(
       paste(
         "Invalid input:",
-        input_char,
+        one_to_many,
         ". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'."
       )
     )
-  } else {
-    return(TRUE)
-  }
 }
 
+#' @noRd
+get_aligned_data <- function(taxa, resources) {
+  unique(taxa) %>% align_taxa(resources = resources)
+}
+
+
+#' Wrapper for update_taxonomy that either summarizes to one species or returns all matches
+#' @noRd
+get_updated_species_list <-
+  function(aligned_data, resources, one_to_many, output) {
+    aligned_species_list_tmp <-
+      aligned_data$aligned_name %>% update_taxonomy(resources = resources, output = output)
+    
+    if (one_to_many %in% c("return_all", "collapse_to_higher_taxon")) {
+      multiple = "all"
+    } else {
+      multiple = "first"
+    }
+
+    aligned_data %>%
+      dplyr::select(original_name, aligned_name, aligned_reason) %>%
+      dplyr::left_join(aligned_species_list_tmp,
+                        by = c("aligned_name"),
+                        multiple = multiple) %>%
+      dplyr::filter(!is.na(taxonIDClean)) %>%
+      dplyr::mutate(genus = stringr::word(canonicalName, 1, 1)) %>%
+      dplyr::rename(canonical_name = canonicalName)
+  }
+
 
+#' Currently only collapses to genus or all the way to plants
+#' @noRd
+collapse_to_higher_taxon <-
+  function(aligned_species_list, resources) {
+    aligned_species_list %>%
+      group_by(original_name, aligned_name) %>%
+      summarise(
+        apc_names = find_mrct(canonical_name, resources = resources),
+        aligned_reason = paste(unique(aligned_reason), collapse = " and "),
+        taxonomicStatus = paste(unique(taxonomicStatusClean), collapse = " and "),
+        source = paste(unique(source), collapse = " and "),
+        number_of_collapsed_taxa = n()
+      )
+  }
 
-#not working yet
+#' @noRd
 find_mrct <- function(taxa,
                       stable_or_current_data = "stable",
                       version = default_version(),
-                      resources = load_taxonomic_resources(stable_or_current_data =
-                                                             stable_or_current_data, version = version)) {
-  only_taxa_of_interest <-
+                      resources = load_taxonomic_resources(stable_or_current_data = 
+                                                             stable_or_current_data, 
+                                                           version = version)) {
+  # Filter the resources data to only include the taxa of interest
+  relevant_taxa <-
     dplyr::filter(resources$APC, resources$APC$canonicalName %in% taxa)
-  if (length(unique(only_taxa_of_interest$canonicalName)) == 1)
-    #all the same
-    return(only_taxa_of_interest$canonicalName[1])
-  if (length(unique(stringr::word(
-    only_taxa_of_interest$canonicalName, 1, 2
-  ))) == 1)
-    #all species the same; different supspecific taxa
-    return(stringr::word(only_taxa_of_interest$canonicalName[1], 1, 2))
-  if (length(unique(stringr::word(
-    only_taxa_of_interest$canonicalName, 1, 1
-  ))) == 1)
-    #all genera the same but different species
-    return(paste0(
-      stringr::word(only_taxa_of_interest$canonicalName[1], 1, 1),
-      " sp."
-    ))
-  if (length(unique(only_taxa_of_interest$family)) == 1)
-    #all family the same but different genera
-    return(only_taxa_of_interest$family[1])
-  return("plants")
+  
+  # Check different scenarios to find the most recent common taxon
+  unique_canonical_names <- unique(relevant_taxa$canonicalName)
+  unique_genus_species <-
+    unique(stringr::word(unique_canonical_names, 1, 2))
+  unique_genus <-
+    unique(stringr::word(unique_canonical_names, 1, 1))
+  unique_family <- unique(relevant_taxa$family)
+  
+  if (length(unique_canonical_names) == 1) {
+    # All taxa are the same
+    return(unique_canonical_names[1])
+  } else if (length(unique_genus_species) == 1) {
+    # All species are the same, but different subspecific taxa
+    return(stringr::word(unique_canonical_names[1], 1, 2))
+  } else if (length(unique_genus) == 1) {
+    # All genera are the same, but different species
+    return(paste0(unique_genus, " sp."))
+  } else if (length(unique_family) == 1) {
+    # All families are the same, but different genera
+    return(unique_family[1])
+  } else {
+    # Return "plants" for other cases
+    return("plants")
+  }
 }
diff --git a/R/state_diversity.R b/R/state_diversity.R
index a0c9c02e..7ccb153a 100644
--- a/R/state_diversity.R
+++ b/R/state_diversity.R
@@ -23,87 +23,69 @@
 #' @examples
 #' create_species_state_origin_matrix()
 #'
-create_species_state_origin_matrix <-
-  function(resources = load_taxonomic_resources()) {
-    apc_species <-
-      dplyr::filter(resources$APC,
-                    taxonRank == "Species" &
-                      taxonomicStatus == "accepted")
-    #seperate the states
-    sep_state_data <-
-      stringr::str_split(unique(apc_species$taxonDistribution), ",")
-    
-    #get unique places
-    all_codes <- unique(stringr::str_trim(unlist(sep_state_data)))
-    apc_places <-
-      unique(stringr::word(all_codes[!is.na(all_codes)], 1, 1))
-    
-    #make a table to fill in
-    data.frame(col.names = apc_places)
-    species_df <-
-      dplyr::tibble(species = apc_species$scientificName)
-    for (i in 1:length(apc_places)) {
-      species_df <-
-        dplyr::bind_cols(species_df, NA, .name_repair = "minimal")
-    }
-    names(species_df) <- c("species", apc_places)
-    
-    #look for all possible entries after each state
-    state_parse_and_add_column <-
-      function(species_df, state, apc_species) {
-        # print(all_codes[grepl(state,all_codes)]) # checking for weird ones
-        species_df[, state] <- dplyr::case_when(
-          grepl(
-            paste0("\\b", state, " \\(uncertain origin\\)"),
-            apc_species$taxonDistribution
-          ) ~ "uncertain origin",
-          grepl(
-            paste0("\\b", state, " \\(naturalised\\)"),
-            apc_species$taxonDistribution
-          ) ~ "naturalised",
-          grepl(
-            paste0("\\b", state, " \\(doubtfully naturalised\\)"),
-            apc_species$taxonDistribution
-          ) ~ "doubtfully naturalised",
-          grepl(
-            paste0("\\b", state, " \\(native and naturalised\\)"),
-            apc_species$taxonDistribution
-          ) ~ "native and naturalised",
-          grepl(
-            paste0("\\b", state, " \\(formerly naturalised\\)"),
-            apc_species$taxonDistribution
-          ) ~ "formerly naturalised",
-          grepl(
-            paste0("\\b", state, " \\(presumed extinct\\)"),
-            apc_species$taxonDistribution
-          ) ~ "presumed extinct",
-          grepl(
-            paste0("\\b", state, " \\(native and doubtfully naturalised\\)"),
-            apc_species$taxonDistribution
-          ) ~ "native and doubtfully naturalised",
-          grepl(
-            paste0("\\b", state, " \\(native and uncertain origin\\)"),
-            apc_species$taxonDistribution
-          ) ~ "native and uncertain origin",
-          grepl(paste0("\\b", state), apc_species$taxonDistribution) ~ "native",
-          #no entry = native, it's important this is last in the list
-          TRUE ~ "not present"
-        )
-        return(species_df)
-      }
-    
-    #bug checking
-    #species_df<-state_parse_and_add_column(species_df,"LHI",apc_species)
-    #species_df<-state_parse_and_add_column(species_df,"HI",apc_species)
-    
-    #go through the states one by one
-    for (i in 1:length(apc_places)) {
-      species_df <-
-        state_parse_and_add_column(species_df, apc_places[i], apc_species)
-    }
-    return(species_df)
+#'
+#'
+create_species_state_origin_matrix <- function(resources = load_taxonomic_resources()) {
+  apc_species <- filter_data_to_accepted_species(resources)
+  sep_state_data <- separate_states(apc_species)
+  apc_places <- identify_places(sep_state_data)
+  species_df <- create_species_df(apc_places, apc_species)
+  result_df <- parse_states(species_df, apc_places, apc_species)
+  return(result_df)
+}
+
+#' @noRd
+filter_data_to_accepted_species <- function(resources) {
+  dplyr::filter(resources$APC,
+                taxonRank == "Species" &
+                  taxonomicStatus == "accepted")
+}
+
+#' @noRd
+separate_states <- function(data) {
+  stringr::str_split(unique(data$taxonDistribution), ",")
+}
+
+#' @noRd
+identify_places <- function(sep_state_data) {
+  all_codes <- unique(stringr::str_trim(unlist(sep_state_data)))
+  unique(stringr::word(all_codes[!is.na(all_codes)], 1, 1))
+}
+
+#' @noRd
+create_species_df <- function(apc_places, apc_species) {
+  species_df <- dplyr::tibble(species = apc_species$scientificName)
+  for (i in 1:length(apc_places)) {
+    species_df <- dplyr::bind_cols(species_df, NA, .name_repair = "minimal")
   }
+  names(species_df) <- c("species", apc_places)
+  return(species_df)
+}
 
+#' @noRd
+state_parse_and_add_column <- function(species_df, state, apc_species) {
+  species_df[, state] <- dplyr::case_when(
+    grepl(paste0("\\b", state, " \\(uncertain origin\\)"), apc_species$taxonDistribution) ~ "uncertain origin",
+    grepl(paste0("\\b", state, " \\(naturalised\\)"), apc_species$taxonDistribution) ~ "naturalised",
+    grepl(paste0("\\b", state, " \\(doubtfully naturalised\\)"), apc_species$taxonDistribution) ~ "doubtfully naturalised",
+    grepl(paste0("\\b", state, " \\(native and naturalised\\)"), apc_species$taxonDistribution) ~ "native and naturalised",
+    grepl(paste0("\\b", state, " \\(formerly naturalised\\)"), apc_species$taxonDistribution) ~ "formerly naturalised",
+    grepl(paste0("\\b", state, " \\(presumed extinct\\)"), apc_species$taxonDistribution) ~ "presumed extinct",
+    grepl(paste0("\\b", state, " \\(native and doubtfully naturalised\\)"), apc_species$taxonDistribution) ~ "native and doubtfully naturalised",
+    grepl(paste0("\\b", state, " \\(native and uncertain origin\\)"), apc_species$taxonDistribution) ~ "native and uncertain origin",
+    grepl(paste0("\\b", state), apc_species$taxonDistribution) ~ "native",
+    TRUE ~ "not present"
+  )
+  return(species_df)
+}
+
+#' @noRd
+parse_states <- function(species_df, apc_places, apc_species) {
+  for (i in 1:length(apc_places)) {
+    species_df <- state_parse_and_add_column(species_df, apc_places[i], apc_species)
+  }
+  return(species_df)
+}
 
 
 
diff --git a/man/create_species_state_origin_matrix.Rd b/man/create_species_state_origin_matrix.Rd
index 60750158..f0895853 100644
--- a/man/create_species_state_origin_matrix.Rd
+++ b/man/create_species_state_origin_matrix.Rd
@@ -18,6 +18,8 @@ This function processes the geographic data available in the current or any vers
 \examples{
 create_species_state_origin_matrix()
 
+
+
 }
 \seealso{
 \code{\link{load_taxonomic_resources}}
diff --git a/man/create_taxonomic_update_lookup.Rd b/man/create_taxonomic_update_lookup.Rd
index b0bed753..c2dd5906 100644
--- a/man/create_taxonomic_update_lookup.Rd
+++ b/man/create_taxonomic_update_lookup.Rd
@@ -11,7 +11,8 @@ create_taxonomic_update_lookup(
   one_to_many = "return_all",
   full = FALSE,
   resources = load_taxonomic_resources(stable_or_current_data = stable_or_current_data,
-    version = version)
+    version = version),
+  output = NULL
 )
 }
 \arguments{
@@ -21,11 +22,13 @@ create_taxonomic_update_lookup(
 
 \item{version}{The version number of the dataset to use.}
 
-\item{one_to_many}{How to handle one_to_many taxonomic matches.  Default is "return_all".  The other options are "collapse_to_higher_taxon" and "most likely species". Most likely species defaults to the original_name if that name is accepted by the APC.}
+\item{one_to_many}{How to handle one_to_many taxonomic matches.  Default is "return_all".  The other options are "collapse_to_higher_taxon" and "most_likely_species". most_likely_species defaults to the original_name if that name is accepted by the APC; this will be right for certain species subsets, but make errors in other cases, use with caution.}
 
 \item{full}{logical for whether the full lookup table is returned or just the two key columns}
 
-\item{resources}{These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer.  If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} seperately and pass the data in.}
+\item{resources}{These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer.  If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in.}
+
+\item{output}{file path to save the intermediate output to}
 }
 \value{
 A lookup table containing the original species names, the aligned species names, and additional taxonomic information such as taxon IDs and genera.