Add austraits functions to dataset_test and allow trait values to…

… be excluded in `exclude_observations` (#123) - Moved austraits functions `trait_pivot_wider` and `trait_pivot_longer` to traits.build in `pivot.R` and added testing within `dataset_test` - Moved code for testing pivot wider into standalone function called `check_pivot_wider()` - Allowed trait values to be excluded in `exclude_observations`, where previously only metadata fields/locations, etc. could be excluded - Added extra tests in `dataset_test` for `exclude_observations` and `taxonomic_updates` sections - Replaced `testthat` functions with own functions to enable customisation of failure messages - Regenerated `taxon_list.csv` for test datasets and documented method
traitecoevo · Nov 15, 2023 · 9a9be4c · 9a9be4c
1 parent 27ecb28
commit 9a9be4c
Show file tree

Hide file tree

Showing 43 changed files with 1,797 additions and 1,181 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -5,13 +5,16 @@ export(build_add_version)
 export(build_combine)
 export(build_setup_pipeline)
 export(check_duplicates)
+export(check_pivot_wider)
 export(dataset_build)
 export(dataset_configure)
 export(dataset_find_taxon)
 export(dataset_process)
 export(dataset_report)
 export(dataset_test)
 export(dataset_update_taxonomy)
+export(db_traits_pivot_longer)
+export(db_traits_pivot_wider)
 export(get_schema)
 export(get_unit_conversions)
 export(metadata_add_contexts)

diff --git a/R/pivot.R b/R/pivot.R
@@ -0,0 +1,138 @@
+#' @title Test whether a dataset can pivot wider
+#'
+#' @description Test whether the traits table of a dataset can pivot wider with the minimum required columns.
+#'
+#' @param dataset Built dataset with `test_build_dataset`
+#'
+#' @return Number of rows with duplicates preventing pivoting wider
+#' @export
+
+check_pivot_wider <- function(dataset) {
+
+  duplicates <- dataset$traits %>%
+    select(
+      dplyr::all_of(c("dataset_id", "trait_name", "value", "observation_id", "value_type",
+      "repeat_measurements_id", "method_id", "method_context_id"))
+    ) %>%
+    tidyr::pivot_wider(names_from = "trait_name", values_from = "value", values_fn = length) %>%
+    tidyr::pivot_longer(cols = 7:ncol(.)) %>%
+    dplyr::rename(dplyr::all_of(c("trait_name" = "name", "number_of_duplicates" = "value"))) %>%
+    select(
+      dplyr::all_of(c("dataset_id", "trait_name", "number_of_duplicates", "observation_id",
+      "value_type")), everything()
+    ) %>%
+    filter(.data$number_of_duplicates > 1) %>%
+    nrow()
+
+  if (duplicates == 0) {
+    invisible(TRUE)
+  } else {
+     invisible(FALSE)
+  }
+
+}
+
+
+#' @title Pivot long format data into a wide format
+#'
+#' @description `trait_pivot_wider` "widens" long format data ("tidy data").
+#'
+#' Databases built with `traits.build` are organised in a long format where observations are on different rows and the
+#' type of observation is denoted by various identifying columns (e.g `trait_name`, `dataset_id`,
+#' `observation_id`, etc.).
+#' This function converts the data into wide format so that each trait in its own column.
+#'
+#' @param traits The traits table from database (list object)
+#' @return A tibble in wide format
+#' @details `trait_pivot_wider` will return a single wide tibble; note that some meta-data columns
+#' (unit, replicates, measurement_remarks, basis_of_record, basis_of_value) will be excluded to
+#' produce a useful wide tibble.
+#' @examples
+#' \dontrun{
+#' data <- austraits$traits %>% filter(dataset_id == "Falster_2003")
+#' data # Long format
+#' traits_wide <- trait_pivot_wider(data)
+#' traits_wide # Wide format
+#' }
+#' @author Daniel Falster - daniel.falster@unsw.edu.au
+#' @export
+db_traits_pivot_wider <- function(traits) {
+
+  metadata_cols <- c("unit", "replicates", "measurement_remarks", "basis_of_value")
+
+  # A check for if there are more than 1 value_type for a given taxon_name, observation_id and method
+  check_value_type <- traits %>%
+    select(dplyr::all_of(c(
+      "trait_name", "value", "dataset_id", "observation_id", "method_id", "method_context_id",
+      "repeat_measurements_id", "value_type"))) %>%
+    dplyr::group_by(
+      .data$dataset_id, .data$observation_id, .data$method_id,
+      .data$method_context_id, .data$repeat_measurements_id) %>%
+    dplyr::summarise(n_value_type = length(unique(.data$value_type))) %>%
+    arrange(.data$observation_id) %>%
+    dplyr::filter(.data$n_value_type > 1)
+
+  if (nrow(check_value_type) > 1) {
+
+    traits %>%
+      tidyr::pivot_wider(
+        names_from = "trait_name",
+        values_from = "value",
+        id_cols = -dplyr::all_of(metadata_cols)
+      )
+
+  } else {
+
+    metadata_cols <- c(metadata_cols, "value_type")
+
+    traits %>%
+      tidyr::pivot_wider(
+        names_from = "trait_name",
+        values_from = "value",
+        id_cols = -dplyr::all_of(metadata_cols)
+      )
+  }
+
+}
+
+
+#' @title Pivot wide format data into a long format
+#'
+#' @description `trait_pivot_longer` "gathers" wide format data into a "tidy" format.
+#'
+#' This function converts the data into long format where observations are on different rows and the type of
+#' observation is denoted by the `trait_name` column.
+#' In other words, `trait_pivot_longer` reverts the actions of `trait_pivot_wider`.
+#' @param wide_data Output from `trait_pivot_wider` (a tibble of wide data)
+#' @return A tibble in long format
+#' @details
+#' `trait_pivot_longer` will return a tibble with fewer columns than the original traits table
+#' The excluded columns include: "unit", "replicates", "measurement_remarks", "basis_of_record",
+#' "basis_of_value" # Double check #TODO
+#'
+#' @examples
+#' \dontrun{
+#' data <- austraits$traits %>%
+#' filter(dataset_id == "Falster_2003")
+#' data # Long format
+#' traits_wide <- trait_pivot_wider(data)
+#' traits_wide # Wide format
+#'
+#' values_long <- trait_pivot_longer(traits_wide)
+#' }
+#' @author Daniel Falster - daniel.falster@unsw.edu.au
+#' @author Fonti Kar - fonti.kar@unsw.edu.au
+#' @export
+db_traits_pivot_longer <- function(wide_data) {
+
+  # The start of the trait columns is after `original_name`
+  start_of_trait_cols <- which(names(wide_data) == "original_name") + 1
+
+  wide_data %>%
+    tidyr::pivot_longer(
+      cols = start_of_trait_cols:ncol(.),
+      names_to = "trait_name",
+      values_drop_na = TRUE
+    )
+
+}
diff --git a/R/process.R b/R/process.R
@@ -115,7 +115,7 @@ dataset_process <- function(filename_data_raw,
         "parsing_id", "location_name", "taxonomic_resolution", "methods", "unit_in")
     )
 
-  # Replace location_name with a location_id
+  # Replace old `location_id` with a new `location_id`
   if (nrow(locations) > 0) {
     traits <-
       traits %>%
@@ -900,12 +900,25 @@ process_flag_excluded_observations <- function(data, metadata) {
 
   fix <- split(fix, fix$variable)
 
+  traits <- metadata$traits %>% util_list_to_df2
+
   for (v in names(fix))
 
-    data <- data %>%
-      dplyr::mutate(
-        error = ifelse(.data[[v]] %in% fix[[v]]$find,
-        "Observation excluded in metadata", .data$error))
+    if (v %in% traits$trait_name) {
+      data <- data %>%
+        dplyr::mutate(
+          error = ifelse(
+            .data$trait_name == v & .data$value %in% fix[[v]]$find,
+            "Observation excluded in metadata",
+            .data$error))
+    } else {
+      data <- data %>%
+        dplyr::mutate(
+          error = ifelse(
+            .data[[v]] %in% fix[[v]]$find,
+            "Observation excluded in metadata",
+            .data$error))
+    }
 
   data
 }

diff --git a/R/reports.R b/R/reports.R
@@ -1,19 +1,19 @@
 
 #' Build reports for listed datasets
 #'
-#' Builds a detailed report for every dataset with a unique `dataset_id`, based on the template Rmd file provided.  The reports are
-#' rendered as html files and saved in the specified output folder.
+#' Builds a detailed report for every dataset with a unique `dataset_id`, based on the template Rmd file provided.
+#' The reports are rendered as html files and saved in the specified output folder.
 #'
-#' @param dataset_id name of specific study/dataset
-#' @param austraits compiled austraits database
-#' @param overwrite logical value to determine whether to overwrite existing report,
-#' @param output_path location where rendered report will be saved
-#' @param input_file report script (.Rmd) file to build study report
-#' @param quiet An option to suppress printing during rendering from knitr, pandoc command line and others.
-#' @param keep keep intermediate Rmd file used?
+#' @param dataset_id Name of specific study/dataset
+#' @param austraits Compiled austraits database
+#' @param overwrite Logical value to determine whether to overwrite existing report
+#' @param output_path Location where rendered report will be saved
+#' @param input_file Report script (.Rmd) file to build study report
+#' @param quiet An option to suppress printing during rendering from knitr, pandoc command line and others
+#' @param keep Keep intermediate Rmd file used?
 #'
 #' @rdname dataset_report
-#' @return html file of the rendered report located in the specified output folder.
+#' @return Html file of the rendered report located in the specified output folder
 #' @export
 dataset_report <- function(dataset_id, austraits, overwrite = FALSE,
                            output_path = "export/reports",
@@ -41,7 +41,7 @@ dataset_report_worker <- function(dataset_id, austraits, overwrite = FALSE,
     dir.create(output_path, FALSE, TRUE)
   }
 
-  # filenames
+  # Filenames
   input_Rmd <- sprintf("tmp_%s_report.Rmd", dataset_id)
   output_html <- sprintf("%s/%s.html", output_path, dataset_id)
 
@@ -54,7 +54,7 @@ dataset_report_worker <- function(dataset_id, austraits, overwrite = FALSE,
     x[2] <- sprintf("title: Report on study `%s` from", dataset_id)
     writeLines(x, input_Rmd)
 
-    # knit and render. Note, call render directly
+    # Knit and render. Note, call render directly
     # in preference to knit, then render, as leaflet widget
     # requires this to work
     # Warning: result assigned but may not be used
@@ -82,7 +82,7 @@ dataset_report_worker <- function(dataset_id, austraits, overwrite = FALSE,
 
 #' Format table with kable and default styling for html
 #'
-#' @param ... arguments passed to `kableExtra::kable()`
+#' @param ... Arguments passed to `kableExtra::kable()`
 #' @importFrom rlang .data
 #' @export
 util_kable_styling_html <- function(...) {
@@ -94,6 +94,6 @@ util_kable_styling_html <- function(...) {
                   position = "left"
                   )
 
-    # hack to add margin to plot
+    # Hack to add margin to plot
     gsub('style="width: auto ', 'style="margin-left:30px; width: auto ', txt)
 }
diff --git a/R/setup.R b/R/setup.R
@@ -9,7 +9,7 @@ metadata_path_dataset_id <- function(dataset_id) {
 
 #' Create a template of file `metadata.yml` for specified `dataset_id`
 #'
-#' Includes place-holders for major sections of the metadata
+#' Includes place-holders for major sections of the metadata.
 #'
 #' @inheritParams metadata_path_dataset_id
 #' @param path Location of file where output is saved
@@ -869,7 +869,7 @@ metadata_add_substitutions_table <- function(dataframe_of_substitutions, dataset
 #' @param reason Reason for taxonomic change
 #' @param taxonomic_resolution The rank of the most specific taxon name (or scientific name)
 #' to which a submitted orignal name resolves
-#' @param overwrite Parameter indicating whether preexisting find-replace entries should be overwritten. Defaults to `true` 
+#' @param overwrite Parameter indicating whether preexisting find-replace entries should be overwritten. Defaults to `true`
 #'
 #' @return `metadata.yml` file with taxonomic change added
 #' @export
@@ -888,7 +888,7 @@ metadata_add_taxonomic_change <- function(dataset_id, find, replace, reason, tax
   if (all(is.na(metadata[[set_name]]))) {
     data <- to_add
   } else {
-    data <- util_list_to_df2(metadata[[set_name]]) 
+    data <- util_list_to_df2(metadata[[set_name]])
     # Check if find record already exists for that trait
     if (find %in% data$find) {
       # If overwrite set to false, don't add a new substitution
@@ -897,8 +897,8 @@ metadata_add_taxonomic_change <- function(dataset_id, find, replace, reason, tax
         return(invisible())
       # Default is to overwrite existing substitution
       } else {
-        message(sprintf(red("Existing substitution will be overwritten for ") %+% green("'%s'"), find))     
-        data <- data %>% 
+        message(sprintf(red("Existing substitution will be overwritten for ") %+% green("'%s'"), find))
+        data <- data %>%
                   filter(.data$find != to_add$find) %>%
                   dplyr::bind_rows(to_add) %>%
                   filter(!.data$find == replace) %>%