Skip to content

Commit

Permalink
Add austraits functions to dataset_test and allow trait values to…
Browse files Browse the repository at this point in the history
… be excluded in `exclude_observations` (#123)

- Moved austraits functions `trait_pivot_wider` and `trait_pivot_longer` to traits.build in `pivot.R` and added testing within `dataset_test`
- Moved code for testing pivot wider into standalone function called `check_pivot_wider()`
- Allowed trait values to be excluded in `exclude_observations`, where previously only metadata fields/locations, etc. could be excluded
- Added extra tests in `dataset_test` for `exclude_observations` and `taxonomic_updates` sections
- Replaced `testthat` functions with own functions to enable customisation of failure messages
- Regenerated `taxon_list.csv` for test datasets and documented method
  • Loading branch information
yangsophieee authored Nov 15, 2023
1 parent 27ecb28 commit 9a9be4c
Show file tree
Hide file tree
Showing 43 changed files with 1,797 additions and 1,181 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@ export(build_add_version)
export(build_combine)
export(build_setup_pipeline)
export(check_duplicates)
export(check_pivot_wider)
export(dataset_build)
export(dataset_configure)
export(dataset_find_taxon)
export(dataset_process)
export(dataset_report)
export(dataset_test)
export(dataset_update_taxonomy)
export(db_traits_pivot_longer)
export(db_traits_pivot_wider)
export(get_schema)
export(get_unit_conversions)
export(metadata_add_contexts)
Expand Down
138 changes: 138 additions & 0 deletions R/pivot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#' @title Test whether a dataset can pivot wider
#'
#' @description Test whether the traits table of a dataset can pivot wider with the minimum required columns.
#'
#' @param dataset Built dataset with `test_build_dataset`
#'
#' @return Number of rows with duplicates preventing pivoting wider
#' @export

check_pivot_wider <- function(dataset) {

duplicates <- dataset$traits %>%
select(
dplyr::all_of(c("dataset_id", "trait_name", "value", "observation_id", "value_type",
"repeat_measurements_id", "method_id", "method_context_id"))
) %>%
tidyr::pivot_wider(names_from = "trait_name", values_from = "value", values_fn = length) %>%
tidyr::pivot_longer(cols = 7:ncol(.)) %>%
dplyr::rename(dplyr::all_of(c("trait_name" = "name", "number_of_duplicates" = "value"))) %>%
select(
dplyr::all_of(c("dataset_id", "trait_name", "number_of_duplicates", "observation_id",
"value_type")), everything()
) %>%
filter(.data$number_of_duplicates > 1) %>%
nrow()

if (duplicates == 0) {
invisible(TRUE)
} else {
invisible(FALSE)
}

}


#' @title Pivot long format data into a wide format
#'
#' @description `trait_pivot_wider` "widens" long format data ("tidy data").
#'
#' Databases built with `traits.build` are organised in a long format where observations are on different rows and the
#' type of observation is denoted by various identifying columns (e.g `trait_name`, `dataset_id`,
#' `observation_id`, etc.).
#' This function converts the data into wide format so that each trait in its own column.
#'
#' @param traits The traits table from database (list object)
#' @return A tibble in wide format
#' @details `trait_pivot_wider` will return a single wide tibble; note that some meta-data columns
#' (unit, replicates, measurement_remarks, basis_of_record, basis_of_value) will be excluded to
#' produce a useful wide tibble.
#' @examples
#' \dontrun{
#' data <- austraits$traits %>% filter(dataset_id == "Falster_2003")
#' data # Long format
#' traits_wide <- trait_pivot_wider(data)
#' traits_wide # Wide format
#' }
#' @author Daniel Falster - daniel.falster@unsw.edu.au
#' @export
db_traits_pivot_wider <- function(traits) {

metadata_cols <- c("unit", "replicates", "measurement_remarks", "basis_of_value")

# A check for if there are more than 1 value_type for a given taxon_name, observation_id and method
check_value_type <- traits %>%
select(dplyr::all_of(c(
"trait_name", "value", "dataset_id", "observation_id", "method_id", "method_context_id",
"repeat_measurements_id", "value_type"))) %>%
dplyr::group_by(
.data$dataset_id, .data$observation_id, .data$method_id,
.data$method_context_id, .data$repeat_measurements_id) %>%
dplyr::summarise(n_value_type = length(unique(.data$value_type))) %>%
arrange(.data$observation_id) %>%
dplyr::filter(.data$n_value_type > 1)

if (nrow(check_value_type) > 1) {

traits %>%
tidyr::pivot_wider(
names_from = "trait_name",
values_from = "value",
id_cols = -dplyr::all_of(metadata_cols)
)

} else {

metadata_cols <- c(metadata_cols, "value_type")

traits %>%
tidyr::pivot_wider(
names_from = "trait_name",
values_from = "value",
id_cols = -dplyr::all_of(metadata_cols)
)
}

}


#' @title Pivot wide format data into a long format
#'
#' @description `trait_pivot_longer` "gathers" wide format data into a "tidy" format.
#'
#' This function converts the data into long format where observations are on different rows and the type of
#' observation is denoted by the `trait_name` column.
#' In other words, `trait_pivot_longer` reverts the actions of `trait_pivot_wider`.
#' @param wide_data Output from `trait_pivot_wider` (a tibble of wide data)
#' @return A tibble in long format
#' @details
#' `trait_pivot_longer` will return a tibble with fewer columns than the original traits table
#' The excluded columns include: "unit", "replicates", "measurement_remarks", "basis_of_record",
#' "basis_of_value" # Double check #TODO
#'
#' @examples
#' \dontrun{
#' data <- austraits$traits %>%
#' filter(dataset_id == "Falster_2003")
#' data # Long format
#' traits_wide <- trait_pivot_wider(data)
#' traits_wide # Wide format
#'
#' values_long <- trait_pivot_longer(traits_wide)
#' }
#' @author Daniel Falster - daniel.falster@unsw.edu.au
#' @author Fonti Kar - fonti.kar@unsw.edu.au
#' @export
db_traits_pivot_longer <- function(wide_data) {

# The start of the trait columns is after `original_name`
start_of_trait_cols <- which(names(wide_data) == "original_name") + 1

wide_data %>%
tidyr::pivot_longer(
cols = start_of_trait_cols:ncol(.),
names_to = "trait_name",
values_drop_na = TRUE
)

}
23 changes: 18 additions & 5 deletions R/process.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ dataset_process <- function(filename_data_raw,
"parsing_id", "location_name", "taxonomic_resolution", "methods", "unit_in")
)

# Replace location_name with a location_id
# Replace old `location_id` with a new `location_id`
if (nrow(locations) > 0) {
traits <-
traits %>%
Expand Down Expand Up @@ -900,12 +900,25 @@ process_flag_excluded_observations <- function(data, metadata) {

fix <- split(fix, fix$variable)

traits <- metadata$traits %>% util_list_to_df2

for (v in names(fix))

data <- data %>%
dplyr::mutate(
error = ifelse(.data[[v]] %in% fix[[v]]$find,
"Observation excluded in metadata", .data$error))
if (v %in% traits$trait_name) {
data <- data %>%
dplyr::mutate(
error = ifelse(
.data$trait_name == v & .data$value %in% fix[[v]]$find,
"Observation excluded in metadata",
.data$error))
} else {
data <- data %>%
dplyr::mutate(
error = ifelse(
.data[[v]] %in% fix[[v]]$find,
"Observation excluded in metadata",
.data$error))
}

data
}
Expand Down
28 changes: 14 additions & 14 deletions R/reports.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@

#' Build reports for listed datasets
#'
#' Builds a detailed report for every dataset with a unique `dataset_id`, based on the template Rmd file provided. The reports are
#' rendered as html files and saved in the specified output folder.
#' Builds a detailed report for every dataset with a unique `dataset_id`, based on the template Rmd file provided.
#' The reports are rendered as html files and saved in the specified output folder.
#'
#' @param dataset_id name of specific study/dataset
#' @param austraits compiled austraits database
#' @param overwrite logical value to determine whether to overwrite existing report,
#' @param output_path location where rendered report will be saved
#' @param input_file report script (.Rmd) file to build study report
#' @param quiet An option to suppress printing during rendering from knitr, pandoc command line and others.
#' @param keep keep intermediate Rmd file used?
#' @param dataset_id Name of specific study/dataset
#' @param austraits Compiled austraits database
#' @param overwrite Logical value to determine whether to overwrite existing report
#' @param output_path Location where rendered report will be saved
#' @param input_file Report script (.Rmd) file to build study report
#' @param quiet An option to suppress printing during rendering from knitr, pandoc command line and others
#' @param keep Keep intermediate Rmd file used?
#'
#' @rdname dataset_report
#' @return html file of the rendered report located in the specified output folder.
#' @return Html file of the rendered report located in the specified output folder
#' @export
dataset_report <- function(dataset_id, austraits, overwrite = FALSE,
output_path = "export/reports",
Expand Down Expand Up @@ -41,7 +41,7 @@ dataset_report_worker <- function(dataset_id, austraits, overwrite = FALSE,
dir.create(output_path, FALSE, TRUE)
}

# filenames
# Filenames
input_Rmd <- sprintf("tmp_%s_report.Rmd", dataset_id)
output_html <- sprintf("%s/%s.html", output_path, dataset_id)

Expand All @@ -54,7 +54,7 @@ dataset_report_worker <- function(dataset_id, austraits, overwrite = FALSE,
x[2] <- sprintf("title: Report on study `%s` from", dataset_id)
writeLines(x, input_Rmd)

# knit and render. Note, call render directly
# Knit and render. Note, call render directly
# in preference to knit, then render, as leaflet widget
# requires this to work
# Warning: result assigned but may not be used
Expand Down Expand Up @@ -82,7 +82,7 @@ dataset_report_worker <- function(dataset_id, austraits, overwrite = FALSE,

#' Format table with kable and default styling for html
#'
#' @param ... arguments passed to `kableExtra::kable()`
#' @param ... Arguments passed to `kableExtra::kable()`
#' @importFrom rlang .data
#' @export
util_kable_styling_html <- function(...) {
Expand All @@ -94,6 +94,6 @@ util_kable_styling_html <- function(...) {
position = "left"
)

# hack to add margin to plot
# Hack to add margin to plot
gsub('style="width: auto ', 'style="margin-left:30px; width: auto ', txt)
}
10 changes: 5 additions & 5 deletions R/setup.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata_path_dataset_id <- function(dataset_id) {

#' Create a template of file `metadata.yml` for specified `dataset_id`
#'
#' Includes place-holders for major sections of the metadata
#' Includes place-holders for major sections of the metadata.
#'
#' @inheritParams metadata_path_dataset_id
#' @param path Location of file where output is saved
Expand Down Expand Up @@ -869,7 +869,7 @@ metadata_add_substitutions_table <- function(dataframe_of_substitutions, dataset
#' @param reason Reason for taxonomic change
#' @param taxonomic_resolution The rank of the most specific taxon name (or scientific name)
#' to which a submitted orignal name resolves
#' @param overwrite Parameter indicating whether preexisting find-replace entries should be overwritten. Defaults to `true`
#' @param overwrite Parameter indicating whether preexisting find-replace entries should be overwritten. Defaults to `true`
#'
#' @return `metadata.yml` file with taxonomic change added
#' @export
Expand All @@ -888,7 +888,7 @@ metadata_add_taxonomic_change <- function(dataset_id, find, replace, reason, tax
if (all(is.na(metadata[[set_name]]))) {
data <- to_add
} else {
data <- util_list_to_df2(metadata[[set_name]])
data <- util_list_to_df2(metadata[[set_name]])
# Check if find record already exists for that trait
if (find %in% data$find) {
# If overwrite set to false, don't add a new substitution
Expand All @@ -897,8 +897,8 @@ metadata_add_taxonomic_change <- function(dataset_id, find, replace, reason, tax
return(invisible())
# Default is to overwrite existing substitution
} else {
message(sprintf(red("Existing substitution will be overwritten for ") %+% green("'%s'"), find))
data <- data %>%
message(sprintf(red("Existing substitution will be overwritten for ") %+% green("'%s'"), find))
data <- data %>%
filter(.data$find != to_add$find) %>%
dplyr::bind_rows(to_add) %>%
filter(!.data$find == replace) %>%
Expand Down
Loading

0 comments on commit 9a9be4c

Please sign in to comment.