Skip to content

Commit

Permalink
Added report template within the package TODO - more explanation with…
Browse files Browse the repository at this point in the history
…in the report to explain how to use it...
  • Loading branch information
Edouard-Legoupil committed Nov 7, 2023
1 parent aefa89b commit 74d6325
Show file tree
Hide file tree
Showing 92 changed files with 150,104 additions and 25,990 deletions.
6 changes: 6 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
^LICENSE\.md$
^\.github$
^_pkgdown\.yml$
^codecov\.yml$
^README\.Rmd$
^setting_testthat\.R$
^docs$
^data-raw$
^dev$
^pkgdown$
^CODE_OF_CONDUCT\.md$
^cleaningtools_ed\.Rproj$
9 changes: 6 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
Package: cleaningtools
Title: cleaningtools package focuses on data cleaning
Version: 0.0.0.9002
Version: 0.0.0.9003
Authors@R: c(
person("Mehedi", "Khan", , "mh.khan@reach-initiative.org", role = "aut",
comment = c(ORCID = "0000-0002-4276-4485")),
person("Yann", "Say", , "yann.say@impact-initiatives.org", role = c("cre", "aut"),
comment = c(ORCID = "0000-0002-7390-4209"))
comment = c(ORCID = "0000-0002-7390-4209")),
person("Edouard", "Legoupil", , "legoupil@nhcr.org", role = "ctb")
)
Description: The cleaningtools package focuses on cleaning, and has three
components: **Check**, which includes a set of functions that flag
Expand All @@ -32,11 +33,13 @@ Imports:
randomcoloR,
rlang,
snakecase,
stats,
stringi,
stringr,
tibble,
tidyr,
tidyselect
tidyselect,
utils
Suggests:
rmarkdown,
testthat (>= 3.0.0),
Expand Down
14 changes: 10 additions & 4 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
# cleaningtools (development version)
# cleaningtools 0.0.0.9003

Code review - transferred all function to a fusen notebook to review checking with examples
* Code review - transferred all functions to a [fusen notebook](https://thinkr-open.github.io/fusen/index.html) to review and check them with examples. All development can be done there to enforce better documentation

fixed globals.R
* fixed globals.R and a few missing library imports

* added a built-in report template with all the function parameters (`inst/rmarkdown/templates/clean/skeleton/skeleton.Rmd`)

* added demo files within a data-raw folder directly as xlsx files

# cleaningtools 0.0.0.9002

* Initial CRAN submission.
Initial version
4 changes: 2 additions & 2 deletions R/add_duration_from_audit.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,12 @@ add_duration_from_audit <- function(dataset,

if (exists("duration_with_sum_all")) {
dataset <- dataset %>%
dplyr::left_join(duration_with_sum_all, by = setNames("uuid", uuid_column))
dplyr::left_join(duration_with_sum_all, by = stats::setNames("uuid", uuid_column))
}

if (exists("duration_with_start_end")) {
dataset <- dataset %>%
dplyr::left_join(duration_with_start_end, by = setNames("uuid", uuid_column))
dplyr::left_join(duration_with_start_end, by = stats::setNames("uuid", uuid_column))
}

return(dataset)
Expand Down
10 changes: 5 additions & 5 deletions R/check_outliers.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ check_outliers <- function(dataset,
#######################


dataset <- type.convert(dataset, as.is = TRUE, na.string = c("", " ")) |>
dataset <- utils::type.convert(dataset, as.is = TRUE, na.string = c("", " ")) |>
dplyr::rename(
uuid = !!rlang::sym(uuid_column)
) |>
Expand Down Expand Up @@ -154,13 +154,13 @@ check_outliers <- function(dataset,
variable_value <- variable_value[!is.na(variable_value) & !is.null(variable_value) & !is.infinite(variable_value)]

if (!is.null(minimum_unique_value_of_variable)) {
outliers_tf_nr <- (abs(variable_value - mean(variable_value)) > strongness_factor * sd(variable_value)) &
outliers_tf_nr <- (abs(variable_value - mean(variable_value)) > strongness_factor * stats::sd(variable_value)) &
(length(unique(variable_value)) > minimum_unique_value_of_variable)
}


if (is.null(minimum_unique_value_of_variable)) {
outliers_tf_nr <- abs(variable_value - mean(variable_value)) > strongness_factor * sd(variable_value)
outliers_tf_nr <- abs(variable_value - mean(variable_value)) > strongness_factor * stats::sd(variable_value)
}


Expand Down Expand Up @@ -188,12 +188,12 @@ check_outliers <- function(dataset,
log_variable <- log_variable[!is.na(log_variable) & !is.null(log_variable) & !is.infinite(log_variable)]

if (!is.null(minimum_unique_value_of_variable)) {
outliers_tf <- abs(log_variable - mean(log_variable)) > strongness_factor * sd(log_variable) &
outliers_tf <- abs(log_variable - mean(log_variable)) > strongness_factor * stats::sd(log_variable) &
length(unique(log_variable)) > minimum_unique_value_of_variable
}

if (is.null(minimum_unique_value_of_variable)) {
outliers_tf <- abs(log_variable - mean(log_variable)) > strongness_factor * sd(log_variable)
outliers_tf <- abs(log_variable - mean(log_variable)) > strongness_factor * stats::sd(log_variable)
}


Expand Down
4 changes: 2 additions & 2 deletions R/create_audit_list.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@
create_audit_list <- function(audit_zip_path,
dataset = NULL,
uuid_column = "uuid") {
list_of_files <- unzip(audit_zip_path, list = TRUE) %>%
list_of_files <- utils::unzip(audit_zip_path, list = TRUE) %>%
dplyr::rename(path = Name) %>%
dplyr::filter(stringr::str_detect(path, pattern = "audit.csv"))

locatation_audit <- list_of_files %>%
dplyr::pull(path) %>%
stringr::str_split("/") %>%
purrr::set_names(list_of_files$path) %>%
purrr:::map_dbl(~ which(stringr::str_detect(.x, "audit.csv"))) %>%
purrr::map_dbl(~ which(stringr::str_detect(.x, "audit.csv"))) %>%
unique()

if (length(locatation_audit) != 1) {
Expand Down
2 changes: 1 addition & 1 deletion R/create_clean_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -170,5 +170,5 @@ create_clean_data <- function(raw_dataset,
print("no surveys to remove in log")
}

return(raw_dataset %>% type.convert(as.is = T))
return(raw_dataset %>% utils::type.convert(as.is = T))
}
4 changes: 2 additions & 2 deletions R/create_validation_list.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ create_validation_list <- function(choices, tool) {
) %>%
do.call(rbind, .) %>%
as.data.frame() %>%
setNames(c("name", "choices"))
stats::setNames(c("name", "choices"))

choicelist <- new_lists %>%
dplyr::bind_rows(create_formatted_choices(choices, tool) %>%
Expand All @@ -38,7 +38,7 @@ create_validation_list <- function(choices, tool) {
choice_validation <- choicelist %>%
unique() %>%
data.table::transpose() %>%
setNames(.[1, ]) %>%
stats::setNames(.[1, ]) %>%
dplyr::slice(-1) %>%
dplyr::mutate_all(~ stringr::str_split(., ";\n"))

Expand Down
2 changes: 2 additions & 0 deletions R/detect_uuid.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#' If there are 0 or 2 or more possible answers, it will
#' give an error.
#'
#' @keywords internal
#'
#' @export
#' @examples
#' test1_df <- data.frame(uuid = letters,
Expand Down
2 changes: 2 additions & 0 deletions R/knit_big_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#' @param .height Height in the html output
#'
#' @return Table with some features for quarto html output
#'
#' @keywords internal
#' @export
#' @examples
#' #knit_big_table(table_to_print)
Expand Down
5 changes: 4 additions & 1 deletion R/print_log.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
#' @param log_to_print A data frame to print
#' @param message A message to print if the data frame is empty
#'
#' @return The data frame to print or an alternative message if the dataframe is empty
#' @return The data frame to print or an alternative message
#' if the dataframe is empty
#'
#' @keywords internal
#' @export
#' @examples
#' #print_log(log_to_print, "No values flagged")
Expand Down
2 changes: 1 addition & 1 deletion R/review_sample_frame_with_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ review_sample_frame_with_dataset <- function(sample_frame,

sample_frame |>
dplyr::select(-dplyr::any_of(c("Collected", "Remaining"))) |>
dplyr::left_join(actual_df, by = setNames(clean_dataset_strata_column, sampling_frame_strata_column)) |>
dplyr::left_join(actual_df, by = stats::setNames(clean_dataset_strata_column, sampling_frame_strata_column)) |>
dplyr::mutate(Collected = dplyr::case_when(is.na(Collected) ~ 0, T ~ Collected)) |>
dplyr::mutate(
Remaining = !!rlang::sym(sampling_frame_target_survey_column) - Collected
Expand Down
4 changes: 4 additions & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@ reference:
desc: functions building cleaning
- contents:
- starts_with("coerce_")
- title: checks
desc: functions building cleaning
- contents:
- starts_with("check_")
Binary file not shown.
35 changes: 21 additions & 14 deletions dev/function_documentation.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -452,12 +452,12 @@ add_duration_from_audit <- function(dataset,
if (exists("duration_with_sum_all")) {
dataset <- dataset %>%
dplyr::left_join(duration_with_sum_all, by = setNames("uuid", uuid_column))
dplyr::left_join(duration_with_sum_all, by = stats::setNames("uuid", uuid_column))
}
if (exists("duration_with_start_end")) {
dataset <- dataset %>%
dplyr::left_join(duration_with_start_end, by = setNames("uuid", uuid_column))
dplyr::left_join(duration_with_start_end, by = stats::setNames("uuid", uuid_column))
}
return(dataset)
Expand Down Expand Up @@ -553,15 +553,15 @@ test_that("add_duration_from_audit works", {
create_audit_list <- function(audit_zip_path,
dataset = NULL,
uuid_column = "uuid") {
list_of_files <- unzip(audit_zip_path, list = TRUE) %>%
list_of_files <- utils::unzip(audit_zip_path, list = TRUE) %>%
dplyr::rename(path = Name) %>%
dplyr::filter(stringr::str_detect(path, pattern = "audit.csv"))
locatation_audit <- list_of_files %>%
dplyr::pull(path) %>%
stringr::str_split("/") %>%
purrr::set_names(list_of_files$path) %>%
purrr:::map_dbl(~ which(stringr::str_detect(.x, "audit.csv"))) %>%
purrr::map_dbl(~ which(stringr::str_detect(.x, "audit.csv"))) %>%
unique()
if (length(locatation_audit) != 1) {
Expand Down Expand Up @@ -1460,7 +1460,7 @@ check_outliers <- function(dataset,
#######################
dataset <- type.convert(dataset, as.is = TRUE, na.string = c("", " ")) |>
dataset <- utils::type.convert(dataset, as.is = TRUE, na.string = c("", " ")) |>
dplyr::rename(
uuid = !!rlang::sym(uuid_column)
) |>
Expand Down Expand Up @@ -1538,13 +1538,13 @@ check_outliers <- function(dataset,
variable_value <- variable_value[!is.na(variable_value) & !is.null(variable_value) & !is.infinite(variable_value)]
if (!is.null(minimum_unique_value_of_variable)) {
outliers_tf_nr <- (abs(variable_value - mean(variable_value)) > strongness_factor * sd(variable_value)) &
outliers_tf_nr <- (abs(variable_value - mean(variable_value)) > strongness_factor * stats::sd(variable_value)) &
(length(unique(variable_value)) > minimum_unique_value_of_variable)
}
if (is.null(minimum_unique_value_of_variable)) {
outliers_tf_nr <- abs(variable_value - mean(variable_value)) > strongness_factor * sd(variable_value)
outliers_tf_nr <- abs(variable_value - mean(variable_value)) > strongness_factor * stats::sd(variable_value)
}
Expand Down Expand Up @@ -1572,12 +1572,12 @@ check_outliers <- function(dataset,
log_variable <- log_variable[!is.na(log_variable) & !is.null(log_variable) & !is.infinite(log_variable)]
if (!is.null(minimum_unique_value_of_variable)) {
outliers_tf <- abs(log_variable - mean(log_variable)) > strongness_factor * sd(log_variable) &
outliers_tf <- abs(log_variable - mean(log_variable)) > strongness_factor * stats::sd(log_variable) &
length(unique(log_variable)) > minimum_unique_value_of_variable
}
if (is.null(minimum_unique_value_of_variable)) {
outliers_tf <- abs(log_variable - mean(log_variable)) > strongness_factor * sd(log_variable)
outliers_tf <- abs(log_variable - mean(log_variable)) > strongness_factor * stats::sd(log_variable)
}
Expand Down Expand Up @@ -2231,7 +2231,7 @@ create_clean_data <- function(raw_dataset,
print("no surveys to remove in log")
}
return(raw_dataset %>% type.convert(as.is = T))
return(raw_dataset %>% utils::type.convert(as.is = T))
}
```

Expand Down Expand Up @@ -3774,7 +3774,7 @@ review_sample_frame_with_dataset <- function(sample_frame,
sample_frame |>
dplyr::select(-dplyr::any_of(c("Collected", "Remaining"))) |>
dplyr::left_join(actual_df, by = setNames(clean_dataset_strata_column, sampling_frame_strata_column)) |>
dplyr::left_join(actual_df, by = stats::setNames(clean_dataset_strata_column, sampling_frame_strata_column)) |>
dplyr::mutate(Collected = dplyr::case_when(is.na(Collected) ~ 0, T ~ Collected)) |>
dplyr::mutate(
Remaining = !!rlang::sym(sampling_frame_target_survey_column) - Collected
Expand Down Expand Up @@ -4270,7 +4270,7 @@ create_validation_list <- function(choices, tool) {
) %>%
do.call(rbind, .) %>%
as.data.frame() %>%
setNames(c("name", "choices"))
stats::setNames(c("name", "choices"))
choicelist <- new_lists %>%
dplyr::bind_rows(create_formatted_choices(choices, tool) %>%
Expand All @@ -4279,7 +4279,7 @@ create_validation_list <- function(choices, tool) {
choice_validation <- choicelist %>%
unique() %>%
data.table::transpose() %>%
setNames(.[1, ]) %>%
stats::setNames(.[1, ]) %>%
dplyr::slice(-1) %>%
dplyr::mutate_all(~ stringr::str_split(., ";\n"))
Expand Down Expand Up @@ -4455,6 +4455,8 @@ test_that("coerce_to_character works", {
#' If there are 0 or 2 or more possible answers, it will
#' give an error.
#'
#' @keywords internal
#'
#' @export
Expand Down Expand Up @@ -4497,6 +4499,8 @@ test_that("detect_uuid works", {
#' @param .height Height in the html output
#'
#' @return Table with some features for quarto html output
#'
#' @keywords internal
#' @export
knit_big_table <- function(table_to_print, .height = "500px") {
Expand Down Expand Up @@ -4526,7 +4530,10 @@ test_that("knit_big_table works", {
#' @param log_to_print A data frame to print
#' @param message A message to print if the data frame is empty
#'
#' @return The data frame to print or an alternative message if the dataframe is empty
#' @return The data frame to print or an alternative message
#' if the dataframe is empty
#'
#' @keywords internal
#' @export
print_log <- function(log_to_print, message) {
Expand Down
2 changes: 2 additions & 0 deletions docs/404.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions docs/CODE_OF_CONDUCT.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions docs/LICENSE-text.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions docs/LICENSE.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 74d6325

Please sign in to comment.