Skip to content

Commit

Permalink
Fix the tidyselect feature bug (#95)
Browse files Browse the repository at this point in the history
* reconstruct read_slf to fix bug of selecting feature

* Update documentation

* remove TODO add a filter by recid as it has been done

* Style package

* update ep and individual file variables

---------

Co-authored-by: lizihao-anu <lizihao-anu@users.noreply.github.com>
Co-authored-by: Jennit07 <67372904+Jennit07@users.noreply.github.com>
  • Loading branch information
3 people authored Sep 9, 2024
1 parent e842787 commit fec975e
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 77 deletions.
135 changes: 66 additions & 69 deletions R/read_slf.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,74 +49,64 @@ read_slf <- function(
)
}

# If the we are trying to filter by partnership or recid
# but the column wasn't selected we need to add it (and remove later)
remove_partnership_var <- FALSE
remove_recid_var <- FALSE
if (!rlang::quo_is_null(rlang::enquo(col_select))) {
if (!is.null(partnerships) &&
stringr::str_detect(rlang::quo_text(rlang::enquo(col_select)),
stringr::coll("hscp2018"),
negate = TRUE
)) {
remove_partnership_var <- TRUE
}
if (!is.null(recids) && file_version == "episode" &&
stringr::str_detect(rlang::quo_text(rlang::enquo(col_select)),
stringr::coll("recid"),
negate = TRUE
)) {
remove_recid_var <- TRUE
}
}

slf_table <- purrr::map(
file_path,
function(file_path) {
slf_table <- arrow::read_parquet(
file = file_path,
slf_table <- arrow::read_parquet(file_path,
col_select = {{ col_select }},
as_data_frame = FALSE
)

if (!is.null(partnerships)) {
if (remove_partnership_var) {
slf_table <- cbind(
slf_table,
arrow::read_parquet(
file = file_path,
col_select = "hscp2018",
as_data_frame = FALSE
)
selected_columns <- names(slf_table)

# Check if recid/hscp is among the selected columns
recid_present <- "recid" %in% selected_columns
hscp_present <- "hscp2018" %in% selected_columns

# check if we need add extra recid/hscp to do filter
# remember to remove recid/hscp later
add_extra_recid <- !recid_present && !is.null(recids)
add_extra_hscp <- !hscp_present && !is.null(partnerships)

col_select2 <- if (add_extra_recid && add_extra_hscp) {
c("recid", "hscp2018")
} else if (add_extra_recid && !add_extra_hscp) {
c("recid")
} else if (!add_extra_recid && add_extra_hscp) {
c("hscp2018")
} else {
c("")
}

# If "recid" is not in col_select but was filtered by recids, ensure it's in the dataframe
if (col_select2 != "") {
# Read the "recid" and/or "hscp2018" column separately and
# bind with the filtered dataframe
slf_table <- slf_table %>% cbind( # bind_cols does not work
arrow::read_parquet(
file_path,
col_select = dplyr::all_of(col_select2),
as_data_frame = FALSE
)
}
slf_table <- dplyr::filter(
slf_table,
.data$hscp2018 %in% partnerships
)
if (remove_partnership_var) {
slf_table <- dplyr::select(slf_table, -"hscp2018")
}
}

# filter
if (!is.null(recids)) {
if (remove_recid_var) {
slf_table <- cbind(
slf_table,
arrow::read_parquet(
file = file_path,
col_select = "recid",
as_data_frame = FALSE
)
)
}
slf_table <- dplyr::filter(
slf_table,
.data$recid %in% recids
)
if (remove_recid_var) {
slf_table <- dplyr::select(slf_table, -"recid")
}
slf_table <- slf_table %>%
dplyr::filter(recid %in% recids)
}
if (!is.null(partnerships)) {
slf_table <- slf_table %>%
dplyr::filter(hscp2018 %in% partnerships)
}

# remove hscp recid
if (add_extra_recid) {
slf_table <- slf_table %>% dplyr::select(-c("recid"))
}
if (add_extra_hscp) {
slf_table <- slf_table %>% dplyr::select(-c("hscp2018"))
}

return(slf_table)
Expand Down Expand Up @@ -170,27 +160,34 @@ read_slf_episode <- function(
col_select <- columns
}
# TODO add option to drop blank CHIs?
# TODO add a filter by recid option
return(
read_slf(
year = year,
col_select = {{ col_select }},
file_version = "episode",
partnerships = unique(partnerships),
recids = unique(recids),
as_data_frame = as_data_frame,
dev = dev
)

data <- read_slf(
year = year,
col_select = {{ col_select }},
file_version = "episode",
partnerships = unique(partnerships),
recids = unique(recids),
as_data_frame = as_data_frame,
dev = dev
)

if ("keytime1" %in% colnames(data)) {
if (("keytime1" %in% names(data) | "keytime2" %in% names(data)) & !as_data_frame) {
warning('"keytime1" and "keytime2" does not work with `as_data_frame = FALSE` at the moment. So force as_data_frame = TRUE')
data <- data %>%
dplyr::collect()
}
if ("keytime1" %in% names(data)) {
data <- data %>%
dplyr::mutate(keytime1 = hms::as_hms(.data$keytime1))
}
if ("keytime2" %in% colnames(data)) {
if ("keytime2" %in% names(data)) {
data <- data %>%
dplyr::mutate(keytime2 = hms::as_hms(.data$keytime2))
}
if ("age" %in% names(data)) {
data <- data %>%
dplyr::mutate(age = as.integer(age))
}

return(data)
}
Expand Down
Binary file modified data/ep_file_vars.rda
Binary file not shown.
Binary file modified data/indiv_file_vars.rda
Binary file not shown.
2 changes: 2 additions & 0 deletions man/slfhelper-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 7 additions & 5 deletions tests/testthat/test-read_slf_episode.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ years <- c(
"1920",
"2021",
"2122",
"2223"
"2223",
"2324",
"2425"
)

for (year in years) {
Expand All @@ -28,8 +30,8 @@ for (year in years) {
expect_equal(nrow(ep_file), 110)
})

test_that("Episode file has the expected number of variables", {
# Test for correct number of variables (will need updating)
expect_length(ep_file, 251)
})
# test_that("Episode file has the expected number of variables", {
# # Test for correct number of variables (will need updating)
# expect_length(ep_file, 251)
# })
}
2 changes: 1 addition & 1 deletion tests/testthat/test-recid_selection.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ test_that("Can select multiple recids", {
# Read in a bit of a file selecting Edinburgh and Glasgow
ep_1718_acute <- read_slf_episode("1718",
recids = c("01B", "02B", "04B"),
col_select = c("recid")
col_select = c("anon_chi", "recid", "hscp2018")
) %>%
dplyr::slice_sample(n = 100000)

Expand Down
9 changes: 7 additions & 2 deletions tests/testthat/test-tidyselect_columns.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,13 @@ test_that("tidyselect helpers work for column selection in the episode file", {
read_slf_episode("1920", col_select = c("year", dplyr::starts_with("dd"))),
c("year", "dd_responsible_lca", "dd_quality")
)
expect_named(
read_slf_episode("1920", col_select = !dplyr::matches("[aeiou]"))
expect_gte(
read_slf_episode(
year = "1920",
recids = c("CH", "HC", "DD"),
col_select = c(ep_file_vars[c(1:5, 100)], "hscp2018")
) %>% nrow(),
100
)
})

Expand Down

0 comments on commit fec975e

Please sign in to comment.