Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve docs for data_to_wide #506

Merged
merged 30 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ CHANGES
If you recode into a numeric variable, and one of the recode values is `NA`,
you no longer need to use `NA_real_` for numeric `NA` values.

* Improved documentation for some functions.

BUG FIXES

* `data_to_long()` did not work for data frames where columns had attributes
Expand Down
39 changes: 21 additions & 18 deletions R/data_read.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
#' factors, where imported value labels will be set as factor levels. If a
#' numeric variable has _no_ value labels or fewer value labels than values, it
#' is not converted to factor. In this case, value labels are preserved as
#' `"labels"` attribute. Character vectors are preserved. Use
#' `"labels"` attribute. Character vectors are preserved. Use
#' `convert_factors = FALSE` to remove the automatic conversion of numeric
#' variables to factors.
#'
Expand Down Expand Up @@ -105,7 +105,7 @@
por = .read_spss(path, encoding, convert_factors, verbose, ...),
dta = .read_stata(path, encoding, convert_factors, verbose, ...),
sas7bdat = .read_sas(path, path_catalog, encoding, convert_factors, verbose, ...),
.read_unknown(path, convert_factors, verbose, ...)
.read_unknown(path, file_type, convert_factors, verbose, ...)
)

# tell user about empty columns
Expand Down Expand Up @@ -161,7 +161,7 @@
# user may decide whether we automatically detect variable type or not
if (isTRUE(convert_factors)) {
if (verbose) {
msg <- "Variables where all values have associated labels are now converted into factors. If this is not intended, use `convert_factors = FALSE`."

Check warning on line 164 in R/data_read.R

View workflow job for this annotation

GitHub Actions / lint / lint

file=R/data_read.R,line=164,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 152 characters.

Check warning on line 164 in R/data_read.R

View workflow job for this annotation

GitHub Actions / lint-changed-files / lint-changed-files

file=R/data_read.R,line=164,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 152 characters.
insight::format_alert(msg)
}
x[] <- lapply(x, function(i) {
Expand All @@ -178,20 +178,18 @@
if (is.character(i)) {
# we need this to drop haven-specific class attributes
i <- as.character(i)
} else {
} else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
# if all values are labelled, we assume factor. Use labels as levels
if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
if (is.numeric(i)) {
i <- factor(i, labels = names(value_labels))
} else {
i <- factor(as.character(i), labels = names(value_labels))
}
value_labels <- NULL
attr(i, "converted_to_factor") <- TRUE
if (is.numeric(i)) {
i <- factor(i, labels = names(value_labels))
} else {
# else, fall back to numeric
i <- as.numeric(i)
i <- factor(as.character(i), labels = names(value_labels))
}
value_labels <- NULL
attr(i, "converted_to_factor") <- TRUE
} else {
# else, fall back to numeric
i <- as.numeric(i)
}

# drop unused value labels
Expand Down Expand Up @@ -290,12 +288,18 @@
}


.read_unknown <- function(path, convert_factors, verbose, ...) {
insight::check_if_installed("rio", reason = paste0("to read files of type '", .file_ext(path), "'"))
.read_unknown <- function(path, file_type, convert_factors, verbose, ...) {
insight::check_if_installed("rio", reason = paste0("to read files of type '", file_type, "'"))
if (verbose) {
insight::format_alert("Reading data...")
}
out <- rio::import(file = path, ...)
# set up arguments. for RDS, we set trust = TRUE, to avoid warnings
rio_args <- list(file = path)
# check if we have RDS, and if so, add trust = TRUE
if (file_type == "rds") {
rio_args$trust <- TRUE
}
out <- do.call(rio::import, c(rio_args, list(...)))

# for "unknown" data formats (like .RDS), which can still be imported via
# "rio::import()", we must check whether we actually have a data frame or
Expand All @@ -310,9 +314,8 @@
)
}
return(out)
} else {
out <- tmp
}
out <- tmp
etiennebacher marked this conversation as resolved.
Show resolved Hide resolved
}

.post_process_imported_data(out, convert_factors, verbose)
Expand Down
1 change: 1 addition & 0 deletions R/data_restoretype.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' Restore the type of columns according to a reference data frame
#'
#' @param data A data frame for which to restore the column types.
#' @inheritParams data_to_long
#' @inheritParams data_rename
#' @param reference A reference data frame from which to find the correct
Expand Down
105 changes: 82 additions & 23 deletions R/data_to_long.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,65 +4,124 @@
#' the number of columns. This is a dependency-free base-R equivalent of
#' `tidyr::pivot_longer()`.
#'
#' @param data A data frame to pivot.
#' @param names_to The name of the new column that will contain the column
#' names.
#' @param data A data frame to convert to long format, so that it has more
#' rows and fewer columns after the operation.
#' @param names_to The name of the new column (variable) that will contain the
#' _names_ from columns in `select` as values, to identify the source of the
#' values. `names_to` can be a character vector with more than one column name,
#' in which case `names_sep` or `names_pattern` must be provided in order to
#' identify which parts of the column names go into newly created columns.
#' See also 'Examples'.
#' @param names_prefix A regular expression used to remove matching text from
#' the start of each variable name.
#' @param names_sep,names_pattern If `names_to` contains multiple values, this
#' argument controls how the column name is broken up.
#' `names_pattern` takes a regular expression containing matching groups, i.e. "()".
#' @param values_to The name of the new column that will contain the values of
#' the pivoted variables.
#' argument controls how the column name is broken up. `names_pattern` takes a
#' regular expression containing matching groups, i.e. "()".
#' @param values_to The name of the new column that will contain the _values_ of
#' the columns in `select`.
#' @param values_drop_na If `TRUE`, will drop rows that contain only `NA` in the
#' `values_to` column. This effectively converts explicit missing values to
#' implicit missing values, and should generally be used only when missing values
#' in data were created by its structure.
#' `values_to` column. This effectively converts explicit missing values to
#' implicit missing values, and should generally be used only when missing values
#' in data were created by its structure.
#' @param rows_to The name of the column that will contain the row names or row
#' numbers from the original data. If `NULL`, will be removed.
#' numbers from the original data. If `NULL`, will be removed.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#' @param cols Identical to `select`. This argument is here to ensure compatibility
#' with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols`
#' is used.
#' with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols`
#' is used.
#'
#' @details
#' Reshaping data into long format usually means that the input data frame is
#' in _wide_ format, where multiple measurements taken on the same subject are
#' stored in multiple columns (variables). The long format stores the same
#' information in a single column, with each measurement per subject stored in
#' a separate row. The values of all variables that are not in `select` will
#' be repeated.
#'
#' The necessary information for `data_to_long()` is:
#'
#' - The columns that contain the repeated measurements (`select`).
#' - The name of the newly created column that will contain the names of the
#' columns in `select` (`names_to`), to identify the source of the values.
strengejacke marked this conversation as resolved.
Show resolved Hide resolved
#' `names_to` can also be a character vector with more than one column name,
#' in which case `names_sep` or `names_pattern` must be provided to specify
#' which parts of the column names go into the newly created columns.
#' - The name of the newly created column that contains the values of the
#' columns in `select` (`values_to`).
#'
#' In other words: repeated measurements that are spread across several columns
#' will be gathered into a single column (`values_to`), with the original column
#' names, that identify the source of the gathered values, stored in one or more
#' new columns (`names_to`).
#'
#' @return If a tibble was provided as input, `data_to_long()` also returns a
#' tibble. Otherwise, it returns a data frame.
#'
#' @examplesIf requireNamespace("psych") && requireNamespace("tidyr")
#' wide_data <- data.frame(replicate(5, rnorm(10)))
#' wide_data <- setNames(
#' data.frame(replicate(2, rnorm(8))),
#' c("Time1", "Time2")
#' )
#' wide_data$ID <- 1:8
#' wide_data
#'
#' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:5))
#' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:3))
#' # probably doesn't make much sense to mix "time" and "id"
#' data_to_long(wide_data)
#'
#' # Customizing the names
#' data_to_long(wide_data,
#' select = c(1, 2),
#' names_to = "Column",
#' values_to = "Numbers",
#' rows_to = "Row"
#' data_to_long(
#' wide_data,
#' select = c("Time1", "Time2"),
#' names_to = "Timepoint",
#' values_to = "Score"
#' )
#'
#' # Reshape multiple columns into long format.
#' mydat <- data.frame(
#' age = c(20, 30, 40),
#' sex = c("Female", "Male", "Male"),
#' score_t1 = c(30, 35, 32),
#' score_t2 = c(33, 34, 37),
#' score_t3 = c(36, 35, 38),
#' speed_t1 = c(2, 3, 1),
#' speed_t2 = c(3, 4, 5),
#' speed_t3 = c(1, 8, 6)
#' )
#' # The column names are split into two columns: "type" and "time". The
#' # pattern for splitting column names is provided in `names_pattern`. Values
#' # of all "score_*" and "speed_*" columns are gathered into a single column
#' # named "count".
#' data_to_long(
#' mydat,
#' select = 3:8,
#' names_to = c("type", "time"),
#' names_pattern = "(score|speed)_t(\\d+)",
#' values_to = "count"
#' )
#'
#' # Full example
#' # ------------------
#' data <- psych::bfi # Wide format with one row per participant's personality test
#'
#' # Pivot to long format
#' data_to_long(data,
#' very_long_data <- data_to_long(data,
#' select = regex("\\d"), # Select all columns that contain a digit
#' names_to = "Item",
#' values_to = "Score",
#' rows_to = "Participant"
#' )
#' head(very_long_data)
#'
#' data_to_long(
#' even_longer_data <- data_to_long(
#' tidyr::who,
#' select = new_sp_m014:newrel_f65,
#' names_to = c("diagnosis", "gender", "age"),
#' names_pattern = "new_?(.*)_(.)(.*)",
#' values_to = "count"
#' )
#'
#' head(even_longer_data)
#' @inherit data_rename
#' @export
data_to_long <- function(data,
Expand Down
Loading
Loading