Brad's review of data_06_self_report_import.qmd

Part of #33 - Removed the tidyselect code being used to select columns inside of across() for recoding factors. Now, we use explicit column names instead so that the code is easier to reason about. - Started using the functions in recoding_factoring_relocating.R and nums_to_na.R to clean and transform categorical variables. - Changed coding for all "Yes/No" columns from "1/2" to "1/0". - Spot check the factor code. - Finish the recode_factor_relocate function (totally optional). - Use the `here` package to facilitate file import and export. - Made headings more consistent. - Checked for overlap with qaqc/data_01_self_report_recode_factors.Rmd. After a review, I concluded that we are safe to delete the QAQC file.
brad-cannell · Apr 11, 2024 · 5fd9f9b · 5fd9f9b
1 parent 5a6e323
commit 5fd9f9b
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 822 deletions.
diff --git a/data_management/data_06_self-report_import.qmd b/data_management/data_06_self-report_import.qmd
@@ -38,6 +38,10 @@ self_rep <- readr::read_csv(self_rep_path)
 rm(self_rep_path)
 ```
 
+## Data check
+
+The initial number of rows and columns.
+
 ```{r}
 # Data check
 dims <- c(956, 209)
@@ -1437,71 +1441,86 @@ self_rep <- self_rep |>
       all_of(cols),
       function(x) {
         encodings <- Encoding(x)
-        utf_8_idex <- which(encodings == "UTF-8")
+        utf_8_index <- which(encodings == "UTF-8")
         Encoding(self_rep$fear_afraid_response[utf_8_index]) <- "unknown"
       }
     )
   )
 ```
 
-The columns above do not represent categories. Therefore, there is no need to coerce them to factors.
+- The columns above do not represent categories. Therefore, there is no need to coerce them to factors.
+- The section above does not change the number of rows or columns, so there is no need for another data check here.
 
 ## Clean environment variables
 
-# 🔴 Ebie's code
-
-#### Recode categorical variables to factor variables
-
 ```{r}
-self_rep <- self_rep %>%
+self_rep <- self_rep |>
   mutate(
-    report_private_5cat = case_when(
+    # Convert character values to numeric values
+    report_private = case_when(
       report_private == "Yes, for the entire time" ~ 1,
       report_private == "Yes, for part of the time" ~ 2,
       report_private == "No, but they were still close enough to hear" ~ 3,
       report_private == "No" ~ 4,
       report_private == "Don't know" ~ 7
     ),
-    report_private_5cat_f = factor(
-      report_private,
-      levels = c(
-        "Yes, for the entire time", "Yes, for part of the time",
-        "No, but they were still close enough to hear", "No",
-        "Don't know"
-      )
-    ),
-    report_influence_4cat = case_when(
+    report_influence = case_when(
       report_influence == "Yes" ~ 1,
-      report_influence == "No" ~ 2,
+      report_influence == "No" ~ 0,
       report_influence == "Don't know" ~ 7,
       report_influence == "Refused" ~ 9
     ),
-    report_influence_4cat_f = factor(
-      report_influence,
-      levels = levels_yes_no
+    
+    # Create a version of each column with "Don't know" (7) and "Refused" (9)
+    # converted to NA
+    report_private_4cat = case_when(
+      report_private == 7 ~ NA_real_,
+      TRUE ~ report_private
+    ),
+    report_influence_2cat = case_when(
+      # report_influence is coded 9 for "Refused" above, so 9 must become NA
+      # as well; otherwise this column keeps a third value that its factor
+      # version (levels 0:1) silently drops.
+      report_influence %in% c(7, 9) ~ NA_real_,
+      TRUE ~ report_influence
+    )
-  )
+  ) |> 
+  
+  # Relocate new columns with NA values immediately after the original column
+  relocate(report_private_4cat, .after = report_private) |> 
+  relocate(report_influence_2cat, .after = report_influence) |> 
+  
+  # Create a factor version of each column (w/o "Don't know" and "Refused")
+  mutate(
+    report_private_4cat_f = factor(
+      report_private_4cat,
+      1:4,
+      c(
+        "Yes, for the entire time", "Yes, for part of the time",
+        "No, but they were still close enough to hear", "No"
+      )
+    ),
+    report_influence_2cat_f = factor(
+      report_influence_2cat,
+      0:1,
+      c("No", "Yes")
+    )
+  ) |> 
+  
+  # Relocate factor columns immediately after the new columns with NA values
+  relocate(report_private_4cat_f, .after = report_private_4cat) |> 
+  relocate(report_influence_2cat_f, .after = report_influence_2cat)
 ```
 
-## Remove raw character variables that have recoded forms
+### Data check
 
-```{r}
-self_rep <- self_rep %>% select(
-  -c(
-    (starts_with("emotional") & (!matches("[0-9]cat") & !matches("_age") & !matches("_times_times"))),
-    (starts_with("fear") & (!matches("[0-9]cat") & !matches("_response$"))),
-    (starts_with("finance") & (!matches("[0-9]cat") & !matches("_age") & !matches("_times_times"))),
-    (starts_with("neglect") & (!matches("[0-9]cat") & !matches("_age") & !matches("_times_times"))),
-    (starts_with("physical") & (!matches("[0-9]cat") & !matches("_age") & !matches("_times_times"))),
-    (starts_with("report") & !matches("[0-9]cat")),
-    (starts_with("sexual") & (!matches("[0-9]cat") & !matches("_age") & !matches("_times_times")))
-  )
-)
-```
+Number of columns after cleaning the environment variables section.
 
 ```{r}
 # Data check
-dim(self_rep) # 955 351
+# Stop with an informative error if self_rep does not have the expected
+# number of columns at this point in the file.
+ncols <- 574
+if (ncol(self_rep) != ncols) {
+  stop(
+    ncols, " columns expected in self_rep after cleaning the environment variables. ",
+    ncol(self_rep), " columns found. Check into it."
+  )
+}
 ```