WA_Can_HCA_DtaMngmt_Backup.Rmd

---
title: "WA_Can_HCA_DtaMngmt"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

# Set the project root used to resolve relative paths ("data/...") in every
# later chunk. knitr restores the working directory after each chunk, so a
# plain setwd() here would not carry over to the chunks that read and write
# files; `root.dir` is the knitr option intended for this.
knitr::opts_knit$set(root.dir = "C:/wa-cannabis-hca")

# Clear workspace
# NOTE(review): this removes every object in the evaluation environment; if the
# document is rendered into an existing session that may include objects that
# do not belong to this analysis -- consider dropping it.
rm(list = ls())

# Load required libraries 
library(survey)
library(epiR)
library(tidyverse)
library(foreign)
library(haven)
library(table1)
library(kableExtra)
library(gtsummary)
library(gt)
library(broom)

```


```{r}
#### Load in the WA-BRFSS Data ####

# Purpose: read the multi-year Stata file, keep survey years 2015-2019, scale
# each year's weight by that year's share of the pooled rows, and write two CSV
# snapshots (all columns, then only the analysis columns for ages below group 6).
# The chunk finishes with `df` holding the reduced data set.

# WRITING THE TEMP FILE IS ONLY NECESSARY IF YOU DO NOT HAVE THE FILE IN CSV FORMAT

# Read in the data as dta
temp <- read_dta("data/MAS_2011_2020_v2.dta") %>%
  subset(year > 2014 & year < 2020) # Years of interest given variable avail. are 2015 - 2019

# Proportion of the pooled observations that falls in each survey year
tc <- table(temp$year) %>%
  prop.table()

# tc <- table(temp$sex) %>%
#   prop.table() # get the proportion of each year's observations 
# 
# # Creating new Race/Eth. variable
# temp$race[temp$"_mrace1" == 4 & temp$"_hispanc" != 1] <- 1 # Non-hispanic Asian
# temp$race[temp$"_mrace1" == 2 & temp$"_hispanc" != 1] <- 2 # Non-hispanic Black or African American
# temp$race[temp$"_hispanc" == 1] <- 3 # Hispanic/Latino/a
# temp$race[temp$"_mrace1" == 7 & temp$"_hispanc" != 1] <- 4 # Non-hispanic Multiracial
# temp$race[temp$"_mrace1" == 6 & temp$"_hispanc" != 1] <- 5 # Non-hispanic Native American
# temp$race[temp$"_mrace1" == 3 & temp$"_hispanc" != 1] <- 6 # Non-hispanic other race
# temp$race[temp$"_mrace1" == 5 & temp$"_hispanc" != 1] <- 7 # Non-hispanic Pacific Islander
# temp$race[temp$"_mrace1" == 1 & temp$"_hispanc" != 1] <- 8 # Non-hispanic White
# temp$race[is.na(temp$"_mrace1") & is.na(temp$"_hispanc")] <- NA
# 
# tr <- table(temp$race) %>%
#   prop.table()
# 
# ta <- table(temp$"_age_g") %>%
#   prop.table()

# Re-weight every row by its own year's share of the pooled sample.
# The share is looked up by year *name* rather than by position in `tc`, so a
# missing year can never shift another year's proportion onto the wrong rows.
year_share <- setNames(as.numeric(tc), names(tc))
row_share <- unname(year_share[as.character(as.numeric(temp$year))])
temp$"_llcpwt" <- temp$"_llcpwt" * row_share

# Re-write the file as a csv
# (row names are written too; they come back as a leading `X` column on read)
write.csv(temp, "data/WA_2015_2019_BRFSS.csv")

# Remove the dta data and the weighting helpers
rm(temp, year_share, row_share)

# Read in the newly created csv file for desired vars
# (read.csv prefixes the Stata names that start with "_" with "X")
full <- read.csv("data/WA_2015_2019_BRFSS.csv", stringsAsFactors = FALSE) %>%
  select(year,
         zipcode,
         zipcode1,
         X_ststr,
         X_llcpwt,
         sex,
         X_age_g,
         X_mrace1,
         X_hispanc,
         educa,
         employ1,
         income_wa,
         X_smoker3,
         alcday5,
         X_ment14d,
         hlthpln1,
         medcost,
         mjpast30,
         mj30day
         ) %>%
  subset(X_age_g < 6) # Removing individuals over 65 due to Medicare eligibility 

# *Optional* Write to new file for memory efficiency 
write.csv(full, "data/WA_2015_2019_BRFSS_Sub.csv")

# Drop only the objects this chunk created instead of wiping the whole workspace
rm(full, tc)

df <- read.csv("data/WA_2015_2019_BRFSS_Sub.csv")
df$X <- NULL # discard the row-name column added by write.csv
# summary(df)

```

```{r}
#### Assigning NA Values ####

# For each variable, the response codes listed here are blanked out to NA.
# NOTE(review): these are presumably the "don't know" / "refused" codes of each
# item -- confirm against the WA-BRFSS codebook.
na_codes <- list(
  sex       = c(7, 9),     # sex
  X_mrace1  = c(77, 99),   # race
  X_hispanc = 9,           # ethnicity
  educa     = 9,           # education
  employ1   = 9,           # employment
  income_wa = c(77, 99),   # income
  X_ment14d = 9,           # mental health
  hlthpln1  = c(7, 9),     # insurance
  medcost   = c(7, 9),     # medical cost
  X_smoker3 = 9,           # smoking
  alcday5   = c(777, 999), # alcohol
  mj30day   = c(9, 7),     # cannabis (any use)
  mjpast30  = c(77, 99)    # cannabis (days of use)
)

# Walk the table and replace every listed code with NA, one column at a time
for (var_name in names(na_codes)) {
  is_missing_code <- df[[var_name]] %in% na_codes[[var_name]]
  df[[var_name]][is_missing_code] <- NA
}

# Leave nothing but `df` behind
rm(na_codes, var_name, is_missing_code)

# summary(df)

#  barplot(table(df$menthlth))
#  barplot(table(df$mjpast30))
#  
#  testdf <- subset(df, mjpast30 != 88)
#  barplot(table(testdf$mjpast30))
#  
# testdf <- subset(df, menthlth != 88)
# barplot(table(testdf$menthlth))

```

```{r}
#### Dropping All Observations Where MJ or HCA is NA ####

# Keep only rows with a value for cannabis use, the mental-health measure,
# the medical-cost item and the health-plan item. The two zipcode columns are
# merged (first non-missing value wins) into a single `zipcode` column that is
# placed at the front of the data frame.
df <- df %>%
  filter(
    !is.na(mj30day),
    !is.na(X_ment14d),
    !is.na(medcost),
    !is.na(hlthpln1)
  ) %>%
  mutate(zipcode = coalesce(zipcode, zipcode1)) %>%
  select(-zipcode1) %>%
  select(zipcode, everything())


# summary(df)

# Derived analysis variables. Each one is numeric and is NA wherever none of
# its rules applies (including when the source value is NA).
df <- df %>%
  mutate(
    # 14+ vs. 0-13 comparison
    mentComp = case_when(
      X_ment14d == 3 ~ 1,
      X_ment14d %in% c(1, 2) ~ 2
    ),

    # Tobacco: 1 = current smoker (codes 1, 2); 2 = former/never (codes 3, 4)
    curSmk = case_when(
      X_smoker3 %in% c(1, 2) ~ 1,
      X_smoker3 %in% c(3, 4) ~ 2
    ),

    # Alcohol: 1 = codes 101-107 (past week), 2 = codes 201-230 (past month),
    # 3 = code 888 (none in past 30 days)
    curAlc = case_when(
      alcday5 > 100 & alcday5 < 108 ~ 1,
      alcday5 > 200 & alcday5 < 231 ~ 2,
      alcday5 == 888 ~ 3
    ),

    # Any current tobacco or alcohol use; NA as soon as either input is NA
    curAlcSmk = case_when(
      is.na(curAlc) | is.na(curSmk) ~ NA_real_,
      curSmk == 1 | curAlc == 1 | curAlc == 2 ~ 1,
      curSmk == 2 & curAlc == 3 ~ 2
    ),

    # Education: 1 = did not graduate high school, 2 = graduated high school,
    # 3 = some college or tech school, 4 = graduated college/tech school
    edu = case_when(
      educa < 4 ~ 1,
      educa == 4 ~ 2,
      educa == 5 ~ 3,
      educa == 6 ~ 4
    ),

    # Combined race/ethnicity: Hispanic origin overrides the race code.
    # NOTE(review): the national BRFSS codebook lists _MRACE1 code 3 as
    # American Indian/Alaska Native and code 6 as other race, the reverse of
    # the mapping used here -- confirm against the WA codebook.
    race = case_when(
      X_hispanc == 1 ~ 3,                 # Hispanic/Latino/a
      X_hispanc != 1 & X_mrace1 == 4 ~ 1, # Non-hispanic Asian
      X_hispanc != 1 & X_mrace1 == 2 ~ 2, # Non-hispanic Black or African American
      X_hispanc != 1 & X_mrace1 == 7 ~ 4, # Non-hispanic Multiracial
      X_hispanc != 1 & X_mrace1 == 6 ~ 5, # Non-hispanic Native American
      X_hispanc != 1 & X_mrace1 == 3 ~ 6, # Non-hispanic other race
      X_hispanc != 1 & X_mrace1 == 5 ~ 7, # Non-hispanic Pacific Islander
      X_hispanc != 1 & X_mrace1 == 1 ~ 8  # Non-hispanic White
    ),

    # Income collapsed to three bands, overwriting the raw codes in place:
    # 1 = codes up to 5, 2 = codes 6-7, 3 = codes 8 and above.
    # NOTE(review): earlier comments described the cut-off as $25,000 while the
    # factor labels used later in this file say $35,000 -- confirm which
    # dollar amount code 5 represents.
    income_wa = case_when(
      income_wa <= 5 ~ 1,
      income_wa > 5 & income_wa < 8 ~ 2,
      income_wa >= 8 ~ 3
    ),

    # Difficulty accessing care: 1 = cost barrier or no health plan,
    # 2 = neither
    dHCA = case_when(
      medcost == 1 | hlthpln1 == 2 ~ 1,
      medcost == 2 & hlthpln1 == 1 ~ 2
    )
  )
 
```

```{r}
#### Factoring Variables #### 

# Convert the numeric codes in `df` to labelled factors. Every call lists its
# levels explicitly, so a value outside the listed levels (including NA)
# becomes NA in the factor. The order of `levels` sets the display order.

# Gender Factor
df$sex <- factor(df$sex,
                 levels = c(1,2),
                 labels = c("Male", "Female"),
                 exclude = NULL)

# Education Factor (recoded `edu` variable)
df$edu <- factor(df$edu,
                   levels = c(1,2,3,4),
                   labels = c("Did not graduate high school",
                              "High school graduate",
                              "Some college or tech school",
                              "Graduated college or tech school"),
                   exclude = NULL)

# Race Factor
# The race recode earlier in this file uses 4 = multiracial and 6 = other race,
# so level 6 must carry the "other race" label and level 4 the
# "two or more races" label (they were previously swapped).
df$race <- factor(df$race,
                   levels = c(5,1,2,3,7,6,4,8),
                   labels = c("Non-Hispanic AIAN",
                              "Non-Hispanic Asian",
                              "Non-Hispanic Black or African American",
                              "Hispanic, Latino/a, or Spanish origin",
                              "Non-Hispanic NHPI",
                              "Non-Hispanic other race",
                              "Non-Hispanic two or more races",
                              "Non-Hispanic White"),
                   exclude = NULL)

# Age Factors
df$X_age_g <- factor(df$X_age_g, 
                       levels = c(1,2,3,4,5),
                       labels = c("18-24", "25-34",
                                  "35-44", "45-54",
                                  "55-64"),
                       exclude = NULL)

# Employment Factors
df$employ1 <- factor(df$employ1,
                     levels = c(1,2,3,4,5,6,7,8),
                     labels = c("Employed for wages", "Self-employed",
                                "Out of work for 1 year or more", "Out of work < 1 year",
                                "A homemaker", "A student", "Retired", "Unable to work"),
                     exclude = NULL)

# Income Factors (three-band recode of `income_wa`)
df$income_wa <- factor(df$income_wa,
                     levels = c(1,2,3),
                     labels = c("< $35,000", "$35,000-$75,000", "> $75,000"),
                     exclude = NULL)

# Current Smoker Factors
df$curSmk <- factor(df$curSmk,
                      levels = c(1,2),
                      labels = c("Current smoker", "Non-smoker"),
                      exclude = NULL)

# Current Alc. Factors
df$curAlc <- factor(df$curAlc,
                      levels = c(1,2,3),
                      labels = c("Within past week", "Within past month", "None in past month"),
                      exclude = NULL)

# Mental Health Factors
df$mentComp <- factor(df$mentComp,
                      levels = c(1,2),
                      labels = c("14+ days", "0-13 days"),
                      exclude = NULL)

# Health Insurance Factor
# Code 2 is labelled "Yes" and code 1 "No": the factor is later labelled
# "No health coverage", so "Yes" presumably means the respondent lacks a plan.
df$hlthpln1 <- factor(df$hlthpln1,
                      levels = c(2,1),
                      labels = c("Yes", "No"),
                      exclude = NULL)

# Med Cost Factor
df$medcost <- factor(df$medcost,
                      levels = c(1,2),
                      labels = c("Yes", "No"),
                      exclude = NULL) 

# Difficulty Accessing Care Factor
df$dHCA <- factor(df$dHCA,
                      levels = c(1,2),
                      labels = c("Yes", "No"),
                      exclude = NULL) 

# Cannabis Use Factor
df$mj30day <- factor(df$mj30day,
                      levels = c(1,2),
                      labels = c("Cannabis use in past month", "No cannabis use in past month"),
                      exclude = NULL)

```

```{r}
#### Creating Labels for each variable ####

# Display label for each analysis variable, keyed by column name
var_labels <- c(
  X_age_g   = "Age group (yrs)",
  sex       = "Gender",
  race      = "Race/Ethnicity",
  edu       = "Education",
  income_wa = "Household Income",
  hlthpln1  = "No health coverage",
  medcost   = "Delay medical care due to cost in past year",
  mentComp  = "Poor mental health in past month",
  curAlc    = "Alcohol consumption",
  curSmk    = "Tobacco smoking status",
  mj30day   = "Cannabis use in past month",
  dHCA      = "Difficulty accessing healthcare"
)

# Attach each label to its column
for (col_name in names(var_labels)) {
  label(df[[col_name]]) <- var_labels[[col_name]]
}

# Leave nothing but `df` behind
rm(var_labels, col_name)

# Persist the cleaned data set.
# NOTE(review): save() writes an object that is restored with load(), even
# though the file name ends in ".Rds" -- confirm how downstream code reads it.
save(df, file = "data/WA_2015_2019_BRFSS_Clean.Rds")

```