-
Notifications
You must be signed in to change notification settings - Fork 0
/
imputation.R
100 lines (82 loc) · 3.48 KB
/
imputation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#####################################
## Using missForest for Imputation ##
##### code created by Val Ryan ######
##### last updated July 29, 2020 ####
#####################################
#############################################
## Read and Recode Baseline Attribute Data ##
#############################################
#set working directory to the folder that holds the input csv file
#NOTE(review): the path below is a placeholder and must be edited before running
setwd("C:/Users/Your File Path")
#read in csv files
#header = TRUE: the first row of the file supplies the column names
#(TRUE is spelled out because the shorthand T is an ordinary variable that
#can be reassigned)
baseline <- read.csv("nodesfinal.csv", header = TRUE)
#recode variables
#columns are referenced explicitly through baseline$ instead of attach(), so a
#same-named object in the workspace cannot be picked up by mistake and nothing
#is left on the search path if the script stops part-way through
#each new column starts as NA; rows whose source value matches none of the
#rules below (including missing source values) stay NA
#Education
##0 = elementary, 1 = high school, 2 = college
##source b03_1.x: 2 or less -> 0, 3 or 4 -> 1, above 4 -> 2
baseline$edcat <- NA_real_
baseline$edcat[baseline$b03_1.x <= 2] <- 0
baseline$edcat[baseline$b03_1.x %in% c(3, 4)] <- 1
baseline$edcat[baseline$b03_1.x > 4] <- 2
#Employment
##0 = employed, 1 = unemployed, 2 = no work medical reason, 3 = other
##source b04.x: 7 or less -> 0, 8 -> 1, 9 -> 2, 10 -> 3
baseline$employcat <- NA_real_
baseline$employcat[baseline$b04.x <= 7] <- 0
baseline$employcat[baseline$b04.x %in% 8] <- 1
baseline$employcat[baseline$b04.x %in% 9] <- 2
baseline$employcat[baseline$b04.x %in% 10] <- 3
#Living Situation
##0 = rent, 1 = no rent, 2 = homeless, NA = missing
##source b07.x: 1, 3, 4 -> 0; 2, 5 -> 1; 6 -> 2; 7, 97 -> NA
baseline$livecat <- NA_real_
baseline$livecat[baseline$b07.x %in% c(1, 3, 4)] <- 0
baseline$livecat[baseline$b07.x %in% c(2, 5)] <- 1
baseline$livecat[baseline$b07.x %in% 6] <- 2
#codes 7 and 97 are set to NA (not to a numeric code) so they are treated as
#missing values
baseline$livecat[baseline$b07.x %in% c(7, 97)] <- NA
#Nationality
##0 = Greek, 1 = not Greek
##source b05_1.x: 1 -> 0; 2, 3, 4, 5, 7, 8, 9, 13, 14, 15 -> 1
##NOTE(review): codes 6, 10, 11 and 12 are not mapped and stay NA - confirm
##that this is intended
baseline$natcat <- NA_real_
baseline$natcat[baseline$b05_1.x %in% 1] <- 0
baseline$natcat[baseline$b05_1.x %in% c(2, 3, 4, 5, 7, 8, 9, 13, 14, 15)] <- 1
###################################################################
## Impute Values for 8 People with Missing Living Situation Data ##
###################################################################
#values that should be imputed must already be coded as NA at this point
#keep only the columns that take part in the imputation; a smaller data set
#keeps the run time down when the full data has many variables
#kept here: the ID variable, the recoded categories (employcat, natcat, edcat,
#livecat), age.x, which is continuous, and gender.x
impute_columns <- c("egoid", "employcat", "natcat",
                    "edcat", "age.x", "livecat", "gender.x")
data <- baseline[, impute_columns, drop = FALSE]
#impute data using missForest package
library(missForest)
#random forests are stochastic, so fix the RNG seed first; without it the
#imputed values differ from one run of the script to the next
set.seed(20200729)
#xmis is the data set containing the NA values to fill in
#maxiter caps the number of imputation iterations and ntree is the number of
#trees grown in each forest
#replace = TRUE draws the bootstrap samples with replacement
#cutoff is a list with one entry per column of data, in column order
#NOTE(review): the missForest documentation describes cutoff as class-vote
#cutoffs for categorical (factor) variables, not as the set of possible
#values; the recoded columns in this script are numeric, so confirm that this
#argument has the intended effect
#xtrue = NA: no complete reference data set is supplied
baseline.imp <- missForest(
  xmis = data,
  maxiter = 10,
  ntree = 100,
  replace = TRUE,
  cutoff = list(
    1,              #egoid
    c(0, 1, 2, 3),  #employcat
    c(0, 1),        #natcat
    c(0, 1, 2),     #edcat
    1,              #age.x
    c(0, 1, 2),     #livecat
    c(0, 1)         #gender.x
  ),
  xtrue = NA
)
#checking the imputed values
#ximp is the imputed data
table(baseline.imp$ximp$livecat)
#the imputed livecat values are numeric, so bin them back into category codes:
#  0 for values up to 0.5
#  1 for values above 0.5 and up to 1.5
#  2 for values above 1.5
#left.open = TRUE makes every bin include its upper limit, so a value of
#exactly 0.5 is coded 0 and a value of exactly 1.5 is coded 1; no value can
#fall between two bins and be left uncoded
data$livecategory <- as.numeric(
  findInterval(baseline.imp$ximp$livecat, c(0.5, 1.5), left.open = TRUE)
)
#check to make sure categorization worked
#useNA = "ifany" adds an NA row/column if any value was left uncoded
table(data$livecategory, baseline.imp$ximp$livecat, useNA = "ifany")
#remove old livecat variable from the dataset before analysis
#columns are listed explicitly so livecategory takes the position livecat had
data <- data[, c("egoid", "employcat", "natcat", "edcat",
                 "age.x", "livecategory", "gender.x"), drop = FALSE]