-
Notifications
You must be signed in to change notification settings - Fork 0
/
Part2_Handling_demographics_data.R
99 lines (61 loc) · 3.76 KB
/
Part2_Handling_demographics_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#PREPARING THE TABLE FOR DEMOGRAPHICS
#install.packages('haven')
#install.packages('lubridate')
#install.packages("readxl")
# LIBRARIES
library("readxl")
library('haven')
library("dplyr")
library(tidyr)
library('lubridate')
#1. Read SAS dataset of demogrpahics----
setwd("C:/Users/flucerofernandez/Documents/IESEG S01/R/GP/...")
Demographics <- read_sas("./RawDataIDemographics.sas7bdat")
#2. Change RegDate and other important dates into a R Date format, represented as double in order to make Calcs.----
Demographics$RegDate <- as.Date(Demographics$RegDate, format = "%Y-%m-%d")
Demographics$FirstPay <- as.Date(Demographics$FirstPay, format = "%Y%m%d")
Demographics$FirstAct <- as.Date(Demographics$FirstAct, format = "%Y%m%d")
Demographics$FirstSp <- as.Date(Demographics$FirstSp, format = "%Y%m%d")
Demographics$FirstCa <- as.Date(Demographics$FirstCa, format = "%Y%m%d")
Demographics$FirstGa <- as.Date(Demographics$FirstGa, format = "%Y%m%d")
Demographics$FirstPo <- as.Date(Demographics$FirstPo, format = "%Y%m%d")
#in order to merge without issues, we will leave data types as they are, for now...----
#(changed just after merging everything together)
#3. Read appendices with readxl ----
appendix1 <- read_excel("./Appendices Group Assignment.xlsx", sheet = "Appendix 1")
appendix2 <- read_excel("./Appendices Group Assignment.xlsx", sheet = "Appendix 2")
appendix3 <- read_excel("./Appendices Group Assignment.xlsx", sheet = "Appendix 3")
appendix4 <- read_excel("./Appendices Group Assignment.xlsx", sheet = "Appendix 4")
#4. Merge dataframes of appendices----
First_merge <- merge(Demographics, appendix2, by.x = "Country",
by.y = "Country", all.x = TRUE, all.y = FALSE)
Second_merge <-merge(First_merge, appendix3, by.x = "Language",
by.y = "Language", all.x = TRUE, all.y = FALSE)
Demographics_With_Merges <- merge(Second_merge, appendix4, by.x = "ApplicationID",
by.y = "ApplicationID", all.x = TRUE, all.y = FALSE)
#5. Create 'StartingAfter' refering to the days between registration and first bet of any kind----
Demographics_With_Merges$StartingAfter <- difftime(Demographics_With_Merges$FirstAct,
Demographics_With_Merges$RegDate, units='days')
#6. Filter by conditions: Opened an account between Feb 1, 2005 and Feb 27, 2005 ----
Demographics_filter1 <- Demographics_With_Merges[(Demographics_With_Merges$RegDate> "2005-01-31" &
Demographics_With_Merges$RegDate < "2005-02-28"),]
#7. Filter by conditions: Not missing values in First Act ----
Demographics_filter2 <- Demographics_filter1 %>% filter(!is.na(FirstAct))
#8. replace NA in gender with mode of the category ----
# 8.1. Create the function for mode.
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
# 8.2. replace na with the mode function for gender
Demographics_filter2$Gender <- replace_na(Demographics_filter2$Gender, getmode(Demographics_filter2$Gender))
#8.3. check
print(sum(is.na(Demographics_filter2$FirstAct)))
#9. Re-order the columns (numeric to right, strings in the left)----
Demographics_arranged <- Demographics_filter2[,c(4,5,6,7,8,9,10,11,16,12,1,15,2,14,3,13)]
#10. Sort by Registration Date ----
Demographics_cleaned <- Demographics_arranged[order(Demographics_arranged$RegDate,Demographics_arranged$UserID ),]
#11. Final table (number of rows correspond to the total number of observations of the final basetable) ----
summary(Demographics_cleaned)
#12. Save the file in csv (easier to handle when merging everything together.) ----
write.csv(Demographics_cleaned,'Demographics_by_User.csv')