-
Notifications
You must be signed in to change notification settings - Fork 0
/
PAClean.R
187 lines (140 loc) · 6.98 KB
/
PAClean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#### Center on Reinventing Public Education ####
# Description: Cleaning data obtained from the website of Philadelphia's Department of Education
# on Performance and Demographics/Enrollment
# Title: Cleaning Philadelphia
# Created by: Kevin Cha on 08-03-17
# Updated by: Kevin Cha on 08-08-17
# Data from:
# Performance: http://www.education.pa.gov/Data-and-Statistics/Pages/Keystone-Exams-Results.aspx#tab-1
# http://www.education.pa.gov/Data-and-Statistics/PSSA/Pages/default.aspx#tab-1
# Demographics: http://www.education.pa.gov/Data-and-Statistics/Pages/Enrollment%20Reports%20and%20Projections.aspx#tab-1
# Codebook:
# Demographics: inside the raw data files
# Performance:
# Link to Github: https://github.com/CRPE-UWB/State
# Notes: For Demographics, will not use anything before 2011-12 because those datasets are missing both gender and race
# For Performance, unable to obtain 2013-14+2014-15 because everytime it redirects me to this site: http://eseafedreport.com/
# Setup --------------------------------------------------------------------------------------------------------
rm(list=ls())
setwd("/Users/crpe/Documents/al_pa_clean") #MAC
library(plyr)
library(dplyr)
library(tidyr)
library(data.table)
library(stringr)
library(readr)
library(readxl)
# List --------------------------------------------------------------------------------------------------------
# Functions --------------------------------------------------------------------------------------------------------
no_more_special_characters <- function(df_col) {
# need to replace the chracters
require(gsubfn)
# make sure the col is character
df_col <- as.character(df_col)
# list of special characters to replace with
unwanted_array = list( 'Š'='S', 'š'='s', 'Ž'='Z', 'ž'='z', 'À'='A', 'Á'='A', 'Â'='A', 'Ã'='A', 'Ä'='A', 'Å'='A', 'Æ'='A', 'Ç'='C', 'È'='E', 'É'='E',
'Ê'='E', 'Ë'='E', 'Ì'='I', 'Í'='I', 'Î'='I', 'Ï'='I', 'Ñ'='N', 'Ò'='O', 'Ó'='O', 'Ô'='O', 'Õ'='O', 'Ö'='O', 'Ø'='O', 'Ù'='U',
'Ú'='U', 'Û'='U', 'Ü'='U', 'Ý'='Y', 'Þ'='B', 'ß'='Ss', 'à'='a', 'á'='a', 'â'='a', 'ã'='a', 'ä'='a', 'å'='a', 'æ'='a', 'ç'='c',
'è'='e', 'é'='e', 'ê'='e', 'ë'='e', 'ì'='i', 'í'='i', 'î'='i', 'ï'='i', 'ð'='o', 'ñ'='n', 'ò'='o', 'ó'='o', 'ô'='o', 'õ'='o',
'ö'='o', 'ø'='o', 'ù'='u', 'ú'='u', 'û'='u', 'ý'='y', 'ý'='y', 'þ'='b', 'ÿ'='y' )
# replaces the characters
df_col <- gsubfn(paste(names(unwanted_array),collapse='|'), unwanted_array,df_col)
return(df_col)
}
last_clean_demo <- function(df,year) {
# add YEAR column
df$YEAR <- year
# last cleaning
df <- as.data.frame(sapply(df, gsub, pattern=",", replacement=""))
df <- as.data.frame(sapply(df, gsub, pattern="~", replacement="-"))
df$LEA_NAME <- no_more_special_characters(df$LEA_NAME)
df$SCHOOL_NAME <- no_more_special_characters(df$SCHOOL_NAME)
df$COUNTY <- no_more_special_characters(df$COUNTY)
df[is.na(df)] <- -99
return(df)
}
# Read in Each Dataset --------------------------------------------------------------------------------------------------------
# Demographic --------------------------------------------------------------------------------------------------------
# 2016-17 -----
# datset: gender
demo_g_17 <- read_xlsx("data/Enrollment Public Schools 2016-17.xlsx", sheet = 4, skip = 4)
# select columns we only need
demo_g_17 <- demo_g_17 %>% select(c("AUN", "LEA Name", "LEA Type", "County", "School Number", "School Name", "Student Gender Code", "Total"))
# turn the * into NA
demo_g_17[demo_g_17 == "*"] <- NA
# create a total column
demo_g_17$TOTAL_ENROLL <- demo_g_17$Total
# spread the Student Gender Code
demo_g_17 <- demo_g_17 %>% spread("Student Gender Code", Total)
# get rid of the <NA> column
demo_g_17$`<NA>` <- NULL
# rename the columns
colnames(demo_g_17) <- c("AUN", "LEA_NAME", "LEA_TYPE", "COUNTY", "SCHOOL_NUM", "SCHOOL_NAME", "TOTAL", "FEMALE", "MALE")
# combine the rows together
demo_g_17 <- demo_g_17 %>%
group_by_("AUN", "LEA_NAME", "LEA_TYPE", "COUNTY", "SCHOOL_NUM", "SCHOOL_NAME") %>%
summarise(sum(TOTAL, na.rm = TRUE), sum(FEMALE, na.rm = TRUE), sum(MALE, na.rm = TRUE))
# rename the columns(again)
colnames(demo_g_17) <- c("AUN", "LEA_NAME", "LEA_TYPE", "COUNTY", "SCHOOL_NUM", "SCHOOL_NAME", "TOTAL", "FEMALE", "MALE")
# dataset: race
demo_r_17 <- read_xlsx("data/Enrollment Public Schools 2016-17.xlsx", sheet = 5, skip = 4)
# dataset: school
demo_s_17 <- read_xlsx("data/Enrollment Public Schools 2016-17.xlsx", sheet = 3, skip = 4)
# select columns we only need
demo_r_17 <- demo_r_17 %>% select(c("AUN", "LEA Name", "LEA Type", "County", "Race", "Total"))
demo_s_17 <- demo_s_17 %>% select(c("AUN", "LEA Name", "LEA Type", "County", "School Number", "School Name", "Total"))
# turn the * into NA
demo_r_17[demo_r_17 == "*"] <- NA
# spread the Student Gender Code
demo_r_17 <- demo_r_17 %>% spread("Race", Total)
# get rid of the <NA> column
demo_r_17$`<NA>` <- NULL
# combine the two
demo_r_17 <- left_join(demo_s_17, demo_r_17)
# rename the columns
colnames(demo_r_17) <- c("AUN", "LEA_NAME", "LEA_TYPE", "COUNTY", "AMINDIAN", "ASIAN", "BLACK", "HISPANIC", "TWOORMORE", "PACISLAND", "WHITE")
# make the ethnicity columns numeric
demo_r_17$AMINDIAN <- as.numeric(demo_r_17$AMINDIAN)
demo_r_17$ASIAN <- as.numeric(demo_r_17$ASIAN)
demo_r_17$HISPANIC <- as.numeric(demo_r_17$HISPANIC)
demo_r_17$TWOORMORE <- as.numeric(demo_r_17$TWOORMORE)
demo_r_17$PACISLAND <- as.numeric(demo_r_17$PACISLAND)
demo_r_17$WHITE <- as.numeric(demo_r_17$WHITE)
# combine them
demo_17 <- full_join(demo_g_17, demo_r_17)
# last cleaning
demo_17 <- last_clean(demo_17)
# 2015-16 -----
# read in .xlsx
demo_16 <- read_xlsx("data/Enrollment Public Schools 2015-16.xlsx")
# 2014-15 -----
# read in .xlsx
demo_15 <- read_xlsx("data/Enrollment Public Schools 2014-15.xlsx")
# 2013-14 -----
# read in .xlsx
demo_14 <- read_xlsx("data/Enrollment Public Schools 2013-14.xlsx")
# 2012-13 -----
# read in .xlsx
demo_13 <- read_xlsx("data/Enrollment Public Schools 2012-13.xlsx")
# 2011-12 -----
# read in .xlsx
demo_12 <- read_xlsx("data/Enrollment Public Schools 2011-12.xlsx")
# Finish -----
# Save as .csv file
write.csv(data_demo, "cleaned_data/pa_enroll_2010_17", row.names = FALSE)
# Performance --------------------------------------------------------------------------------------------------------
# 2015-16 -----
# 2014-15 -----
# 2013-14 -----
# 2012-13 -----
# 2011-12 -----
# 2010-11 -----
# Finish -----
# Save as .csv file
write.csv(data_perf, "cleaned_data/pa_perf_20_1", row.names = FALSE)
# Final Finish --------------------------------------------------------------------------------------------------------
# Save as .RData file
save.image(file="cleaned_data/pa_perf_enroll_clean.Rdata")
# Save as .RDS
saveRDS(data_demo, file="cleaned_data/pa_enroll_clean.rds")
saveRDS(data_perf, file="cleaned_data/pa_perf_clean.rds")