-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01a_viol-match-all-cases.R
executable file
·230 lines (188 loc) · 9.17 KB
/
01a_viol-match-all-cases.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# Header ------------------------------------------------------------------
#
# PROJECT: Implied Warranty of Habitability
# AUTHOR: Mili Chapado
# DATE: 2018-06-25
# PURPOSE: Perform matching of all cases to violations data.
# DETAILS: Start with raw data on all (geocoded) cases and match all cases to
# violations and export this dataset for refernce. Then identify the set of
# cases with at least 1 open B or C violation open at filing. In
# "02_case-random-samles.R" we will draw a sample of 507 cases from this set to
# pull casse files and do analysis. We also will take the full set of cases and
# aggregate first by case then overall to get stats about presence of violations
# in the full universe of cases.
# Setup -------------------------------------------------------------------
library(tidyverse) # dplyr, readr, stringr, etc.
library(lubridate) # working with dates (eg. as_date)
iwh_dir <- ""
# Load Cases and Violations -----------------------------------------------
allcases <- str_glue("{iwh_dir}/Data/Raw/Cases/all_2016_cases.csv") %>%
read_csv(col_types = cols(bbl = "c"))
viol <- str_glue("{iwh_dir}/Data/Clean/all_violations.csv") %>%
read_csv(col_types = cols(bbl = "c"))
# Matching all cases to all 2016 HPD violations --------------------------
# There are a few cases that didn't geocode properly but were included in an
# initial sample, so for consistency these are added new via manual look ups and
# hardcoding.
fixed_cases <- tribble(
~formattedindexnumber, ~fixed_bbl, ~fixed_std_addr,
"LT-051279-16/QU", "4008930030", "2017 19TH ST APT 3A",
"LT-051913-16/KI", "3017107501", "1825 ATLANTIC AVE APT 3T",
"LT-052542-16/RI", "5023700234", "7 BOGOTA ST APT BSMT",
"LT-064210-16/BX", "2026910122", "1022 REV J POLITE AVE APT 5P",
"LT-801768-16/BX", "2049050001", "1189 229TH DR NORTH APT 2C"
)
geocoded_cases <- allcases %>%
# hardcode geocoding fixes for 5 cases
left_join(fixed_cases, by = "formattedindexnumber") %>%
mutate(
bbl = coalesce(fixed_bbl, bbl),
std_addr = coalesce(fixed_std_addr, std_addr)
) %>%
select(-starts_with("fixed_")) %>%
# separate addresses from apartments into two columns
extract("std_addr", c("std_addr", "std_apt"), "(.*)(APT.+)") %>%
# remove "APT", keep only unit number
mutate(std_apt = str_replace(std_apt, "APT ", "")) %>%
# drop cases without standard address or bbl - 84k cases remain
drop_na(std_addr, bbl)
# case info columns
case_cols <- c("formattedindexnumber", "court", "propertytype", "classification", "causeofaction")
geocoded_case_info <- geocoded_cases %>% select(one_of(case_cols))
# There are 84,600 cases with full address, unit, BBL info
cases_viols <- geocoded_cases %>%
# match 2016 violations to all cases by bbl [1000 cases did not have bbls]
inner_join(viol, by = "bbl") %>%
# match open violations on bbls to specific units ("cases")
mutate(unitmatch = if_else(std_apt == apartment, 1L, 0L)) %>%
filter(unitmatch == 1L) %>%
# drop extra columns
select(-unitmatch, -propertyaddress, -apartment) %>%
# fix date formatting
mutate(
ans_date = ymd(ans_date),
file_date = ymd(file_date)
) %>%
# categorize violations by case filed date and violation open window
mutate(
# Category 1 (violations open at time of case filing):
# NOV (anytime) before filing date AND cert (anytime) after filing date
cat1 = ((nov_issued_dt) < (file_date)) & ((orig_cert_by_dt) > (file_date)),
# Category 2 (violations open within one year prior to case filing):
# NOV (anytime) before filing date AND cert (within one year) before filing date
cat2 = ((nov_issued_dt) < (file_date)) & ((orig_cert_by_dt) < (file_date)) & ((orig_cert_by_dt) > (file_date - 365)),
# Category 3 (violations open within one year after case filing):
# NOV (within one year) after filing date AND cert (anytime) after filing date
cat3 = ((nov_issued_dt) > (file_date)) & ((orig_cert_by_dt) > (file_date)) & ((nov_issued_dt) < (file_date + 365)),
days_open = difftime(orig_cert_by_dt, nov_issued_dt, units = "days")
) %>%
# join back with all cases to we have cases that don't have open violations too
right_join(geocoded_case_info, by = case_cols)
# data set of matched cases to violations
write_csv(cases_viols, str_glue("{iwh_dir}/Data/Clean/All Cases/all_cases_violations_detailed.csv"), na = "")
# Violation info ----------------------------------------------------------
cases_viols %>%
group_by(formattedindexnumber) %>%
summarise(any_viols = sum(!is.na(violation_id)) > 0) %>%
count(any_viols)
# 84,600 total cases, 10,077 with violations, 74,471 without
# Cases with Violations open at Filing ------------------------------------
cases_bc_viol_open_at_filing <- cases_viols %>%
# identify violation types
mutate(
is_a_viol = violation_class %in% c("A"),
is_b_viol = violation_class %in% c("B"),
is_c_viol = violation_class %in% c("C")
) %>%
# only cases/rows that had an open violation at time of file date (cat1=TRUE)
filter(cat1 == "TRUE") %>%
# remove other time category columns and rename cat1
select(-cat2, -cat3) %>%
rename(open_on_fd = cat1) %>%
# add column, n of days open from NOV to FD
mutate(days_open_upto_fd = difftime(file_date, nov_issued_dt, units = "days")) %>%
# collapse by summarizing, stats into one row per case ID this data set
# contains just case ID and violation type info
group_by(formattedindexnumber) %>%
summarise(
n_a_viols = sum(is_a_viol, na.rm = TRUE),
n_c_viols = sum(is_b_viol, na.rm = TRUE),
n_b_viols = sum(is_c_viol, na.rm = TRUE),
avg_days_open = mean(days_open, na.rm = TRUE),
avg_days_open_upto_fd = mean(days_open_upto_fd, na.rm = TRUE)
) %>%
# only keep cases with at least 1 B or C violation open at filing
filter(n_b_viols > 0 | n_c_viols > 0)
# Data set of unique cases (dropped bbls and non-std addresses) matched to open
# B or C violations at time of filing date. These will be using in
# 02_case-rando-samples.R to draw sample of cases with open violations.
write_csv(cases_bc_viol_open_at_filing, str_glue("{iwh_dir}/Data/Clean/All Cases/all_cases_bc_viol_open_at_filing.csv"), na = "")
# Violations Descriptives ------------------------------------------------
# aggregate, total num of cases, total num of violations at fd, share of violations
# filter cases_viols, only cases with violations for analysis below
# run this before viol_stats to get only_viols info that will be used next
only_viols <- cases_viols %>%
drop_na(violation_class) %>%
mutate(
is_a_viol = violation_class %in% c("A"),
is_b_viol = violation_class %in% c("B"),
is_c_viol = violation_class %in% c("C"),
days_open_upto_fd = difftime(file_date, nov_issued_dt, units = "days")
) %>%
group_by(formattedindexnumber) %>%
summarise(
n_a_viols = sum(is_a_viol, na.rm = TRUE),
n_c_viols = sum(is_b_viol, na.rm = TRUE),
n_b_viols = sum(is_c_viol, na.rm = TRUE),
total_viols = sum(is_a_viol, is_b_viol, is_c_viol, na.rm = TRUE),
avg_days_open = mean(days_open, na.rm = TRUE),
avg_days_open_upto_fd = mean(days_open_upto_fd, na.rm = TRUE)
)
# distribution of violations among cases with at least 1 a/b/c violation
ggplot(only_viols, aes(x = total_viols)) +
geom_histogram(binwidth = 5) +
theme_classic() +
labs(
title = "Distribution of Violations",
x = "violations",
y = "cases"
)
ggsave("all_cases_viol_distribution.png", width = 2 , height = 2, units = "in")
# Start with all cases (with/without viols), aggregate by case, then futher
# aggregate over all cases.
all_cases_viol_stats <- cases_viols %>%
# identify violation types
mutate(
is_a_viol = violation_class %in% c("A"),
is_b_viol = violation_class %in% c("B"),
is_c_viol = violation_class %in% c("C"),
days_open_upto_fd = difftime(file_date, nov_issued_dt, units = "days")
) %>%
group_by(formattedindexnumber) %>%
summarise(
n_a_viols = sum(is_a_viol, na.rm = TRUE),
n_c_viols = sum(is_b_viol, na.rm = TRUE),
n_b_viols = sum(is_c_viol, na.rm = TRUE),
avg_days_open = mean(days_open, na.rm = TRUE),
avg_days_open_upto_fd = mean(days_open_upto_fd, na.rm = TRUE)
) %>%
ungroup() %>%
summarise(
n_cases = n(),
n_cases_with_viol = sum(n_a_viols | n_b_viols | n_c_viols),
n_with_a = sum(n_a_viols > 0),
pct_with_a = mean(n_a_viols > 0),
n_with_b = sum(n_b_viols > 0),
pct_with_b = mean(n_b_viols > 0),
n_with_c = sum(n_c_viols > 0),
pct_with_c = mean(n_c_viols > 0),
avg_days_open = mean(avg_days_open, na.rm = TRUE),
avg_days_open_upto_fd = mean(avg_days_open_upto_fd, na.rm = TRUE),
# among cases with at least 1 violation what's the average number of violations
# use only_viols from above
avg_viol_per_caseviol = mean(only_viols$total_viols),
avg_a_viol = mean(only_viols$n_a_viols),
avg_b_viol = mean(only_viols$n_b_viols),
avg_c_viol = mean(only_viols$n_c_viols)
)
write_csv(all_cases_viol_stats, str_glue("{iwh_dir}/Data/Clean/All Cases/all_cases_viol_stats.csv"), na = "")