-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering.R
137 lines (124 loc) · 4.59 KB
/
clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
library(tidyverse)
library(lubridate)
# Read data set
d <- read_rds("ev-data-sample.rds")
# Function to clean plaintiff names
clean_pl_names <- function(x) {
str_to_upper({{x}}) |>
str_remove_all( " INC| CO(?=($| ))|LLC|LP|LIMITED PARTNERSHIP|QLA|[[:punct:]]|THE | PHASE.{1,30}$|COMPANY|C\\/O.{1,30}| (AT|OF)(?= )| AND|II|DBA.*") |>
str_replace_all("APT([[:alpha:]]|)|APARTMENT([[:alpha:]]|)$|APARTMENT HOMES", "APARTMENTS") |>
str_replace("MHC|MHP|MOBILE HOME.*", "MOBILE HOMES") |>
str_remove_all(" [[:alpha:]]$") |>
str_squish() |>
str_replace(".*HOUSING AUTH.*", "COUNTY HOUSING AUTHORITY") |>
str_replace("MENTAL HEALTH AS.*", "MENTAL HEALTH ASSOCIATION") |>
str_replace("CHAT.*68.*", "CHATEAU 68 APARTMENTS") |>
str_replace(".*AVONDALE.*", "JA AVONDALE") |>
str_remove("DBA") |>
str_remove("(?<=APARTMENTS).*") |>
str_remove("COMPANIES.*") |>
str_squish()
}
# Create new variable ('plaint_clean') for cleaned plaintiff name
plaintiff <- d |>
mutate(plaint_clean = clean_pl_names(plaintiff))
# Function to plot most frequently appearing values of column ('col')
plaint_plot <- function(col, title) {
ggplot(plaintiff |>
count({{col}}) |>
top_n(10),
aes(x = reorder({{col}}, n), y = n)) +
geom_col(fill = "black") +
geom_text(aes(y = n + 150, label = n)) +
theme_bw() +
coord_flip() +
labs(title = title,
subtitle = "Tulsa County eviction cases, 2016-2022",
x = NULL,
y = NULL) +
ylim(0, 2200)
}
plaint_plot(plaintiff, "Top 10 plaintiff names (raw)")
plaint_plot(plaint_clean, "Top 10 plaintiff names (cleaned)")
# Count distinct values of 'plaintiff' and 'plaint_clean'
pcount <- plaintiff |>
summarize(pl = n_distinct(plaintiff),
clean = n_distinct(plaint_clean))
# Identify serial eviction filings
# Calculate days since last filing with same plaintiff and defendant, filter to evictions filed within 70 days of previous
serial <- plaintiff |>
group_by(plaint_clean, def_id) |>
arrange(plaint_clean, def_id, date_filed) |>
select(district, case_number, date_filed, plaint_clean, def_id) |>
arrange(plaint_clean, def_id) |>
mutate(days_since_last = as.numeric(date_filed) - as.numeric(lag(date_filed))) |>
filter(days_since_last < 70) |>
count(plaint_clean, def_id)
# Group by plaintiff and count number of tenants with 3 or more serial filings
serial_plaintiff <- serial |>
filter(n >= 3) |>
ungroup() |>
count(plaint_clean) |>
top_n(15)
ggplot(serial_plaintiff, aes(x = reorder(plaint_clean, n), y = n)) +
geom_col(fill = "black") +
geom_text(aes(y = n + 6, label = n),
family = "Menlo") +
theme_bw() +
coord_flip() +
labs(title = "Number of tenants with 3+ serial filings",
subtitle = "Tulsa County eviction cases, 2016-2022",
x = NULL,
y = NULL) +
ylim(0, 140)
# Extract debt amounts (available only for cases 2019 on)
# This takes several minutes to run
debt <- d |>
rename(min_desc = description) |>
# Extract debt amount
mutate(min = str_remove_all(min_desc, "AMOUNT IN DEBT OF |\\$|,") |>
str_remove_all("( PER |Document)(.|\n)*") |>
str_squish()) |>
# Clean strings to just digits and decimals
mutate(fees = str_extract_all(min, "(\\d|\\.)+") |>
as.character() |>
str_remove_all('c\\("|"\\)')) |>
# Separate listed fees into columns
separate(fees,
into = paste0("fee", 1:4),
sep = '", "') |>
rowwise(district, case_number, min_desc) |>
# Convert all fee columns to numeric and sum
mutate(across(contains("fee"), as.numeric),
debt_amt = sum(fee1, fee2, fee3, fee4, na.rm = TRUE),
late_fee = str_detect(min_desc, "LATE")) |>
group_by(district, case_number) |>
summarize(debt_amt = sum(debt_amt),
late_fee = any(late_fee == TRUE))
ggplot(debt |>
filter(debt_amt < 5000)) +
geom_histogram(aes(x = debt_amt))
debt |>
left_join(d |>
select(district, case_number, date_filed)) |>
filter(!is.na(minute),
date_filed < ymd("2022-09-01")) |>
count(month = floor_date(date_filed, "month"),
has_debt = debt_amt > 100) |>
group_by(month) |>
mutate(lab = round(n/sum(n) * 100, digits = 0)) |>
filter(has_debt == TRUE) |>
ggplot() +
geom_col(aes(x = month,
y = lab/100),
fill = "black") +
scale_y_continuous(labels = scales::percent) +
theme_bw() +
labs(title = "Percent of eviction filings with debt > $100",
subtitle = "Evictions filed 2019-present, Tulsa County",
x = NULL,
y = NULL)
defs |>
group_by(def_id) |>
count() |>
view()