-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathjamaa_clustering.sql
373 lines (289 loc) · 12.5 KB
/
jamaa_clustering.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
/* STEP 1: Preparing the points table
a. Duplicate the points table into a table that you can edit so that you maintain original data
b. Create a geometry column for the voters (points) table
c. Populate the geometry column in the native projection
d. Update the geometry column to use the same projection as the Census Files
e. Add a spatial index to the table
*/
-- 1a
DROP TABLE IF EXISTS jamaa_va_export_clustering;
-- 1b
CREATE TABLE jamaa_va_export_clustering
AS (SELECT jva.*
FROM jamaa_va_export_original AS jva);
ALTER TABLE
jamaa_va_export_clustering
ADD COLUMN IF NOT EXISTS geom GEOMETRY(Point, 4326);
-- 1c
UPDATE
jamaa_va_export_clustering
SET geom=ST_SetSRID(ST_MakePoint(longitude, latitude), 4326);
-- 1d
ALTER TABLE jamaa_va_export_clustering
ALTER COLUMN geom TYPE GEOMETRY(Point, 4269) USING ST_Transform(geom, 4269);
-- 1e
DROP INDEX IF EXISTS
jamaa_va_export_clustering_sdx;
CREATE INDEX
jamaa_va_export_clustering_sdx
ON
jamaa_va_export_clustering
USING GIST (geom);
/* STEP 2: Filter out the voters who are in the wrong state, but keep them. This can be done for other geographies.
a. Alter the voters table to add a column for whether the labeled state matches the spatially matched state.
b. Update the state match column with a spatial query
c. Reproject the points table into a projection that uses meters for distance (https://epsg.io/3968)
*/
--2a
ALTER TABLE jamaa_va_export_clustering
DROP COLUMN IF EXISTS hd_label_spatial_matches,
DROP COLUMN IF EXISTS sd_label_spatial_matches,
DROP COLUMN IF EXISTS cd_label_spatial_matches,
DROP COLUMN IF EXISTS county_label_spatial_matches,
DROP COLUMN IF EXISTS state_label_spatial_matches,
ADD COLUMN hd_label_spatial_matches BOOLEAN DEFAULT TRUE,
ADD COLUMN sd_label_spatial_matches BOOLEAN DEFAULT TRUE,
ADD COLUMN cd_label_spatial_matches BOOLEAN DEFAULT TRUE,
ADD COLUMN county_label_spatial_matches BOOLEAN DEFAULT TRUE,
ADD COLUMN state_label_spatial_matches BOOLEAN DEFAULT FALSE;
-- 2b
UPDATE jamaa_va_export_clustering AS v
SET state_label_spatial_matches = 't'
FROM tl_2019_us_state AS state
WHERE TRIM(v.state) = TRIM(state.stusps)
AND ST_CONTAINS(state.geom, v.geom);
-- 2c
ALTER TABLE jamaa_va_export_clustering
ALTER COLUMN geom TYPE GEOMETRY(Point, 3968) USING ST_Transform(geom, 3968);
DROP INDEX IF EXISTS
jamaa_va_export_clustering_sdx;
CREATE INDEX
jamaa_va_export_clustering_sdx
ON
jamaa_va_export_clustering
USING GIST (geom);
/* STEP 3. Create a clustering using DBSCAN
a. Delete rows with values outside the state
b. handled elsewhere
c. Add a column for cluster_id
d. Execute DBSCAN using CLUSTERWITHIN
e. Create a geometry index for the table - tablename_sdx
*/
-- 3a
DELETE
FROM jamaa_va_export_clustering AS jvm
WHERE jvm.state_label_spatial_matches = FALSE;
ALTER TABLE jamaa_va_export_clustering
DROP COLUMN IF EXISTS cluster_id,
ADD COLUMN IF NOT EXISTS cluster_id NUMERIC DEFAULT NULL;
-- 3d
-- This DBSCAN says you need five people together within 20 meters of each other
UPDATE jamaa_va_export_clustering AS jvm
SET cluster_id =
CASE
WHEN sq.cluster_id > -1
THEN sq.cluster_id
ELSE
-1
END
FROM (SELECT jvm.vanid,
ST_ClusterDBSCAN(jvm.geom, eps := 20, minPoints := 5)
OVER (ORDER BY jvm.vanid) AS cluster_id
FROM jamaa_va_export_clustering AS jvm
WHERE TRIM(jvm.racename) = 'African-American'
AND jvm.age > 50
) AS sq
WHERE jvm.vanid = sq.vanid;
-- But statewide clustering is not very useful - we wind up with so many clusters that
-- we have to start partitioning by smaller geographies to make this useful
/* Step 4: Add clustering within smaller geographies (CD, SD, HD)
a. Add necessary columns
b. Execute the queries
*/
-- 4a
ALTER TABLE jamaa_va_export_clustering
DROP COLUMN IF EXISTS cluster_id_within_cd,
ADD COLUMN IF NOT EXISTS cluster_id_within_cd TEXT DEFAULT '-1';
ALTER TABLE jamaa_va_export_clustering
DROP COLUMN IF EXISTS cluster_id_within_sd,
ADD COLUMN IF NOT EXISTS cluster_id_within_sd TEXT DEFAULT '-1';
ALTER TABLE jamaa_va_export_clustering
DROP COLUMN IF EXISTS cluster_id_within_hd,
ADD COLUMN IF NOT EXISTS cluster_id_within_hd TEXT DEFAULT '-1';
-- 4b
-- DBSCAN within CD
UPDATE jamaa_va_export_clustering AS jvm
SET cluster_id_within_cd = CASE
WHEN sq.cluster_id_within_cd IS NOT NULL
THEN
jvm.cd || '-' || jvm.precinctname || '-' || sq.cluster_id_within_cd::TEXT
ELSE '-1'
END
FROM (SELECT jvm.vanid,
ST_ClusterDBSCAN(jvm.geom, eps := 20, minPoints := 5)
OVER (PARTITION BY jvm.cd ORDER BY jvm.vanid) AS cluster_id_within_cd
FROM jamaa_va_export_clustering AS jvm
WHERE TRIM(jvm.racename) = 'African-American'
AND jvm.age > 50
) AS sq
WHERE jvm.vanid = sq.vanid;
-- -- make this easier for boundary estimation later by not having NULL
--
-- UPDATE jamaa_va_export_clustering AS jvm
-- SET cluster_id_within_cd = -1
-- WHERE jvm.cluster_id_within_cd IS NULL;
-- DBSCAN within SD
UPDATE jamaa_va_export_clustering AS jvm
SET cluster_id_within_sd = CASE
WHEN sq.cluster_id_within_sd IS NOT NULL
THEN
jvm.sd || '-' || jvm.precinctname || '-' || sq.cluster_id_within_sd::TEXT
ELSE '-1'
END
FROM (SELECT jvm.vanid,
ST_ClusterDBSCAN(jvm.geom, eps := 20, minPoints := 5)
OVER (PARTITION BY jvm.sd ORDER BY jvm.vanid) AS cluster_id_within_sd
FROM jamaa_va_export_clustering AS jvm
WHERE TRIM(jvm.racename) = 'African-American'
AND jvm.age > 50
) AS sq
WHERE jvm.vanid = sq.vanid;
-- make this easier for boundary estimation later by not having NULL
-- UPDATE jamaa_va_export_clustering AS jvm
-- SET cluster_id_within_sd = -1
-- WHERE jvm.cluster_id_within_sd IS NULL;
-- DBSCAN WITHIN HD
UPDATE jamaa_va_export_clustering AS jvm
SET cluster_id_within_hd = CASE
WHEN sq.cluster_id_within_hd IS NOT NULL
THEN
jvm.hd || '-' || jvm.precinctname || '-' || sq.cluster_id_within_hd::TEXT
ELSE '-1'
END
FROM (SELECT jvm.vanid,
ST_ClusterDBSCAN(jvm.geom, eps := 20, minPoints := 5)
OVER (PARTITION BY jvm.hd ORDER BY jvm.vanid) AS cluster_id_within_hd
FROM jamaa_va_export_clustering AS jvm
WHERE TRIM(jvm.racename) = 'African-American'
AND jvm.age > 50
) AS sq
WHERE jvm.vanid = sq.vanid;
-- UPDATE jamaa_va_export_clustering AS jvm
-- SET cluster_id_within_hd = -1
-- WHERE jvm.cluster_id_within_hd IS NULL;
-- DBSCAN by precinct Within CD
/* Even those smaller geographies aren't very useful, but precincts within them could be.
a. Add columns for cluster id's within smaller geogrpahies, precincts
b. Do a DBScan for each
*/
-- 5a
ALTER TABLE jamaa_va_export_clustering
DROP COLUMN IF EXISTS cluster_id_within_cd_and_precinct,
ADD COLUMN IF NOT EXISTS cluster_id_within_cd_and_precinct TEXT DEFAULT '-1';
ALTER TABLE jamaa_va_export_clustering
DROP COLUMN IF EXISTS cluster_id_within_sd_and_precinct,
ADD COLUMN IF NOT EXISTS cluster_id_within_sd_and_precinct TEXT DEFAULT '-1';
ALTER TABLE jamaa_va_export_clustering
DROP COLUMN IF EXISTS cluster_id_within_hd_and_precinct,
ADD COLUMN IF NOT EXISTS cluster_id_within_hd_and_precinct TEXT DEFAULT '-1';
-- 5b
-- DBSCAN within CD and Precinct
UPDATE jamaa_va_export_clustering AS jvm
SET cluster_id_within_cd_and_precinct = CASE
WHEN sq.cluster_id_within_cd_and_precinct IS NOT NULL
THEN
jvm.hd || '-' || jvm.precinctname || '-' ||
sq.cluster_id_within_cd_and_precinct::TEXT
ELSE '-1'
END
FROM (SELECT jvm.vanid,
ST_ClusterDBSCAN(jvm.geom, eps := 20, minPoints := 5)
OVER (PARTITION BY jvm.cd, jvm.precinctname ORDER BY jvm.vanid) AS cluster_id_within_cd_and_precinct
FROM jamaa_va_export_clustering AS jvm
WHERE TRIM(jvm.racename) = 'African-American'
AND jvm.age > 50
) AS sq
WHERE jvm.vanid = sq.vanid;
-- UPDATE jamaa_va_export_clustering AS jvm
-- SET cluster_id_within_cd_and_precinct = -1
-- WHERE jvm.cluster_id_within_cd_and_precinct IS NULL;
-- DBSCAN within SD and Precinct
UPDATE jamaa_va_export_clustering AS jvm
SET cluster_id_within_sd_and_precinct = CASE
WHEN sq.cluster_id_within_sd_and_precinct IS NOT NULL
THEN
jvm.hd || '-' || jvm.precinctname || '-' ||
sq.cluster_id_within_sd_and_precinct::TEXT
ELSE '-1'
END
FROM (SELECT jvm.vanid,
ST_ClusterDBSCAN(jvm.geom, eps := 20, minPoints := 5)
OVER (PARTITION BY jvm.sd, jvm.precinctname ORDER BY jvm.vanid) AS cluster_id_within_sd_and_precinct
FROM jamaa_va_export_clustering AS jvm
WHERE TRIM(jvm.racename) = 'African-American'
AND jvm.age > 50
) AS sq
WHERE jvm.vanid = sq.vanid;
-- UPDATE jamaa_va_export_clustering AS jvm
-- SET cluster_id_within_sd_and_precinct = -1
-- WHERE jvm.cluster_id_within_sd_and_precinct IS NULL;
-- DBSCAN WITHIN HD and Precinct
UPDATE jamaa_va_export_clustering AS jvm
SET cluster_id_within_hd_and_precinct = CASE
WHEN sq.cluster_id_within_hd_and_precinct IS NOT NULL
THEN
jvm.hd || '-' || jvm.precinctname || '-' ||
sq.cluster_id_within_hd_and_precinct::TEXT
ELSE '-1'
END
FROM (SELECT jvm.vanid,
ST_ClusterDBSCAN(jvm.geom, eps := 20, minPoints := 5)
OVER (PARTITION BY jvm.hd, jvm.precinctname ORDER BY jvm.vanid) AS cluster_id_within_hd_and_precinct
FROM jamaa_va_export_clustering AS jvm
WHERE TRIM(jvm.racename) = 'African-American'
AND jvm.age > 50
) AS sq
WHERE jvm.vanid = sq.vanid;
-- UPDATE jamaa_va_export_clustering AS jvm
-- SET cluster_id_within_hd_and_precinct = -1
-- WHERE jvm.cluster_id_within_hd_and_precinct IS NULL;
/*
Step 6: Create distinct tables for each dataset of clusters. This is really bad boundary estimation because
convex_hull is not parsimonious. You will get something that holds all the points, but it will contain a lot of
extra space. Put another way, if you use convex_hull to get the boundaries of [A,B,C,D], A will contain parts of
[B..D], etc.
This is here for people who can't get boundary estimation from me.
a. CD & Precinct
b. SD & Precinct
c. HD & Precinct
*/
-- 6a
DROP TABLE IF EXISTS jamaa_export_clustering_cd_and_precinct;
CREATE TABLE jamaa_export_clustering_cd_and_precinct AS (
SELECT ROW_NUMBER() OVER () as gid,
jvm.cluster_id_within_cd_and_precinct,
st_convexhull(st_collect(jvm.geom)) AS geom
FROM jamaa_va_export_clustering AS jvm
WHERE jvm.cluster_id_within_cd_and_precinct != '-1'
GROUP BY jvm.cluster_id_within_cd_and_precinct
);
-- 6b
DROP TABLE IF EXISTS jamaa_export_clustering_sd_and_precinct;
CREATE TABLE jamaa_export_clustering_sd_and_precinct AS (
SELECT ROW_NUMBER() OVER () as gid,
jvm.cluster_id_within_sd_and_precinct,
st_convexhull(st_collect(jvm.geom)) AS geom
FROM jamaa_va_export_clustering AS jvm
WHERE jvm.cluster_id_within_sd_and_precinct != '-1'
GROUP BY jvm.cluster_id_within_sd_and_precinct
);
-- 6c
DROP TABLE IF EXISTS jamaa_export_clustering_hd_and_precinct;
CREATE TABLE jamaa_export_clustering_hd_and_precinct AS (
SELECT ROW_NUMBER() OVER () as gid,
jvm.cluster_id_within_hd_and_precinct,
st_convexhull(st_collect(jvm.geom)) AS geom
FROM jamaa_va_export_clustering AS jvm
WHERE jvm.cluster_id_within_hd_and_precinct != '-1'
GROUP BY jvm.cluster_id_within_hd_and_precinct
);