-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_icu_data.py
564 lines (498 loc) · 22.6 KB
/
get_icu_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
"""
Code by Maggie Makar
https://github.com/mlhc17mit/pset1materials/blob/master/mort_icu_cleanup.py
"""
# for those interested, the following notebooks are useful to understan
# how the data are organized
# Import libraries
import numpy as np
import pandas as pd
import psycopg2
from scipy.stats import ks_2samp
import os
import random
import py.test
# from bs4 import BeautifulSoup
DIAG_BOOL = True
mimicdir = os.path.expanduser("~/Documents/mimic-data")
random.seed(22891)
# create a database connection
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'
# Connect to local postgres version of mimic
con = psycopg2.connect(dbname=dbname, user=sqluser)
cur = con.cursor()
cur.execute('SET search_path to ' + schema_name)
#========helper function for imputing missing values
def replace(group):
"""
takes in a pandas group, and replaces the
null value with the mean of the none null
values of the same group
"""
mask = group.isnull()
group[mask] = group[~mask].mean()
return group
#========get the icu details
# this query extracts the following:
# Unique ids for the admission, patient and icu stay
# Patient gender
# admission & discharge times
# length of stay
# age
# ethnicity
# admission type
# in hospital death?
# in icu death?
# one year from admission death?
# first hospital stay
# icu intime, icu outime
# los in icu
# first icu stay?
denquery = \
"""
-- This query extracts useful demographic/administrative information for patient ICU stays
--DROP MATERIALIZED VIEW IF EXISTS icustay_detail CASCADE;
--CREATE MATERIALIZED VIEW icustay_detail as
--ie is the icustays table
--adm is the admissions table
SELECT ie.subject_id, ie.hadm_id, ie.icustay_id
, pat.gender
, adm.admittime, adm.dischtime, adm.diagnosis
, ROUND( (CAST(adm.dischtime AS DATE) - CAST(adm.admittime AS DATE)) , 4) AS los_hospital
, ROUND( (CAST(adm.admittime AS DATE) - CAST(pat.dob AS DATE)) / 365, 4) AS age
, adm.ethnicity, adm.ADMISSION_TYPE
--, adm.hospital_expire_flag
, CASE when adm.deathtime between adm.admittime and adm.dischtime THEN 1 ELSE 0 END AS mort_hosp
, CASE when adm.deathtime between ie.intime and ie.outtime THEN 1 ELSE 0 END AS mort_icu
, CASE when adm.deathtime between adm.admittime and adm.admittime + interval '365' day THEN 1 ELSE 0 END AS mort_oneyr
, DENSE_RANK() OVER (PARTITION BY adm.subject_id ORDER BY adm.admittime) AS hospstay_seq
, CASE
WHEN DENSE_RANK() OVER (PARTITION BY adm.subject_id ORDER BY adm.admittime) = 1 THEN 1
ELSE 0 END AS first_hosp_stay
-- icu level factors
, ie.intime, ie.outtime
, ie.FIRST_CAREUNIT
, ROUND( (CAST(ie.outtime AS DATE) - CAST(ie.intime AS DATE)) , 4) AS los_icu
, DENSE_RANK() OVER (PARTITION BY ie.hadm_id ORDER BY ie.intime) AS icustay_seq
-- first ICU stay *for the current hospitalization*
, CASE
WHEN DENSE_RANK() OVER (PARTITION BY ie.hadm_id ORDER BY ie.intime) = 1 THEN 1
ELSE 0 END AS first_icu_stay
FROM icustays ie
INNER JOIN admissions adm
ON ie.hadm_id = adm.hadm_id
INNER JOIN patients pat
ON ie.subject_id = pat.subject_id
-- LEFT OUTER JOIN diagnoses_icd diag
-- ON ie.subject_id = diag.subject_id and ie.hadm_id = diag.hadm_id
WHERE adm.has_chartevents_data = 1
ORDER BY ie.subject_id, adm.admittime, ie.intime;
"""
den = pd.read_sql_query(denquery,con)
#----drop patients with less than 48 hour
den['los_icu_hr'] = (den.outtime - den.intime).astype('timedelta64[h]')
den = den[(den.los_icu_hr >= 48)]
den = den[(den.age<300)]
den.drop('los_icu_hr', 1, inplace = True)
print 'len of den: %d' % len(den)
# den.isnull().sum()
#----clean up
# micu --> medical
# csru --> cardiac surgery recovery unit
# sicu --> surgical icu
# tsicu --> Trauma Surgical Intensive Care Unit
# NICU --> Neonatal
den['adult_icu'] = np.where(den['first_careunit'].isin(['PICU', 'NICU']), 0, 1)
den['gender'] = np.where(den['gender']=="M", 1, 0)
# h_los_dummies = pd.qcut(den['los_hospital'], [0, .25, .5, .75, 1.],labels=False) +1
# den = pd.concat([den, pd.get_dummies(h_los_dummies, prefix='admlos')], 1)
# icu_los_dummies = pd.qcut(den['los_icu'], [0, .25, .5, .75, 1.],labels=False) +1
# den = pd.concat([den, pd.get_dummies(icu_los_dummies, prefix='iculos')], 1)
# age_dummies = pd.cut(den['age'], [-1,5,10,15,20, 25, 40,60, 80, 200],
# labels = ['l5','5_10', '10_15', '15_20', '20_25', '25_40', '40_60', '60_80', '80p'])
# den = pd.concat([den, pd.get_dummies(age_dummies, prefix='age')], 1)
# get ethnicity group, assuming a certain order arbitrarily
def eth(s):
if s.contains('^white'):
return 'white'
elif s.contains('^black'):
return 'black'
elif s.contains('^hisp') or s.contains('^latin'):
return 'hispanic'
elif s.contains('^asian'):
return 'asian'
else:
return 'other'
# no need to yell
den.ethnicity = den.ethnicity.str.lower()
den.ethnicity.loc[(den.ethnicity.str.contains('^white'))] = 'white'
den.ethnicity.loc[(den.ethnicity.str.contains('^black'))] = 'black'
den.ethnicity.loc[(den.ethnicity.str.contains('^hisp')) | (den.ethnicity.str.contains('^latin'))] = 'hispanic'
den.ethnicity.loc[(den.ethnicity.str.contains('^asia'))] = 'asian'
den.ethnicity.loc[~(den.ethnicity.str.contains('|'.join(['white', 'black', 'hispanic', 'asian'])))] = 'other'
den = pd.concat([den, pd.get_dummies(den['ethnicity'], prefix='eth')], 1)
den = pd.concat([den, pd.get_dummies(den['admission_type'], prefix='admType')], 1)
den.drop(['diagnosis', 'hospstay_seq', 'los_icu','icustay_seq', 'admittime', 'dischtime','los_hospital', 'intime', 'outtime', 'ethnicity', 'admission_type', 'first_careunit'], 1, inplace =True)
#========= 48 hour vitals query
# these are the normal ranges. useful to clean
# up the data
vitquery = \
"""
-- This query pivots the vital signs for the first 48 hours of a patient's stay
-- Vital signs include heart rate, blood pressure, respiration rate, and temperature
-- DROP MATERIALIZED VIEW IF EXISTS vitalsfirstday CASCADE;
-- create materialized view vitalsfirstday as
SELECT pvt.subject_id, pvt.hadm_id, pvt.icustay_id
-- Easier names
, min(case when VitalID = 1 then valuenum else null end) as HeartRate_Min
, max(case when VitalID = 1 then valuenum else null end) as HeartRate_Max
, avg(case when VitalID = 1 then valuenum else null end) as HeartRate_Mean
, min(case when VitalID = 2 then valuenum else null end) as SysBP_Min
, max(case when VitalID = 2 then valuenum else null end) as SysBP_Max
, avg(case when VitalID = 2 then valuenum else null end) as SysBP_Mean
, min(case when VitalID = 3 then valuenum else null end) as DiasBP_Min
, max(case when VitalID = 3 then valuenum else null end) as DiasBP_Max
, avg(case when VitalID = 3 then valuenum else null end) as DiasBP_Mean
, min(case when VitalID = 4 then valuenum else null end) as MeanBP_Min
, max(case when VitalID = 4 then valuenum else null end) as MeanBP_Max
, avg(case when VitalID = 4 then valuenum else null end) as MeanBP_Mean
, min(case when VitalID = 5 then valuenum else null end) as RespRate_Min
, max(case when VitalID = 5 then valuenum else null end) as RespRate_Max
, avg(case when VitalID = 5 then valuenum else null end) as RespRate_Mean
, min(case when VitalID = 6 then valuenum else null end) as TempC_Min
, max(case when VitalID = 6 then valuenum else null end) as TempC_Max
, avg(case when VitalID = 6 then valuenum else null end) as TempC_Mean
, min(case when VitalID = 7 then valuenum else null end) as SpO2_Min
, max(case when VitalID = 7 then valuenum else null end) as SpO2_Max
, avg(case when VitalID = 7 then valuenum else null end) as SpO2_Mean
, min(case when VitalID = 8 then valuenum else null end) as Glucose_Min
, max(case when VitalID = 8 then valuenum else null end) as Glucose_Max
, avg(case when VitalID = 8 then valuenum else null end) as Glucose_Mean
FROM (
select ie.subject_id, ie.hadm_id, ie.icustay_id
, case
when itemid in (211,220045) and valuenum > 0 and valuenum < 300 then 1 -- HeartRate
when itemid in (51,442,455,6701,220179,220050) and valuenum > 0 and valuenum < 400 then 2 -- SysBP
when itemid in (8368,8440,8441,8555,220180,220051) and valuenum > 0 and valuenum < 300 then 3 -- DiasBP
when itemid in (456,52,6702,443,220052,220181,225312) and valuenum > 0 and valuenum < 300 then 4 -- MeanBP
when itemid in (615,618,220210,224690) and valuenum > 0 and valuenum < 70 then 5 -- RespRate
when itemid in (223761,678) and valuenum > 70 and valuenum < 120 then 6 -- TempF, converted to degC in valuenum call
when itemid in (223762,676) and valuenum > 10 and valuenum < 50 then 6 -- TempC
when itemid in (646,220277) and valuenum > 0 and valuenum <= 100 then 7 -- SpO2
when itemid in (807,811,1529,3745,3744,225664,220621,226537) and valuenum > 0 then 8 -- Glucose
else null end as VitalID
-- convert F to C
, case when itemid in (223761,678) then (valuenum-32)/1.8 else valuenum end as valuenum
from icustays ie
left join chartevents ce
on ie.subject_id = ce.subject_id and ie.hadm_id = ce.hadm_id and ie.icustay_id = ce.icustay_id
and ce.charttime between ie.intime and ie.intime + interval '48' hour
-- exclude rows marked as error
and ce.error IS DISTINCT FROM 1
where ce.itemid in
(
-- HEART RATE
211, --"Heart Rate"
220045, --"Heart Rate"
-- Systolic/diastolic
51, -- Arterial BP [Systolic]
442, -- Manual BP [Systolic]
455, -- NBP [Systolic]
6701, -- Arterial BP #2 [Systolic]
220179, -- Non Invasive Blood Pressure systolic
220050, -- Arterial Blood Pressure systolic
8368, -- Arterial BP [Diastolic]
8440, -- Manual BP [Diastolic]
8441, -- NBP [Diastolic]
8555, -- Arterial BP #2 [Diastolic]
220180, -- Non Invasive Blood Pressure diastolic
220051, -- Arterial Blood Pressure diastolic
-- MEAN ARTERIAL PRESSURE
456, --"NBP Mean"
52, --"Arterial BP Mean"
6702, -- Arterial BP Mean #2
443, -- Manual BP Mean(calc)
220052, --"Arterial Blood Pressure mean"
220181, --"Non Invasive Blood Pressure mean"
225312, --"ART BP mean"
-- RESPIRATORY RATE
618,-- Respiratory Rate
615,-- Resp Rate (Total)
220210,-- Respiratory Rate
224690, -- Respiratory Rate (Total)
-- SPO2, peripheral
646, 220277,
-- GLUCOSE, both lab and fingerstick
807,-- Fingerstick Glucose
811,-- Glucose (70-105)
1529,-- Glucose
3745,-- BloodGlucose
3744,-- Blood Glucose
225664,-- Glucose finger stick
220621,-- Glucose (serum)
226537,-- Glucose (whole blood)
-- TEMPERATURE
223762, -- "Temperature Celsius"
676, -- "Temperature C"
223761, -- "Temperature Fahrenheit"
678 -- "Temperature F"
)
) pvt
group by pvt.subject_id, pvt.hadm_id, pvt.icustay_id
order by pvt.subject_id, pvt.hadm_id, pvt.icustay_id;
"""
vit48 = pd.read_sql_query(vitquery,con)
vit48.isnull().sum()
#===============48 hour labs query
# This query does the following:
# it extracts the lab events in the first 48 hours
# it labels the lab items and cleans up their values
# it will create a set of lab values
# 48 hours.
labquery = \
"""
WITH pvt AS (
--- ie is the icu stay
--- ad is the admissions table
--- le is the lab events table
SELECT ie.subject_id, ie.hadm_id, ie.icustay_id, le.charttime
-- here we assign labels to ITEMIDs
-- this also fuses together multiple ITEMIDs containing the same data
, CASE
when le.itemid = 50868 then 'ANION GAP'
when le.itemid = 50862 then 'ALBUMIN'
when le.itemid = 50882 then 'BICARBONATE'
when le.itemid = 50885 then 'BILIRUBIN'
when le.itemid = 50912 then 'CREATININE'
when le.itemid = 50806 then 'CHLORIDE'
when le.itemid = 50902 then 'CHLORIDE'
when le.itemid = 50809 then 'GLUCOSE'
when le.itemid = 50931 then 'GLUCOSE'
when le.itemid = 50810 then 'HEMATOCRIT'
when le.itemid = 51221 then 'HEMATOCRIT'
when le.itemid = 50811 then 'HEMOGLOBIN'
when le.itemid = 51222 then 'HEMOGLOBIN'
when le.itemid = 50813 then 'LACTATE'
when le.itemid = 50960 then 'MAGNESIUM'
when le.itemid = 50970 then 'PHOSPHATE'
when le.itemid = 51265 then 'PLATELET'
when le.itemid = 50822 then 'POTASSIUM'
when le.itemid = 50971 then 'POTASSIUM'
when le.itemid = 51275 then 'PTT'
when le.itemid = 51237 then 'INR'
when le.itemid = 51274 then 'PT'
when le.itemid = 50824 then 'SODIUM'
when le.itemid = 50983 then 'SODIUM'
when le.itemid = 51006 then 'BUN'
when le.itemid = 51300 then 'WBC'
when le.itemid = 51301 then 'WBC'
ELSE null
END AS label
, -- add in some sanity checks on the values
-- the where clause below requires all valuenum to be > 0,
-- so these are only upper limit checks
CASE
when le.itemid = 50862 and le.valuenum > 10 then null -- g/dL 'ALBUMIN'
when le.itemid = 50868 and le.valuenum > 10000 then null -- mEq/L 'ANION GAP'
when le.itemid = 50882 and le.valuenum > 10000 then null -- mEq/L 'BICARBONATE'
when le.itemid = 50885 and le.valuenum > 150 then null -- mg/dL 'BILIRUBIN'
when le.itemid = 50806 and le.valuenum > 10000 then null -- mEq/L 'CHLORIDE'
when le.itemid = 50902 and le.valuenum > 10000 then null -- mEq/L 'CHLORIDE'
when le.itemid = 50912 and le.valuenum > 150 then null -- mg/dL 'CREATININE'
when le.itemid = 50809 and le.valuenum > 10000 then null -- mg/dL 'GLUCOSE'
when le.itemid = 50931 and le.valuenum > 10000 then null -- mg/dL 'GLUCOSE'
when le.itemid = 50810 and le.valuenum > 100 then null -- % 'HEMATOCRIT'
when le.itemid = 51221 and le.valuenum > 100 then null -- % 'HEMATOCRIT'
when le.itemid = 50811 and le.valuenum > 50 then null -- g/dL 'HEMOGLOBIN'
when le.itemid = 51222 and le.valuenum > 50 then null -- g/dL 'HEMOGLOBIN'
when le.itemid = 50813 and le.valuenum > 50 then null -- mmol/L 'LACTATE'
when le.itemid = 50960 and le.valuenum > 60 then null -- mmol/L 'MAGNESIUM'
when le.itemid = 50970 and le.valuenum > 60 then null -- mg/dL 'PHOSPHATE'
when le.itemid = 51265 and le.valuenum > 10000 then null -- K/uL 'PLATELET'
when le.itemid = 50822 and le.valuenum > 30 then null -- mEq/L 'POTASSIUM'
when le.itemid = 50971 and le.valuenum > 30 then null -- mEq/L 'POTASSIUM'
when le.itemid = 51275 and le.valuenum > 150 then null -- sec 'PTT'
when le.itemid = 51237 and le.valuenum > 50 then null -- 'INR'
when le.itemid = 51274 and le.valuenum > 150 then null -- sec 'PT'
when le.itemid = 50824 and le.valuenum > 200 then null -- mEq/L == mmol/L 'SODIUM'
when le.itemid = 50983 and le.valuenum > 200 then null -- mEq/L == mmol/L 'SODIUM'
when le.itemid = 51006 and le.valuenum > 300 then null -- 'BUN'
when le.itemid = 51300 and le.valuenum > 1000 then null -- 'WBC'
when le.itemid = 51301 and le.valuenum > 1000 then null -- 'WBC'
ELSE le.valuenum
END AS valuenum
FROM icustays ie
LEFT JOIN labevents le
ON le.subject_id = ie.subject_id
AND le.hadm_id = ie.hadm_id
-- TODO: they are using lab times 6 hours before the start of the
-- ICU stay.
AND le.charttime between (ie.intime - interval '6' hour)
AND (ie.intime + interval '48' hour)
AND le.itemid IN
(
-- comment is: LABEL | CATEGORY | FLUID | NUMBER OF ROWS IN LABEVENTS
50868, -- ANION GAP | CHEMISTRY | BLOOD | 769895
50862, -- ALBUMIN | CHEMISTRY | BLOOD | 146697
50882, -- BICARBONATE | CHEMISTRY | BLOOD | 780733
50885, -- BILIRUBIN, TOTAL | CHEMISTRY | BLOOD | 238277
50912, -- CREATININE | CHEMISTRY | BLOOD | 797476
50902, -- CHLORIDE | CHEMISTRY | BLOOD | 795568
50806, -- CHLORIDE, WHOLE BLOOD | BLOOD GAS | BLOOD | 48187
50931, -- GLUCOSE | CHEMISTRY | BLOOD | 748981
50809, -- GLUCOSE | BLOOD GAS | BLOOD | 196734
51221, -- HEMATOCRIT | HEMATOLOGY | BLOOD | 881846
50810, -- HEMATOCRIT, CALCULATED | BLOOD GAS | BLOOD | 89715
51222, -- HEMOGLOBIN | HEMATOLOGY | BLOOD | 752523
50811, -- HEMOGLOBIN | BLOOD GAS | BLOOD | 89712
50813, -- LACTATE | BLOOD GAS | BLOOD | 187124
50960, -- MAGNESIUM | CHEMISTRY | BLOOD | 664191
50970, -- PHOSPHATE | CHEMISTRY | BLOOD | 590524
51265, -- PLATELET COUNT | HEMATOLOGY | BLOOD | 778444
50971, -- POTASSIUM | CHEMISTRY | BLOOD | 845825
50822, -- POTASSIUM, WHOLE BLOOD | BLOOD GAS | BLOOD | 192946
51275, -- PTT | HEMATOLOGY | BLOOD | 474937
51237, -- INR(PT) | HEMATOLOGY | BLOOD | 471183
51274, -- PT | HEMATOLOGY | BLOOD | 469090
50983, -- SODIUM | CHEMISTRY | BLOOD | 808489
50824, -- SODIUM, WHOLE BLOOD | BLOOD GAS | BLOOD | 71503
51006, -- UREA NITROGEN | CHEMISTRY | BLOOD | 791925
51301, -- WHITE BLOOD CELLS | HEMATOLOGY | BLOOD | 753301
51300 -- WBC COUNT | HEMATOLOGY | BLOOD | 2371
)
AND le.valuenum IS NOT null
AND le.valuenum > 0 -- lab values cannot be 0 and cannot be negative
LEFT JOIN admissions ad
ON ie.subject_id = ad.subject_id
AND ie.hadm_id = ad.hadm_id
),
ranked AS (
SELECT pvt.*, DENSE_RANK() OVER (PARTITION BY
pvt.subject_id, pvt.hadm_id,pvt.icustay_id,pvt.label ORDER BY pvt.charttime) as drank
FROM pvt
)
SELECT r.subject_id, r.hadm_id, r.icustay_id
, max(case when label = 'ANION GAP' then valuenum else null end) as ANIONGAP
, max(case when label = 'ALBUMIN' then valuenum else null end) as ALBUMIN
, max(case when label = 'BICARBONATE' then valuenum else null end) as BICARBONATE
, max(case when label = 'BILIRUBIN' then valuenum else null end) as BILIRUBIN
, max(case when label = 'CREATININE' then valuenum else null end) as CREATININE
, max(case when label = 'CHLORIDE' then valuenum else null end) as CHLORIDE
, max(case when label = 'GLUCOSE' then valuenum else null end) as GLUCOSE
, max(case when label = 'HEMATOCRIT' then valuenum else null end) as HEMATOCRIT
, max(case when label = 'HEMOGLOBIN' then valuenum else null end) as HEMOGLOBIN
, max(case when label = 'LACTATE' then valuenum else null end) as LACTATE
, max(case when label = 'MAGNESIUM' then valuenum else null end) as MAGNESIUM
, max(case when label = 'PHOSPHATE' then valuenum else null end) as PHOSPHATE
, max(case when label = 'PLATELET' then valuenum else null end) as PLATELET
, max(case when label = 'POTASSIUM' then valuenum else null end) as POTASSIUM
, max(case when label = 'PTT' then valuenum else null end) as PTT
, max(case when label = 'INR' then valuenum else null end) as INR
, max(case when label = 'PT' then valuenum else null end) as PT
, max(case when label = 'SODIUM' then valuenum else null end) as SODIUM
, max(case when label = 'BUN' then valuenum else null end) as BUN
, max(case when label = 'WBC' then valuenum else null end) as WBC
FROM ranked r
WHERE r.drank = 1
GROUP BY r.subject_id, r.hadm_id, r.icustay_id, r.drank
ORDER BY r.subject_id, r.hadm_id, r.icustay_id, r.drank;
"""
lab48 = pd.read_sql_query(labquery,con)
# normal lab ranges, might be useful
# lab_ranges = {'aniongap': [8, 16],
# 'albumin': [3.5, 5.5],
# 'bicarbonate': [22,32],
# 'bilirubin': [0.3, 1.9],
# 'creatinine': [0.4,1.1],
# 'chloride': [96,108],
# 'calcium': [8.4,10.3],
# 'hemoglobin': [11.2,15.7],
# 'lactate': [0.5,2.0],
# 'magnesium': [1.6,2.6],
# 'phosphate': [2.7,4.5],
# 'platelet': [150,400],
# 'potassium': [3.3,5.1],
# 'sodium': [133,145]
# }
#=========notes
notesquery = \
"""
SELECT fin.subject_id, fin.hadm_id, fin.icustay_id, string_agg(fin.text, ' ') as chartext
FROM (
select ie.subject_id, ie.hadm_id, ie.icustay_id, ne.text
from icustays ie
left join noteevents ne
on ie.subject_id = ne.subject_id and ie.hadm_id = ne.hadm_id
and ne.charttime between ie.intime and ie.intime + interval '48' hour
--and ne.iserror != '1'
) fin
group by fin.subject_id, fin.hadm_id, fin.icustay_id
order by fin.subject_id, fin.hadm_id, fin.icustay_id;
"""
notes48 = pd.read_sql_query(notesquery,con)
# clean up the text
# notes48['chartext'] = notes48['chartext'].str.lower()
# notes48['chartext'] = notes48['chartext'].str.replace('\n', '')
# notes48['chartext'] = notes48['chartext'].str.replace('_', '')
#=========icd9 dx
# dxquery = \
# """
# select dx.subject_id, dx.hadm_id, substring(dx.icd9_code, 3) as icd_dx
# from diagnoses_icd dx
# INNER JOIN admissions adm
# ON dx.hadm_id = adm.hadm_id
# and dx.subject_id = adm.subject_id
# """
# dxall = pd.read_sql_query(dxquery,con)
# #========icd9 proc
# procquery = \
# """
# select proc.subject_id, proc.hadm_id, substring(proc.icd9_code, 2) as icd_proc
# from procedures_icd proc
# INNER JOIN admissions adm
# ON proc.hadm_id = adm.hadm_id
# and proc.subject_id = adm.subject_id
# """
# procall = pd.read_sql_query(dxquery,con)
#=====combine all variables
print 'len 2 den: %d' % len(den)
print 'len vit48: %d' % len(vit48)
mort_ds = den.merge(vit48,how = 'left', on = ['subject_id', 'hadm_id', 'icustay_id'])
print 'len mort_ds: %d' % len(mort_ds)
print 'len lab48: %d' % len(lab48)
mort_ds = mort_ds.merge(lab48,how = 'left', on = ['subject_id', 'hadm_id', 'icustay_id'])
#======missing values (following joydeep ghosh's paper)
# create means by age group and gender
mort_ds['age_group'] = pd.cut(mort_ds['age'], [-1,5,10,15,20, 25, 40,60, 80, 200],
labels = ['l5','5_10', '10_15', '15_20', '20_25', '25_40', '40_60', '60_80', '80p'])
mort_ds['had_null'] = mort_ds.isnull().any(axis=1)
py.test.set_trace()
mort_ds = mort_ds.groupby(['age_group', 'gender'])
mort_ds = mort_ds.transform(replace)
# IC: program got stuck at this line. commenting out for now.
# mort_ds.drop('age_group', inplace =True)
# one missing variable
adult_icu = mort_ds[(mort_ds.adult_icu==1)].dropna()
print 'len adult_icu: %d' % len(adult_icu)
# create training and testing labels
# msk = np.random.rand(len(adult_icu)) < 0.7
# adult_icu['train'] = np.where(msk, 1, 0)
adult_icu.to_csv(os.path.join(mimicdir, 'adult_icu_nullinfo_2.gz'), compression='gzip', index = False)
"""
# notes - don't need right now
adult_notes = notes48.merge(adult_icu[['train', 'subject_id', 'hadm_id', 'icustay_id', 'mort_hosp', 'mort_icu', 'mort_oneyr']], how = 'right', on = ['subject_id', 'hadm_id', 'icustay_id'])
adult_notes.to_csv(os.path.join(mimicdir, 'adult_notes.gz'), compression='gzip', index = False)
# don't need NICU
# nicu
nicu_missing = mort_ds[(mort_ds.adult_icu==0)].isnull().sum()/mort_ds[(mort_ds.adult_icu==0)].shape[0]
removes = nicu_missing[(nicu_missing == 1) ]._index
n_icu = mort_ds[(mort_ds.adult_icu==0)].drop(removes, 1).dropna()
# create training and testing
msk = np.random.rand(len(n_icu)) < 0.7
n_icu['train'] = np.where(msk, 1, 0)
fpath = os.path.join(mimicdir, 'n_icu.gz')
n_icu.to_csv(fpath, compression='gzip', index = False)
print 'Created data in %s' % fpath
"""