-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreproc_extract_items_k-mimic.py
117 lines (91 loc) · 4.77 KB
/
preproc_extract_items_k-mimic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pandas as pd
# Root directory of the K-MIMIC EMR export; each hospital has its own sub-tree.
_EMR_ROOT = '/mnt/dataset/dataset-2064568781941768192/K-MIMIC/EMR'
# Sub-folders to read under each hospital ID, in processing order.
# Note that no 435 entry is listed for hospital 006.
_SUBFOLDERS_BY_HOSPITAL = {
    '001': ('433', '434', '435'),
    '002': ('433', '434', '435'),
    '006': ('433', '434'),
    '008': ('433', '434', '435'),
    '010': ('433', '434', '435'),
}
# Paths to the CHARTEVENTS CSV files, ordered by hospital and then sub-folder.
file_paths = [
    '/'.join((_EMR_ROOT, hospital, subfolder, 'CHARTEVENTS.csv'))
    for hospital, subfolders in _SUBFOLDERS_BY_HOSPITAL.items()
    for subfolder in subfolders
]
# Read settings
chunk_size = 100  # Number of rows per chunk
max_rows = 800000  # Maximum number of rows to read from each file
# Load at most `max_rows` rows from every CHARTEVENTS file, tag each row with
# the hospital it came from, and stack everything into `final_df`.
#
# Each file is read with a single `read_csv(..., nrows=max_rows)` call. The
# previous chunked loop kept every chunk in a list and concatenated them, so
# it saved no memory; it only added many tiny reads and an extra copy per file.
# `chunk_size` is therefore no longer consulted here.
all_chunks = []  # one DataFrame per input file
for file_path in file_paths:
    # Paths end in .../<hospital_id>/<subfolder>/CHARTEVENTS.csv. Indexing from
    # the end keeps the lookup correct if the dataset root directory moves.
    hospital_id = file_path.split('/')[-3]
    # Let pandas enforce the per-file row cap.
    df = pd.read_csv(file_path, nrows=max_rows)
    # Record the source hospital on every row.
    df['HOSPITAL_ID'] = hospital_id
    all_chunks.append(df)
# Combine all per-file DataFrames into a single DataFrame with a fresh index
final_df = pd.concat(all_chunks, ignore_index=True)
# Print the columns to verify the column names
print(final_df.columns)
# Item IDs to extract (the original author labels this subset "heart data").
search_itemids = ['001C_1187_20765', '001C_1187_20770', '001C_1187_20775']
if 'ITEMID' in final_df.columns:
    # ITEMID is cast to str so every value can be searched, then matched
    # case-insensitively against any of the wanted IDs (regex alternation).
    itemid_pattern = '|'.join(search_itemids)
    itemid_text = final_df['ITEMID'].astype(str)
    is_wanted = itemid_text.str.contains(itemid_pattern, case=False, na=False)
    # .copy() detaches the subset so final_df can be released below.
    sce = final_df[is_wanted].copy()
    print(sce.head())
else:
    print("Column 'ITEMID' not found in the DataFrame")
    # Keep `sce` bound (empty, same columns) so later code does not hit a
    # NameError after this message has been printed.
    sce = final_df.iloc[0:0].copy()
# Free up memory by deleting the original DataFrame
del final_df
########## samples #######
# Example console output from a run of the script above, kept for reference.
# It is commented out so that the file remains valid Python.
#
# Index(['CHARTEVENT_ID', 'SUBJECT_ID', 'HADM_ID', 'STAY_ID', 'CHARTTIME',
# 'STORETIME', 'ITEMID', 'VALUE', 'VALUENUM', 'VALUEUOM', 'WARNING',
# 'HOSPITAL_ID'],
# dtype='object')
# CHARTEVENT_ID SUBJECT_ID HADM_ID STAY_ID \
# 1733 11001244676470 ******38639 001109000386394405A09C6048648A 164439.0
# 1734 11001244676472 ******38639 001109000386394405A09C6048648A 164439.0
# 1735 11001244676471 ******38639 001109000386394405A09C6048648A 164439.0
# 1736 11001244681645 ******38639 001109000386394405A09C6048648A 164439.0
# 1737 11001244681647 ******38639 001109000386394405A09C6048648A 164439.0
# CHARTTIME STORETIME ITEMID VALUE \
# 1733 2411-05-23T00:00:00 2411-05-23T00:14:50 001C_1187_20765 4
# 1734 2411-05-23T00:00:00 2411-05-23T00:14:50 001C_1187_20770 5
# 1735 2411-05-23T00:00:00 2411-05-23T00:14:50 001C_1187_20775 E
# 1736 2411-05-23T01:00:00 2411-05-23T00:51:35 001C_1187_20765 4
# 1737 2411-05-23T01:00:00 2411-05-23T00:51:35 001C_1187_20770 5
# VALUENUM VALUEUOM WARNING HOSPITAL_ID
# 1733 4.0 NaN 0 001
# 1734 5.0 NaN 0 001
# 1735 0.0 NaN 0 001
# 1736 4.0 NaN 0 001
# 1737 5.0 NaN 0 001