-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_labevents_count.py
130 lines (103 loc) · 4.42 KB
/
check_labevents_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import pandas as pd
# Paths to the LABEVENTS CSV files. Every path has the form
# <root>/<hospital_id>/<subdir>/LABEVENTS.csv, so the list is generated from
# the shared root and the sub-directories listed for each hospital.
_EMR_ROOT = '/mnt/dataset/dataset-2064568781941768192/K-MIMIC/EMR'
_SUBDIRS_BY_HOSPITAL = {
    '001': ('433', '434', '435'),
    '002': ('433', '434', '435'),
    '006': ('433', '434'),  # only two sub-directories are listed for 006
    '008': ('433', '434', '435'),
    '010': ('433', '434', '435'),
}
file_paths = [
    f'{_EMR_ROOT}/{hospital}/{subdir}/LABEVENTS.csv'
    for hospital, subdirs in _SUBDIRS_BY_HOSPITAL.items()
    for subdir in subdirs
]
# Lift every pandas display limit so printed frames and series are shown in
# full instead of being truncated or wrapped.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = None
# Maximum number of rows to read from each file
max_rows = 800000

# Local item codes to look up, one code per line
raw_items = """
010L9307
010L3007
010L1102
010L8000A
008L30511
001L3092
00201L3095
00201L8186
"""


def load_labevents(file_path, nrows=max_rows):
    """Load the leading rows of one LABEVENTS CSV file.

    Args:
        file_path: '/'-separated path of the form
            ``.../<hospital_id>/<subdir>/LABEVENTS.csv``.
        nrows: Maximum number of data rows to read from the file.

    Returns:
        A DataFrame holding the file's columns plus a ``HOSPITAL_ID`` column
        filled with the hospital ID taken from the path.
    """
    # ITEMID is forced to text: the codes carry leading zeros, and letting
    # pandas infer the type would turn all-digit codes into integers and
    # break the exact string match done later.
    # ``nrows`` caps the read directly, so no manual chunk bookkeeping (and no
    # half-consumed reader left open) is needed.
    labevents = pd.read_csv(file_path, nrows=nrows, dtype={'ITEMID': str})
    # The hospital ID is the directory two levels above the CSV file; counting
    # from the end keeps this independent of where the dataset is mounted.
    labevents['HOSPITAL_ID'] = file_path.split('/')[-3]
    return labevents


def count_itemids(df, item_ids):
    """Count the rows of *df* whose ITEMID exactly equals one of *item_ids*.

    Args:
        df: DataFrame with an ``ITEMID`` column.
        item_ids: Collection of item codes (strings) to look for.

    Returns:
        A Series indexed by ITEMID with the number of matching rows, most
        frequent first. Codes that never occur are absent from the result.

    Raises:
        KeyError: If *df* has no ``ITEMID`` column.
    """
    # Compare as strings so the match does not depend on the column's dtype.
    codes = df['ITEMID'].astype(str)
    return codes[codes.isin(item_ids)].value_counts()


def main():
    """Load every file in ``file_paths`` and print the counts of the searched codes."""
    # One DataFrame per file, each capped at ``max_rows`` rows
    frames = [load_labevents(path) for path in file_paths]
    # Combine all DataFrames into a single DataFrame
    final_df = pd.concat(frames, ignore_index=True)
    # Print the columns to verify the column names
    print(final_df.columns)

    # Convert the multi-line string to a list of codes
    search_itemids = raw_items.split()
    if 'ITEMID' in final_df.columns:
        # Print the counts for each searched ITEMID that was found
        print(count_itemids(final_df, search_itemids))
    else:
        print("Column 'ITEMID' not found in the DataFrame")


if __name__ == "__main__":
    main()
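# ---------------------------------------------------------------------------
# Output printed by the script: the column index of the combined DataFrame,
# followed by the per-ITEMID counts.
#
# Index(['LABEVENT_ID', 'SUBJECT_ID', 'HADM_ID', 'SPECIMEN_ID', 'ITEMID',
#        'CHARTTIME', 'STORETIME', 'VALUE', 'VALUENUM', 'VALUEUOM',
#        'REF_RANGE_LOWER', 'REF_RANGE_UPPER', 'FLAG', 'COMMENTS', 'STAY_ID',
#        'HOSPITAL_ID'],
#       dtype='object')
# ITEMID
# 001L3092      18799
# 010L8000A      4358
# 010L3007       2111
# 00201L3095       15
# Name: count, dtype: int64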