-
Notifications
You must be signed in to change notification settings - Fork 0
/
examine_data.py
executable file
·65 lines (52 loc) · 2.32 KB
/
examine_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import re
import pandas as pd
def find_in_reports(path, search_strs, find='all', text_col_name='REPORT', max_find=-1, window_size=10,
                    sep='|', id_col_name='ANON_ID'):
    """Print context snippets for every report whose text matches the given patterns.

    The file at ``path`` is loaded with pandas and each row's text column is
    searched with every pattern in ``search_strs``. For each matching row the
    row index, the value of ``id_col_name`` and a tab-separated list of text
    windows around the first hit of each matching pattern are printed. A
    summary line with the number of matching rows is printed at the end.

    Args:
        path: Path of the delimited text file to read.
        search_strs: Iterable of regular expressions (strings or compiled
            patterns). Only the first occurrence of each pattern is used.
        find: ``'all'`` to require every pattern to match a report, ``'any'``
            to require at least one.
        text_col_name: Name of the column holding the report text.
        max_find: Stop after this many matching reports; a negative value
            means no limit.
        window_size: Number of characters of context shown on each side of
            the start of a match.
        sep: Column delimiter passed to ``pandas.read_csv``.
        id_col_name: Name of the column printed next to the row index.

    Raises:
        ValueError: If ``find`` is neither ``'all'`` nor ``'any'``.
    """
    if find not in ('all', 'any'):
        # Previously an unknown mode silently reported zero matches.
        raise ValueError("find must be 'all' or 'any', got {!r}".format(find))

    # Compile once instead of re-resolving every pattern for every row.
    patterns = [re.compile(search_str) for search_str in search_strs]

    df = pd.read_csv(path, sep=sep)
    num_found = 0
    for index, row in df.iterrows():
        report = row[text_col_name]
        if 0 <= max_find <= num_found:
            break
        if not isinstance(report, str):
            # Empty cells are read by pandas as float NaN, which re cannot
            # search; treat any non-string cell as empty text.
            report = ""

        # Start offset of the first hit of each pattern, or -1 when absent.
        positions = []
        for pattern in patterns:
            match = pattern.search(report)
            positions.append(-1 if match is None else match.start())
        hits = [pos for pos in positions if pos >= 0]

        if find == 'all':
            matched = len(hits) == len(positions)
        else:
            matched = bool(hits)
        if not matched:
            continue

        output = "".join(
            report[max(pos - window_size, 0):min(pos + window_size, len(report))] + '\t'
            for pos in hits
        )
        num_found += 1
        print(index, row[id_col_name], output, "\n")
    print("Num matching reports found: {}/{}".format(num_found, len(df)))
def get_class_counts(path, label_values=None, label_col_name='label', sep='|'):
    """Print how many rows of a delimited file carry each class label.

    The distinct values found in the label column are printed first, followed
    by one ``Class <label>: <count>/<rows> (<percent>%)`` line per label.

    Args:
        path: Path of the delimited text file to read.
        label_values: Labels to count, in the order they should be reported.
            When omitted or empty, every distinct value in the label column
            is counted. Rows whose label is not listed are ignored.
        label_col_name: Name of the column holding the class label.
        sep: Column delimiter passed to ``pandas.read_csv`` (e.g. ``'|'`` or
            ``','``, depending on the file).
    """
    df = pd.read_csv(path, sep=sep)
    labels = df[label_col_name]
    observed = labels.unique()
    print(observed)

    if not label_values:
        label_values = observed
    counts = dict.fromkeys(label_values, 0)
    for label in labels:
        if label in counts:
            counts[label] += 1

    total = len(df)
    for label, count in counts.items():
        # A file with no data rows would otherwise divide by zero.
        percent = 100.0 * (count / total) if total else 0.0
        print("Class {}: {}/{} ({}%)".format(label, count, total, percent))
if __name__ == "__main__":
    # Script entry point: report the class balance of the labelled reports file.
    # Earlier ad-hoc report searches, kept for reference:
    # find_in_reports('../haruka_pathology_reports_111618.csv', [' yp', '(?<![C])T0', 'N0'])
    # find_in_reports('../haruka_pathology_reports_111618.csv', [' yp'])
    # find_in_reports('../haruka_radiology_reports_111618.csv', ['T0', 'N0'], text_col_name='NOTE')
    labeled_reports_csv = '../new_labeled_reports_full_preprocessed.csv'
    get_class_counts(labeled_reports_csv, label_col_name='label')