# dataframe_processing.py
import numpy as np
import pandas as pd
def derive_imgType(dataframe):
    """
    Extracts the final image type and laterality for each row from the DICOM metadata
    (SeriesDescription and ImageLaterality), adjusting for the different naming convention used by GE scanners.
    (updated: 7/30/2021)
    Sample Usage:
        df_new = derive_imgType(df)
    """
    # initializing empty lists:
    DeriveFlag = []             # LateralityDeriveFlag: 0 (not derived), 1 (derived), 2 (still needs to be derived)
    ImageLateralityFinal = []   # the image laterality, either copied from the tag or taken from the series description
    FinalImageType = []         # final image type: 2D, 3D, cview, ROI_SSC, ROI_SS, other (none of the above)
    for index, row in dataframe.iterrows():  # iterating over all rows
        try:
            if type(row['SeriesDescription']) == str:
                if ' ' in row['SeriesDescription']:
                    desc = row['SeriesDescription'].split(' ')  # splitting up the series description for checking
                else:
                    desc = row['SeriesDescription'].split('_')  # GE tags use underscores
            else:
                desc = str(row['SeriesDescription'])  # filter for nan
        except Exception:
            DeriveFlag.append(2)                 # update derive flag = 2
            ImageLateralityFinal.append(np.nan)  # update final ImageLaterality
            FinalImageType.append('other')       # update image type
            continue
        if row['ImageLaterality'] in ['L', 'R']:  # if laterality exists, it's either 2D, cview, or ROI
            if ('C-View' in desc) or ('CV' in desc):  # if it has C-View or CV, it's cview
                DeriveFlag.append(0)  # update derive flag = 0
                ImageLateralityFinal.append(row['ImageLaterality'])  # update final ImageLaterality
                FinalImageType.append('cview')  # update image type
            elif ('Tomosynthesis' in desc) or ('w/Tomosynthesis' in desc):  # make sure all tomo is accounted for
                DeriveFlag.append(1)  # update derive flag = 1
                ImageLateralityFinal.append(row['ImageLaterality'])  # update final ImageLaterality
                FinalImageType.append('3D')  # update image type
            else:  # if it has laterality and is not cview, it's 2D
                DeriveFlag.append(0)  # update derive flag = 0
                ImageLateralityFinal.append(row['ImageLaterality'])  # update final ImageLaterality
                FinalImageType.append('2D')  # update image type
        else:  # if laterality doesn't exist, it's either 3D or ROI
            if ('Tomosynthesis' in desc) or ('w/Tomosynthesis' in desc):  # if it has Tomosynthesis Reconstruction, it's 3D
                DeriveFlag.append(1)  # update derive flag = 1
                ImageLateralityFinal.append(desc[0])  # update final ImageLaterality
                FinalImageType.append('3D')  # update image type
            elif 'ROUTINE3D' in desc:  # additional flag for GE
                DeriveFlag.append(1)  # update derive flag = 1
                ImageLateralityFinal.append(desc[2][0])  # update final ImageLaterality
                FinalImageType.append('3D')  # update image type
            elif 'C-View' in desc:  # additional flag for GE
                DeriveFlag.append(1)  # update derive flag = 1
                ImageLateralityFinal.append(desc[0])  # update final ImageLaterality
                FinalImageType.append('cview')  # update image type
            elif 'Capture' in desc:  # if it has Secondary Capture, it's a SecurView Secondary Capture ROI
                DeriveFlag.append(2)  # update derive flag = 2
                ImageLateralityFinal.append(np.nan)  # update final ImageLaterality
                FinalImageType.append('ROI_SSC')  # update image type
            elif ('Screen' in desc) or ('SC' in desc):  # Screen Save ROI
                DeriveFlag.append(2)  # update derive flag = 2
                ImageLateralityFinal.append(np.nan)  # update final ImageLaterality
                FinalImageType.append('ROI_SS')  # update image type
            else:
                DeriveFlag.append(2)  # update derive flag = 2
                ImageLateralityFinal.append(np.nan)  # update final ImageLaterality
                FinalImageType.append('other')  # update image type
    dataframe['LateralityDeriveFlag'] = DeriveFlag
    dataframe['ImageLateralityFinal'] = ImageLateralityFinal
    dataframe['FinalImageType'] = FinalImageType
    return dataframe
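# Minimal usage sketch for derive_imgType (illustrative only: the SeriesDescription and
# ImageLaterality values below are made-up examples, not taken from real PACS metadata).
# It shows the input columns the function expects and the three columns it appends.
#
#   example = pd.DataFrame({
#       'SeriesDescription': ['L CC C-View', 'R MLO Tomosynthesis Reconstruction', np.nan],
#       'ImageLaterality': ['L', np.nan, np.nan],
#   })
#   example = derive_imgType(example)
#   print(example[['LateralityDeriveFlag', 'ImageLateralityFinal', 'FinalImageType']])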
def get_spotmag_flags(df):
    """
    Extracts spotmag/special view flags from the view modifier code meaning tag.
    (GE stores the spotmag metadata the same way as the other vendors, so no special handling is needed.)
    Sample Usage:
        df_new = get_spotmag_flags(df)
    """
    spot_mags = []
    for i in range(len(df)):
        if df['0_ViewCodeSequence_0_ViewModifierCodeSequence_CodeMeaning'].iloc[i] in ['Spot Compression', 'Magnification']:
            spot_mags.append(1)
        else:
            spot_mags.append(np.nan)
    df['spot_mag'] = spot_mags
    return df
def correct_paths_PACS_hardcode(dataframe):
    """
    Hardcoded path correction (/opt/ssd-data/ to /mnt/PACS_NAS1/) and path splitting (png_path into path and filename).
    Sample Usage:
        df_new = correct_paths_PACS_hardcode(df)
    """
    correct_png_path = []
    for i in dataframe.png_path.to_list():
        path = i.strip()  # remove any whitespace
        path = path.replace('//', '/')  # collapse any double //
        path = path.replace('/home', '')  # remove /home
        path = path.strip()  # remove any whitespace
        path = path.replace('/opt/ssd-data/', '/mnt/PACS_NAS1/')  # swap the /opt/ssd-data/ root
        #path = path.replace('png_kheiron/cohort_1', 'png_kheiron/kheiron_code/cohort_1')
        correct_png_path.append(path)
    dataframe['png_path'] = correct_png_path
    path = []
    filename = []
    for i in dataframe.png_path.to_list():
        # the kheiron cohort csv has odd paths; correct them a bit to match the hiti server
        path.append('/'.join(i.split('/')[:-1]))
        filename.append(i.split('/')[-1])
    dataframe['path'] = path
    dataframe['filename'] = filename
    return dataframe
def correct_root_paths(dataframe, root_a, root_b, verbose=False):
    """
    Generalized path correction: the input root and output root are passed as arguments and the paths
    are corrected accordingly. Path splitting (png_path into path and filename) uses the same implementation.
    Sample Usage:
        df_new = correct_root_paths(df, '/opt/ssd-data/', '/mnt/PACS_NAS1/')
    """
    if verbose:
        print('replacing {} with {}'.format(root_a, root_b))
    correct_png_path = []
    for i in dataframe.png_path.to_list():
        path = i.strip()  # remove any whitespace
        path = path.replace('//', '/')  # collapse any double //
        path = path.replace('/home', '')  # remove /home
        path = path.strip()  # remove any whitespace
        path = path.replace(root_a, root_b)  # swap the old root for the new one
        correct_png_path.append(path)
    dataframe['corrected_png_path'] = correct_png_path
    path = []
    filename = []
    for i in dataframe.corrected_png_path.to_list():
        # the kheiron cohort csv has odd paths; correct them a bit to match the hiti server
        path.append('/'.join(i.split('/')[:-1]))
        filename.append(i.split('/')[-1])
    dataframe['folder_path'] = path
    dataframe['filename'] = filename
    return dataframe
def match_roi(main_df, roi_df):
    """
    Takes the main dataframe and the ROI dataframe and merges the matching ROIs onto the correct rows of the main dataframe.
    *need to update this to also merge the screen save ROIs
    """
    # gets the string path of the matching mammogram (matching_path is stored as a stringified list)
    roi_df['matching_path'] = roi_df['matching_path'].apply(lambda x: eval(x)[0] if eval(x) else "")
    mammo_roi = []
    # go through the main df and check the ROI df for any matching ROIs
    for i in range(len(main_df)):
        roi = roi_df['ROI_coord'][roi_df['matching_path'] == main_df['png_path'].iloc[i]]
        if roi.empty:
            mammo_roi.append([])
        else:
            mammo_roi.append([eval(t)[0] for t in roi])  # converts each ROI string into ROI coordinates
    main_df['ROI_coord'] = mammo_roi
    return main_df
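# Usage sketch for match_roi (an assumption about the expected inputs, not a test from the
# original project): roi_df should carry 'matching_path' and 'ROI_coord' columns stored as
# stringified lists (as produced when saving to csv), and main_df should carry 'png_path'.
#
#   main_df = pd.read_csv('/path/to/main_metadata.csv', low_memory=False)  # hypothetical path
#   roi_df = pd.read_csv('/path/to/roi_coords.csv', dtype='str')           # hypothetical path
#   main_df = match_roi(main_df, roi_df)  # adds an 'ROI_coord' column of coordinate lists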
def load_simple_df(df_path, df_type=None, verbose=False):
    """
    Loads a "simple" dataframe with just the fields necessary for ROI extraction, ROI coordinate merging,
    or image laterality/spotmag identification.
    Sample Usage:
        df = load_simple_df('/../../example_metadata.csv') OR load_simple_df('/../../example_metadata.csv', 'ROI_extract')
    *doublecheck and finetune this code
    """
    if df_type == 'ROI_merge':
        if verbose:
            print('Extracting a simple dataframe with just [png_path, Matching_Mammo, ROI_coord] columns to merge to metadata')
        df = pd.read_csv(df_path, dtype='str', low_memory=False)
        df = df[['png_path', 'Matching_Mammo', 'ROI_coord']]
        df['ROI_coord'] = df['ROI_coord'].apply(lambda x: eval(x)[0] if eval(x) else [])
    elif df_type == 'ROI_extract':
        if verbose:
            print('Extracting a simple dataframe with just [png_path, FinalImageType, filename, folder_path] columns to extract ROIs from')
        df = pd.read_csv(df_path, dtype='str', low_memory=False)
        df = df[['png_path', 'FinalImageType', 'filename', 'folder_path']]
    else:
        if verbose:
            print('Extracting a simple dataframe with just [SeriesDescription, ImageLaterality, 0_ViewCodeSequence_0_ViewModifierCodeSequence_CodeMeaning, filename, folder_path] columns for laterality/spotmag identification')
        df = pd.read_csv(df_path, dtype='str', low_memory=False)
        df = df[['SeriesDescription', 'ImageLaterality', '0_ViewCodeSequence_0_ViewModifierCodeSequence_CodeMeaning', 'filename', 'folder_path']]
    return df
def replace_old_png_path(df, verbose=False):
    """
    Drops the old png_path column (with '/opt/ssd-data/') and renames the 'corrected_png_path' column to 'png_path'.
    Both changes are made in place.
    """
    df.drop(columns=['png_path'], inplace=True)
    df.rename(columns={'corrected_png_path': 'png_path'}, inplace=True)
    if verbose:
        print('dropped old png_path, renamed corrected_png_path to png_path')
    return df
def make_screensave_dict(df):
    """
    Extracts the filename for each screen save image and builds a dictionary of {screensave_filename: folder_path}.
    """
    ssc_dict = {}
    df_ssc_fn = df[df.FinalImageType == 'ROI_SSC']['filename'].to_list()
    df_ssc_path = df[df.FinalImageType == 'ROI_SSC']['folder_path'].to_list()
    for i in range(len(df_ssc_fn)):
        ssc_dict[df_ssc_fn[i]] = df_ssc_path[i]
    return ssc_dict
def read_df(path_to_csv):
    """
    Reads the csv and loads the dataframe so that the ROI coordinates are parsed into lists rather than strings.
    """
    df = pd.read_csv(path_to_csv, low_memory=False)
    df['ROI_coord'] = df['ROI_coord'].apply(lambda x: eval(x)[0] if eval(x) else [])
    return df
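# A rough end-to-end sketch of how these helpers appear intended to chain together.
# The csv path and roots below are placeholders, not paths from the original project; the csv
# is assumed to be a full metadata export containing png_path, SeriesDescription,
# ImageLaterality, and 0_ViewCodeSequence_0_ViewModifierCodeSequence_CodeMeaning.
if __name__ == '__main__':
    meta_df = pd.read_csv('/path/to/example_metadata.csv', dtype='str', low_memory=False)
    # normalize the png paths and split them into folder_path / filename
    meta_df = correct_root_paths(meta_df, '/opt/ssd-data/', '/mnt/PACS_NAS1/', verbose=True)
    replace_old_png_path(meta_df, verbose=True)  # in place: drops old png_path, renames corrected_png_path
    # derive final image type / laterality and flag spot compression / magnification views
    meta_df = derive_imgType(meta_df)
    meta_df = get_spotmag_flags(meta_df)
    print(meta_df[['FinalImageType', 'ImageLateralityFinal', 'spot_mag']].head())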