-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvenience_functions.py
442 lines (397 loc) · 21.6 KB
/
convenience_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
"""Docstring for scratch.convenience_functions.py
This module contains convenience functions for plotting and analysing data from the blood supply chain model/simulation.
"""
import os
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns
def objectives_stats(file_path) -> list[tuple[float]]:
"""Calculate the mean and standard error of the mean for each objective (column).
:param str file_path: The path to the objectives TSV file.
:return list: A list of tuples where each tuple contains the mean and standard error of the mean for a column.
"""
output_df = pd.read_csv(file_path, sep='\t')
columns = list(output_df.columns)
return_value = []
for column in columns:
return_value.append((output_df[column].mean(),
output_df[column].std() / np.sqrt(len(output_df))))
return return_value
def objectives_stats_table(files: dict, separate_vals_errs=False) -> tuple[pd.DataFrame, tuple[pd.DataFrame, pd.DataFrame]] | pd.DataFrame:
"""
Generates a table of objective statistics from a dictionary of file paths.
Args:
files (dict): A dictionary where keys are identifiers and values are file paths.
separate_vals_errs (bool, optional): If True, returns separate DataFrames for values and errors.
Defaults to False.
Returns:
tuple[DataFrame, tuple[DataFrame, DataFrame]] | DataFrame:
If separate_vals_errs is False, returns a single DataFrame with formatted statistics.
If separate_vals_errs is True, returns a tuple containing:
- A DataFrame with formatted statistics.
- A tuple of two DataFrames: one for values and one for errors.
"""
data_dict = dict()
val_dict = dict()
err_dict = dict()
for key, value in files.items():
if value.endswith('_output.tsv'):
value = value.replace('_output.tsv', '_objectives.tsv')
value = os.path.realpath(os.path.expanduser(value))
objstats = objectives_stats(value)
objstats_str = [f'{x[0]:.3f} ± {x[1]:.4f}' for x in objstats]
data_dict[key] = objstats_str
objstats_arr = np.array(objstats)
val_dict[key] = objstats_arr[:, 0]
err_dict[key] = objstats_arr[:, 1]
df = pd.DataFrame(data_dict, index=['alloimmunisations', 'scd_shortages',
'expiries', 'all_shortages', 'O_neg_level', 'O_pos_level', 'O_level',
'D_subs_num_patients', 'ABO_subs_num_patients', 'ABOD_subs_num_patients'])
if separate_vals_errs:
df_val = pd.DataFrame(val_dict, index=['alloimmunisations', 'scd_shortages',
'expiries', 'all_shortages', 'O_neg_level', 'O_pos_level', 'O_level',
'D_subs_num_patients', 'ABO_subs_num_patients', 'ABOD_subs_num_patients'])
df_err = pd.DataFrame(err_dict, index=['alloimmunisations', 'scd_shortages',
'expiries', 'all_shortages', 'O_neg_level', 'O_pos_level', 'O_level',
'D_subs_num_patients', 'ABO_subs_num_patients', 'ABOD_subs_num_patients'])
return df, (df_val, df_err)
return df
def read_and_average_comp_times(files: dict) -> pd.DataFrame:
"""
Reads computation time files, calculates the mean and standard deviation, and returns a DataFrame.
This function processes a dictionary of file paths, reads the computation times from each file,
calculates the mean and standard deviation for each set of times, and stores the results in a
pandas DataFrame.
Args:
files (dict): A dictionary where keys are identifiers and values are file paths to the computation
time files. The file paths should end with '_output.tsv', which will be replaced
with '_computation_times.tsv' to locate the actual computation time files.
Returns:
DataFrame: A DataFrame with the mean and standard deviation of computation times for each file.
The DataFrame has one row labeled 'computation_time' and columns corresponding to
the keys in the input dictionary. The values are formatted as 'mean ± std_dev'.
"""
data_dict = dict()
for key, value in files.items():
if value.endswith('_output.tsv'):
value = value.replace('_output.tsv', '_computation_times.tsv')
value = os.path.realpath(os.path.expanduser(value))
times = np.loadtxt(value, delimiter='\t')
mean = times.mean()
std_dev = times.std()
data_dict[key] = f'{mean:.0f} ± {std_dev:.1f}'
df = pd.DataFrame(data_dict, index=['computation_time'])
return df
def exp2_figs(abod, limited, extended, prob_neg_phen=1, figsize=None, labels=['ABOD', 'Limited', 'Extended'],
colours=['C0', 'C1', 'C2'], ylabels=None, pdfs=None, skip_plots=False):
"""
Create a series of bar graphs for the expected number of mismatches, substitutions, and alloimmunisations
for the three matching rules.
Args:
abod (str): Path to the ABOD (or the first) file.
limited (str): Path to the Limited (or the second) file.
extended (str): Path to the Extended (or the third) file.
prob_neg_phen (float, optional): Deprecated, fixed to 1. Defaults to 1.
figsize (tuple, optional): Size of the figure. Defaults to None.
labels (list, optional): Labels for the three matching rules.
Defaults to ['ABOD', 'Limited', 'Extended'].
colours (list, optional): Colours for the three matching rules.
Defaults to ['C0', 'C1', 'C2'].
ylabels (list, optional): Labels for the y-axis. Defaults to None.
pdfs (list, optional): Paths to save the figures as PDFs. Defaults to None.
skip_plots (list, optional): List of booleans to skip plots. Defaults to False.
Returns:
None
"""
abod_df = pd.read_csv(abod, sep='\t', index_col=0)
limited_df = pd.read_csv(limited, sep='\t', index_col=0)
extended_df = pd.read_csv(extended, sep='\t', index_col=0)
labels_mismatch = ["A", "B", "D", "C", "c", "E", "e", "K",
"k", "Fya", "Fyb", "Jka", "Jkb", "M", "N", "S", "s"]
labels_allo = labels_mismatch[3:]
x1 = np.arange(len(labels_mismatch))
x2 = np.arange(len(labels_allo))
width = 0.2
x = [x2, x2, x2]
met = ['mismatch', 'subs', 'allo']
ylabel = ['Number of mismatches', 'Expected number of substitutions',
'Expected number of alloimmunisations']
ylabel = ylabels if ylabels is not None else ylabel
title = ['Mismatches in antigen-negative patients by antigen and matching rule',
'Substitutions per unit transfused by antigen and matching rule',
'Alloimmunisations by antigen and matching rule']
xticks = [labels_allo, labels_allo, labels_allo]
cols = xticks
for i in range(len(x)):
if skip_plots is True or skip_plots[i]:
continue
if met[i] == 'subs':
_neg_phen = 1
else:
_neg_phen = prob_neg_phen
fig, ax = plt.subplots(figsize=figsize, dpi=200)
ax.bar(x[i] - width, abod_df.loc[met[i] + '_avg', cols[i]].to_numpy() / _neg_phen, width, label=labels[0],
yerr=abod_df.loc[met[i] + '_stderr', cols[i]].to_numpy() / _neg_phen, color=colours[0])
ax.bar(x[i], limited_df.loc[met[i] + '_avg', cols[i]].to_numpy() / _neg_phen, width, label=labels[1],
yerr=limited_df.loc[met[i] + '_stderr', cols[i]].to_numpy() / _neg_phen, color=colours[1])
ax.bar(x[i] + width, extended_df.loc[met[i] + '_avg', cols[i]].to_numpy() / _neg_phen, width, label=labels[2],
yerr=extended_df.loc[met[i] + '_stderr', cols[i]].to_numpy() / _neg_phen, color=colours[2])
ax.set_ylabel(ylabel[i])
ax.set_xticks(x[i])
ax.set_xticklabels(xticks[i])
if met[i] == 'subs':
ax.legend()
else:
ax.legend()
fig.tight_layout()
plt.show()
if pdfs is not None and pdfs[i] is not None:
fig.savefig(pdfs[i], bbox_inches='tight')
plt.close(fig)
def stacked_stock_levels_graph(datafilename, columns, labels=None, bbox_to_anchor=(1.01, 0.5),
xlabel='Time (days)', ylabel='Level of total stock', figsize=(8, 6), dpi=200,
demarcate_warmup=False, warmup_color='black', raw_data=False,
warmup_period=7*6*3):
"""
Plots a stacked area graph of stock levels over time from a given data file.
Parameters:
datafilename (str): Path to the data file.
If it ends with '_output.tsv', it will be replaced with '_stocks.tsv'.
columns (list): List of column names to be plotted.
labels (list, optional): List of labels for the columns. Defaults to None.
bbox_to_anchor (tuple, optional): Position of the legend. Defaults to (1.01, 0.5).
xlabel (str, optional): Label for the x-axis. Defaults to 'Time (days)'.
ylabel (str, optional): Label for the y-axis. Defaults to 'Level of total stock'.
figsize (tuple, optional): Size of the figure. Defaults to (8, 6).
dpi (int, optional): Dots per inch for the figure. Defaults to 200.
demarcate_warmup (bool, optional): Whether to draw a line to demarcate the warmup period. Defaults to False.
warmup_color (str, optional): Color of the warmup line. Defaults to 'black'.
raw_data (bool, optional): Whether to use raw data for plotting instead of percentages. Defaults to False.
warmup_period (int, optional): Length of the warmup period in days. Defaults to 7×6×3 (i.e., 18 weeks).
Returns:
None: The function creates and displays a plot.
"""
if datafilename.endswith('_output.tsv'):
datafilename = datafilename.replace('_output.tsv', '_stocks.tsv')
df = pd.read_csv(os.path.realpath(
os.path.expanduser(datafilename)), sep='\t')
if raw_data:
df = df[columns] * df['total'].values[:, None]
else:
df = df[columns]
if labels is not None:
df.columns = labels
fig, ax = plt.subplots(dpi=dpi, figsize=figsize)
ax.stackplot(np.array(df.index) + 1, df.T.to_numpy(),
labels=df.columns, alpha=0.75)
ax.legend(loc='center left', bbox_to_anchor=bbox_to_anchor)
ax.set_ylabel(ylabel)
ax.set_xlabel(xlabel)
if demarcate_warmup:
draw_warmup_line(ax, color=warmup_color, x=warmup_period+0.5)
if not raw_data:
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
fig.tight_layout()
def plot_age_dist_blood(datafilename, array='age_dist_given_to_scd', age_range=None,
days_range=None, title='Age distribution of blood given to SCD patients over time',
vmax=300, cmap='viridis', figsize=(15, 6), dpi=200, re_index=False,
print_max=False, every_n_xtick=2, plot_avg=False, print_mean=False,
pdf=None) -> pd.DataFrame | None:
"""
Plots the age distribution of blood given to SCD (Sickle Cell Disease) patients over time.
Parameters:
datafilename (str): Path to the data file.
If it ends with '_output.tsv', it will be replaced with '_age_distributions.npz'.
array (str): Key to access the specific age distribution data within the .npz file.
Determines which graph/data to plot.
Access the `.files` attribute of the loaded .npz file to see available keys.
Default is 'age_dist_given_to_scd'.
age_range (array-like, optional): Range of ages to include in the plot.
Default is None, which uses ages 1 to 14.
days_range (array-like, optional): Range of days to include in the plot.
Default is None, which uses days 1 to 210.
title (str): Title of the plot. Default is 'Age distribution of blood given to SCD patients over time'.
vmax (int): Maximum value for the color scale. Default is 300.
cmap (str): Colormap to use for the heatmap. Default is 'viridis'.
figsize (tuple): Size of the figure. Default is (15, 6).
dpi (int): Dots per inch for the figure. Default is 200.
re_index (bool): Whether to re-index the days range. Default is False.
print_max (bool): Whether to print the maximum values in the DataFrame. Default is False.
every_n_xtick (int): Interval for showing x-tick labels. Default is 2.
plot_avg (bool): Whether to plot the average age distribution. Default is False.
print_mean (bool): Whether to print the mean values when plotting the average. Default is False.
pdf (str, optional): Path to save the plot as a PDF. Default is None.
Returns:
out (DataFrame, None): DataFrame of the age distribution if plot_avg is True, otherwise None.
"""
if datafilename.endswith('_output.tsv'):
datafilename = datafilename.replace(
'_output.tsv', '_age_distributions.npz')
data = np.load(os.path.realpath(os.path.expanduser(datafilename)))
age_dist = data[array]
# Mask the day 0 and day 1 data instead of clipping it out
age_range = age_range if age_range is not None else np.arange(1, 15)
days_range = days_range if days_range is not None else np.arange(1, 211)
days_labels = days_range if not re_index else np.arange(
len(days_range)) + 1
arr = age_dist[days_range-1, :]
arr = arr[:, age_range]
sba_df = pd.DataFrame(arr.T, index=age_range, columns=days_labels)
if plot_avg:
plot_avg_age_dist_blood_scd(sba_df, title=title, ylim_max=vmax,
figsize=figsize, dpi=dpi, pdf=pdf, print_mean=print_mean)
return sba_df
# Mask the zero values instead of making them NaNs
fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
sns.heatmap(sba_df.replace(0, np.nan), cmap=cmap, ax=ax,
cbar_kws={'label': 'Number of units'}, vmax=vmax)
ax.set_ylabel('Age (days)')
ax.set_xlabel('Time (days)')
ax.invert_yaxis()
ax.set_title(title)
ax.grid()
# Only show every other xtick label
for i, label in enumerate(ax.xaxis.get_ticklabels()):
if i % every_n_xtick != 0:
label.set_visible(False)
fig.tight_layout()
fig.show()
data.close()
if print_max:
print(sba_df.max(None, skipna=True))
if pdf is not None:
fig.savefig(pdf, bbox_inches='tight')
def plot_avg_age_dist_blood_scd(df, title='Mean age distribution of blood given to SCD patients',
ylim_max=None, figsize=(8, 6), dpi=200, pdf=None, print_mean=False):
"""
Plots the average age distribution of blood given to Sickle Cell Disease (SCD) patients.
Parameters:
df (pd.DataFrame): DataFrame containing the age distribution data.
title (str, optional): Title of the plot. Default is 'Mean age distribution of blood given to SCD patients'.
ylim_max (float, optional): Maximum limit for the y-axis. Default is None.
figsize (tuple, optional): Size of the figure. Default is (8, 6).
dpi (int, optional): Dots per inch (DPI) for the figure. Default is 200.
pdf (str, optional): File path to save the plot as a PDF. Default is None.
print_mean (bool, optional): If True, prints the mean age of the blood units. Default is False.
Returns:
None
"""
fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
df_hist = df.mean(axis=1)
i_min = df_hist.index.min()
i_max = df_hist.index.max()
ax.stairs(df_hist.to_numpy().flatten(), np.arange(
i_min, i_max+2) - 0.5, fill=True, color='gray', alpha=1)
ax.grid()
ax.set_xlabel('Age (days)')
ax.set_ylabel('Number of units')
ax.set_xlim(0, None)
ax.set_ylim(0, ylim_max)
ax.set_title(title)
fig.tight_layout()
if pdf is not None:
fig.savefig(pdf, bbox_inches='tight')
if print_mean:
print(np.average(df_hist.index, weights=df_hist))
def ccdf_avg_age_dist_blood_scd(datafilename, array='age_dist_given_to_scd', age_range=None, days_range=None,
above_age=0, re_index=False) -> float:
"""
Calculate the complementary cumulative distribution function (CCDF) of the average age distribution for blood given to SCD patients.
Parameters:
datafilename (str): The path to the data file.
If it ends with '_output.tsv', it will be replaced with '_age_distributions.npz'.
array (str): The key for the age distribution array in the .npz file. Default is 'age_dist_given_to_scd'.
age_range (array-like, optional): The range of ages to consider. If None, defaults to np.arange(1, 15).
days_range (array-like, optional): The range of days to consider. If None, defaults to np.arange(1, 211).
above_age (int): The age above which to calculate the CCDF. Default is 0.
re_index (bool): Whether to re-index the days range starting from 1. Default is False.
Returns:
float: The CCDF value for the given age distribution and age cut-off.
"""
if datafilename.endswith('_output.tsv'):
datafilename = datafilename.replace(
'_output.tsv', '_age_distributions.npz')
data = np.load(os.path.realpath(os.path.expanduser(datafilename)))
age_dist = data[array]
age_range = age_range if age_range is not None else np.arange(1, 15)
days_range = days_range if days_range is not None else np.arange(1, 211)
days_labels = days_range if not re_index else np.arange(
len(days_range)) + 1
arr = age_dist[days_range-1, :]
arr = arr[:, age_range]
sba_df = pd.DataFrame(arr.T, index=age_range, columns=days_labels)
df_hist = sba_df.mean(axis=1)
i_min = df_hist.index.min()
i_max = df_hist.index.max()
tot_mass = df_hist.loc[i_min:i_max].sum()
mass_above = df_hist[df_hist.index > above_age].sum()
ccdf = mass_above / tot_mass
return ccdf
def draw_warmup_line(ax, x=168.5, color='black', linestyle='--', linewidth=2, text="Warm-up Period", xy=None,
xytext=None):
"""
Draws a vertical line on the given Axes object to indicate a warm-up period and annotates it with text.
Parameters:
ax (matplotlib.axes.Axes): The axes on which to draw the line.
x (float, optional): The x-coordinate for the vertical line. Default is 168.5.
color (str, optional): The color of the line and text. Default is 'black'.
linestyle (str, optional): The style of the line (e.g., '--' for dashed). Default is '--'.
linewidth (int, optional): The width of the line. Default is 2.
text (str, optional): The text to annotate the line with. Default is "Warm-up Period".
xy (tuple, optional): The (x, y) coordinates for the annotation.
Default is None, which calculates the y-coordinate as the mean of the y-axis limits.
xytext (tuple, optional): The (x, y) coordinates for the text position.
Default is None, which places the text slightly to the left of the line.
Returns:
ax (matplotlib.axes.Axes): The modified Axes object with the warm-up line and annotation.
"""
y = np.mean(ax.get_ylim())
if xy is None:
xy = (x, y)
if xytext is None:
xytext = (x * 0.99, y)
ax.axvline(x=x, color=color, linestyle=linestyle, linewidth=linewidth)
ax.annotate(text, xy=xy, xytext=xytext, rotation='vertical',
color=color, ha='right', va='center', textcoords='data')
return ax
def avg_stock_composition(rules, rule_files, donations=None, figsize=(12, 5), dpi=200, ncol=4,
bbox_to_anchor=(0.5, -0.12), pdf=None):
"""
Plots the average stock composition of blood groups based on given rules and rule files.
Parameters:
rules (list): List of rule names to be used as labels in the plot.
rule_files (list): List of file paths containing stock data for each rule.
donations (ndarray, optional): Array of initial donation proportions for each blood group.
Defaults to [14.6, 36.2, 2.8, 7.8, 7.8, 28.4, 0.6, 1.8] for O-, O+, B-, B+, A-, A+, AB-, and AB+ respectively.
figsize (tuple, optional): Size of the figure. Defaults to (12, 5).
dpi (int, optional): Dots per inch for the figure. Defaults to 200.
ncol (int, optional): Number of columns in the legend. Defaults to 4.
bbox_to_anchor (tuple, optional): Bounding box for the legend. Defaults to (0.5, -0.12).
pdf (str, optional): File path to save the figure as a PDF. If None, the figure is not saved. Defaults to None.
Returns:
None
"""
blood_grps = ['O-', 'O+', 'B-', 'B+', 'A-', 'A+', 'AB-', 'AB+']
if donations is None:
donations = np.array([14.6, 36.2, 2.8, 7.8, 7.8, 28.4, 0.6, 1.8])
data = [donations / donations.sum()]
files = []
for rule in rule_files:
stock = pd.read_csv(rule.replace(
'_output.tsv', '_stocks.tsv'), sep='\t')
stock = stock[blood_grps].to_numpy()[-42:].mean(axis=0)
data.append(stock / stock.sum())
data = np.transpose(data)
fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
for i in range(data.shape[0]):
ax.bar(rules, data[i], bottom=np.sum(
data[:i], axis=0), label=blood_grps[i])
ax.set_ylabel('Proportion of Total Stock')
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
ax.legend(ncol=ncol, bbox_to_anchor=bbox_to_anchor, loc='upper center')
fig.tight_layout()
if pdf is not None:
fig.savefig(pdf, bbox_inches='tight')
plt.show()