# Copyright Stephane LASSALVY 2024 - Licence GPL-3
#
# Disclaimer of Warranty
# ----------------------
# THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF
# ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
# PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
import pandas as pan
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from numpy.polynomial.polynomial import polyfit
import numpy as np
def Inertia(data_frame):
    # function : Inertia
    # Description : Compute the total inertia of a numeric data frame
    #               (its total variance, provided the columns are centred)
    # Input : data_frame, a numeric pandas data frame
    # Return : inertia, the total variance of the data frame / matrix
    X = np.array(data_frame)
    tX = X.transpose()
    # Compute the variance-covariance matrix of the data, then the inertia (its trace)
    varcov_matrix = np.matmul(tX, X) / X.shape[0]
    inertia = np.trace(varcov_matrix)
    print(f"Matrix inertia : {np.round(inertia, 3)}")
    return inertia
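# A minimal usage sketch (not part of the original addon; the synthetic data
# and the _demo_* name are illustrative assumptions):
def _demo_inertia():
    rng = np.random.default_rng(0)
    df = pan.DataFrame(rng.normal(size = (20, 3)), columns = ["x1", "x2", "x3"])
    # Centre the columns first so that the inertia reads as a total variance
    return Inertia(df - df.mean())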
def cross_validation_loo_overall(Y_data_frame, X_data_frame):
    # function : cross_validation_loo_overall
    # Description : Compute a cross-validation criterion for a PLS model with respect to the model dimension.
    #               The procedure uses LOO (leave-one-out) cross-validation.
    # Input : Y_data_frame, a numeric pandas data frame for the response variables
    #         X_data_frame, a numeric pandas data frame for the predictor variables
    # Return : a plot of the LOO criterion to assess the best model dimension (hyperparameter)
    cv = LeaveOneOut()
    PRESS_Vec = []
    for i in range(1, X_data_frame.shape[1] + 1):
        pls = PLSRegression(n_components = i)
        # Use LOOCV to evaluate the model: with one observation per fold, each
        # score is -|y_i - y_i_hat|, so squaring and averaging yields the mean
        # squared prediction error
        scores = cross_val_score(pls, X = X_data_frame, y = Y_data_frame, scoring = 'neg_mean_absolute_error', cv = cv, n_jobs = -1)
        PRESS = np.mean(scores**2)
        PRESS_Vec.append(PRESS)
    t = list(range(1, X_data_frame.shape[1] + 1))
    fig, ax = plt.subplots()
    ax.plot(t, PRESS_Vec, label = "PRESS")
    ax.set(xlabel = 'Number of axes', ylabel = 'PRESS', title = f"Optimal PRESS : {min(np.round(PRESS_Vec, 3))} for {PRESS_Vec.index(min(PRESS_Vec)) + 1} axis/axes")
    ax.grid()
    plt.legend()
    plt.show()
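# A minimal sketch of how the LOO routine might be called (illustrative,
# synthetic data; Y_df / X_df are assumed names, not part of the addon):
def _demo_cross_validation_loo_overall():
    rng = np.random.default_rng(1)
    X_df = pan.DataFrame(rng.normal(size = (15, 4)), columns = ["x1", "x2", "x3", "x4"])
    Y_df = pan.DataFrame({"y1": X_df["x1"] + 0.1 * rng.normal(size = 15)})
    cross_validation_loo_overall(Y_df, X_df)
    # The per-response variant defined below takes the same arguments:
    # cross_validation_loo_by_response(Y_df, X_df)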
def cross_validation_loo_by_response(Y_data_frame, X_data_frame):
    # function : cross_validation_loo_by_response
    # Description : Compute a cross-validation criterion for each response variable of a PLS model with respect to the model dimension.
    #               The procedure uses LOO (leave-one-out) cross-validation.
    # Input : Y_data_frame, a numeric pandas data frame for the response variables
    #         X_data_frame, a numeric pandas data frame for the predictor variables
    # Return : a plot of the LOO criteria, one curve per response variable, to assess the best model dimension (hyperparameter)
    variables_Y = list(Y_data_frame.columns)
    cv = LeaveOneOut()
    PRESS_List = []
    for variable in variables_Y:
        print(f"* Computation for variable {variable}")
        PRESS_Vec = []
        for i in range(1, X_data_frame.shape[1] + 1):
            print(f"- Computation with {i} axis/axes")
            pls = PLSRegression(n_components = i)
            # Use LOOCV to evaluate the model (see cross_validation_loo_overall)
            scores = cross_val_score(pls, X = X_data_frame, y = Y_data_frame[variable], scoring = 'neg_mean_absolute_error', cv = cv, n_jobs = -1)
            PRESS = np.mean(scores**2)
            PRESS_Vec.append(PRESS)
        PRESS_List.append(PRESS_Vec)
    fig, ax = plt.subplots()
    t = list(range(1, X_data_frame.shape[1] + 1))
    for i in range(0, len(variables_Y)):
        print(f"Optimal number of axes for variable n°{i + 1} ({variables_Y[i]}) : {PRESS_List[i].index(min(PRESS_List[i])) + 1}")
        ax.plot(t, PRESS_List[i], label = variables_Y[i])
    ax.set(xlabel = 'Number of axes', ylabel = 'PRESS', title = "PRESS for each response variable")
    plt.legend()
    ax.grid()
    plt.show()
def projection(data_frame, axes, varcov = True):
    # function : projection
    # Description : Compute the projection of a numeric data frame
    #               onto the vector subspace generated by the columns of the data frame axes.
    #               The projection can be made with the usual euclidean scalar product, metric D = Id,
    #               or with the variance-covariance metric D = 1/n * Id, where n is the sample size.
    # Input : data_frame, numeric pandas data frame to be projected
    #         axes, numeric pandas data frame whose columns generate the subspace of projection
    #         varcov, boolean value; if varcov == True, the scalar product uses D = 1/n * Id where n is the sample size,
    #         if varcov == False, the usual euclidean scalar product is used with D = Id.
    # Return : projection, the projection of data_frame onto axes
    # Y: data to be projected, X: data to project Y onto
    Y = np.array(data_frame)
    X = np.array(axes)
    # Choosing the metric D
    if varcov == True:
        den = X.shape[0]
    else:
        den = 1
    # Computing the projection PY = X (tX D X)^{-1} tX D Y
    tX = X.transpose()
    tXDX = np.matmul(tX, X) / den
    inv_tXDX = np.linalg.inv(tXDX)
    tXDY = np.matmul(tX, Y) / den
    inv_tXDX_tXDY = np.matmul(inv_tXDX, tXDY)
    projection = pan.DataFrame(np.matmul(X, inv_tXDX_tXDY), columns = data_frame.columns, index = data_frame.index)
    return projection
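# A quick numerical check of the formula above (synthetic data): the operator
# P = X (tX D X)^{-1} tX D is idempotent, so projecting twice changes nothing.
def _demo_projection():
    rng = np.random.default_rng(2)
    axes_df = pan.DataFrame(rng.normal(size = (10, 2)), columns = ["a1", "a2"])
    data_df = pan.DataFrame(rng.normal(size = (10, 3)), columns = ["y1", "y2", "y3"])
    once = projection(data_df, axes_df)
    twice = projection(once, axes_df)
    assert np.allclose(once, twice)
    return once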
def individuals_map(x_scores, axes_to_plot = [1, 2]):
    # function : individuals_map
    # Description : Plot the projection of the individuals on the factorial plane (t_i, t_j)
    # Input : x_scores, numeric pandas data frame of scores, with columns named "PC1", "PC2", ...
    #         axes_to_plot, a two-element list [i, j] with the indexes of the scores to use.
    # Return : plot of the X data on the subspace generated by (t_i, t_j)
    components_to_plot = ["PC" + j for j in map(str, axes_to_plot)]
    t_to_plot = ["t" + j for j in map(str, axes_to_plot)]
    # Plot the chosen scores
    fig, ax = plt.subplots(figsize = (8, 6))
    x = x_scores[components_to_plot[0]].to_list()
    y = x_scores[components_to_plot[1]].to_list()
    ax.scatter(x = x, y = y, color = "darkgreen")
    ax.set_xlabel(t_to_plot[0])
    ax.set_ylabel(t_to_plot[1])
    # Annotate the points
    for i, txt in enumerate(x_scores.index):
        ax.annotate(txt, (x[i] + 0.05, y[i] + 0.05))
    # Add horizontal and vertical axes and a title
    plt.vlines(x = 0, ymin = min(y) - 0.5, ymax = max(y) + 0.5, color = "black", linestyles = "dashed")
    plt.hlines(y = 0, xmin = min(x) - 0.5, xmax = max(x) + 0.5, color = "black", linestyles = "dashed")
    plt.title("Factorial map for individuals")
    plt.show()
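# Sketch of feeding PLS scores to individuals_map: the function expects columns
# named "PC1", "PC2", ...; wrapping pls.x_scores_ as below is an assumption
# about the intended usage (synthetic data).
def _demo_individuals_map():
    rng = np.random.default_rng(3)
    X_df = pan.DataFrame(rng.normal(size = (12, 3)), columns = ["x1", "x2", "x3"])
    Y_df = pan.DataFrame({"y1": X_df["x1"] + 0.1 * rng.normal(size = 12)})
    pls = PLSRegression(n_components = 2).fit(X_df, Y_df)
    scores = pan.DataFrame(pls.x_scores_, columns = ["PC1", "PC2"], index = X_df.index)
    individuals_map(scores, axes_to_plot = [1, 2])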
def coordinates(data_frame, axes, varcov = True):
    # function : coordinates
    # Description : Compute the coordinates of the columns of a numeric data frame on the
    #               axes given by the columns of the data frame axes.
    #               Coordinates can be computed with the usual euclidean scalar product, metric D = Id,
    #               or with the variance-covariance metric D = 1/n * Id, where n is the sample size.
    # Input : data_frame, numeric pandas data frame to be projected
    #         axes, numeric pandas data frame whose columns generate the subspace of projection
    #         varcov, boolean value; if varcov == True, the scalar product uses D = 1/n * Id where n is the sample size,
    #         if varcov == False, the usual euclidean scalar product is used with D = Id.
    # Return : X_coordinates, the coordinates of the columns of data_frame on the columns of axes
    X = np.array(data_frame)
    tX = X.transpose()
    scaler = StandardScaler(with_mean = True, with_std = True)
    scaled_axes = scaler.fit_transform(axes)
    # Choosing the metric D
    if varcov == True:
        den = X.shape[0]
    else:
        den = 1
    X_coordinates = np.matmul(tX, scaled_axes) / den
    X_coordinates = pan.DataFrame(X_coordinates, columns = axes.columns, index = data_frame.columns)
    return X_coordinates
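# Illustrative use of coordinates (synthetic data): with standardized input
# columns, tX * scaled_axes / n yields the correlations between the original
# variables and the scores, which is what the variable map below expects.
def _demo_coordinates():
    rng = np.random.default_rng(4)
    X_df = pan.DataFrame(scale(rng.normal(size = (12, 3))), columns = ["x1", "x2", "x3"])
    Y_df = pan.DataFrame({"y1": X_df["x1"] + 0.1 * rng.normal(size = 12)})
    pls = PLSRegression(n_components = 2).fit(X_df, Y_df)
    scores = pan.DataFrame(pls.x_scores_, columns = ["PC1", "PC2"], index = X_df.index)
    return coordinates(X_df, scores)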
def variables_map(var_coordinates, axes_to_plot = [1, 2], draw_circle = True):
    # function : variables_map
    # Description : Plot the projection of the predictor and target variables
    #               on the factorial plane (t_i, t_j)
    # Input : var_coordinates, numeric pandas data frame of the variable coordinates
    #         axes_to_plot, a two-element list [i, j] with the indexes of the coordinates to use.
    # Return : plot of the X and Y variables on the subspace generated by (t_i, t_j)
    components_to_plot = ["PC" + j for j in map(str, axes_to_plot)]
    t_to_plot = ["t" + j for j in map(str, axes_to_plot)]
    if "color" not in var_coordinates.columns:
        var_coordinates["color"] = "black"
    # Plot a variable factor map for the chosen dimensions.
    fig, ax = plt.subplots(figsize = (8, 8))
    x = var_coordinates[components_to_plot[0]].to_list()
    y = var_coordinates[components_to_plot[1]].to_list()
    # Draw an arrow for each variable, colour it according to whether it belongs to Y or X, and annotate it
    txt = var_coordinates.index
    colors = var_coordinates["color"].to_list()
    for i in range(0, var_coordinates.shape[0]):
        ax.arrow(0,
                 0,     # Start the arrow at the origin
                 x[i],  # Coordinate on the first chosen axis
                 y[i],  # Coordinate on the second chosen axis
                 head_width = 0.01,
                 head_length = 0.01)
        plt.text(x[i] + 0.05,
                 y[i] + 0.05,
                 txt[i], color = colors[i])
    # Add labels to the axes
    ax.set_xlabel(t_to_plot[0])
    ax.set_ylabel(t_to_plot[1])
    if draw_circle == True:
        # Add horizontal and vertical axes
        plt.vlines(x = 0, ymin = -1, ymax = 1, color = "black", linestyles = "dashed")
        plt.hlines(y = 0, xmin = -1, xmax = 1, color = "black", linestyles = "dashed")
        # Add a unit circle for scale
        an = np.linspace(0, 2 * np.pi, 100)
        plt.plot(np.cos(an), np.sin(an), color = "black")
        plt.axis('equal')
        plt.xlim([-1.1, 1.1])
        plt.ylim([-1.1, 1.1])
    else:
        # Add horizontal and vertical axes
        plt.vlines(x = 0, ymin = min(y) - 0.1, ymax = max(y) + 0.1, color = "black", linestyles = "dashed")
        plt.hlines(y = 0, xmin = min(x) - 0.1, xmax = max(x) + 0.1, color = "black", linestyles = "dashed")
        plt.xlim([min(x) - 0.1, max(x) + 0.1])
        plt.ylim([min(y) - 0.1, max(y) + 0.1])
    plt.show()
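# Sketch of a joint variable map (synthetic data): stack the Y and X coordinate
# frames and tag each row with a "color", which variables_map reads if present.
def _demo_variables_map():
    rng = np.random.default_rng(5)
    X_df = pan.DataFrame(scale(rng.normal(size = (12, 3))), columns = ["x1", "x2", "x3"])
    Y_df = pan.DataFrame(scale(rng.normal(size = (12, 1))), columns = ["y1"])
    pls = PLSRegression(n_components = 2).fit(X_df, Y_df)
    scores = pan.DataFrame(pls.x_scores_, columns = ["PC1", "PC2"], index = X_df.index)
    coords = pan.concat([coordinates(Y_df, scores), coordinates(X_df, scores)])
    coords["color"] = ["red"] + ["blue"] * 3
    variables_map(coords, axes_to_plot = [1, 2], draw_circle = True)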
def fit_PLS_by_dimension(responses_Y, predictors_X):
    # function : fit_PLS_by_dimension
    # Description : fit several PLS models, for dimension = 1, 2, ... up to dimension = ncol(predictors_X)
    # Input : responses_Y, numeric pandas data frame for the response variables
    #         predictors_X, numeric pandas data frame for the predictor variables
    # Return : output, a dictionary gathering the different results :
    #          'Model dimensions': the different dimensions possible for the PLS models,
    #          'Y_inertia': total inertia of the responses Y data frame,
    #          'X_inertia': total inertia of the predictors X data frame,
    #          'Y_fitted_list': list of fitted values for Y by model dimension,
    #          'X_fitted_list': list of fitted values for X by model dimension,
    #          'X_fitted_inertia_list': X fitted data frame inertia by model dimension,
    #          'X_fitted_inertia_pct_list': X fitted data frame inertia in percent by model dimension,
    #          'Y_fitted_inertia_list': Y fitted data frame inertia by model dimension,
    #          'Y_fitted_inertia_pct_list': Y fitted data frame inertia in percent by model dimension,
    #          'residuals_list': residuals data frame by model dimension,
    #          'MSE_list': MSE by model dimension,
    #          'RMSE_list': RMSE by model dimension,
    #          'B_coefficients_matrix': data frame B of beta coefficients by model dimension,
    #          'Beta_list': list of beta coefficients by model dimension
    # Reconstructed data with the successive models (dimension = 1, 2, 3, ..., ncol(X))
    Y_fitted_list = []
    X_fitted_list = []
    # Y and X explained inertia for fitted Y and X, raw and in percent
    Y_fitted_inertia_list = []
    Y_fitted_inertia_pct_list = []
    X_fitted_inertia_list = []
    X_fitted_inertia_pct_list = []
    # Residuals list
    residuals_list = []
    # MSE and RMSE lists
    MSE_list = []
    RMSE_list = []
    # Regression coefficients
    B = pan.DataFrame()
    Beta_list = []
    # Y and X inertias
    print("total variance of matrix Y:")
    Y_inertia = Inertia(responses_Y)
    print("total variance of matrix X:")
    X_inertia = Inertia(predictors_X)
    print("Shape of responses matrix Y:")
    print(responses_Y.shape)
    print("Shape of predictors matrix X:")
    print(predictors_X.shape)
    print("-- Fitting the model for the different dimensions --")
    for i in range(1, predictors_X.shape[1] + 1):
        print(f"Dimension = {i} :")
        print("-----------------")
        pls = PLSRegression(n_components = i)
        pls.fit(X = predictors_X, y = responses_Y)
        # Y fitted values
        Y_fitted_values = pan.DataFrame(pls.predict(predictors_X), index = responses_Y.index, columns = responses_Y.columns)
        Y_fitted_list.append(Y_fitted_values)
        # t scores of the PLS
        x_scores = pls.x_scores_
        # Projection of X onto the t scores
        X_fitted = projection(predictors_X, x_scores)
        X_fitted_list.append(X_fitted)
        # % of explained inertia for X
        X_fitted_inertia = Inertia(X_fitted)
        X_fitted_inertia_pct = X_fitted_inertia / X_inertia * 100
        print(f"X : % of explained inertia : {np.round(X_fitted_inertia_pct, 3)} %")
        X_fitted_inertia_list.append(X_fitted_inertia)
        X_fitted_inertia_pct_list.append(X_fitted_inertia_pct)
        # % of explained inertia for Y
        Y_fitted_inertia = Inertia(Y_fitted_values)
        Y_fitted_inertia_pct = Y_fitted_inertia / Y_inertia * 100
        print(f"Y : % of explained inertia : {np.round(Y_fitted_inertia_pct, 3)} %")
        Y_fitted_inertia_list.append(Y_fitted_inertia)
        Y_fitted_inertia_pct_list.append(Y_fitted_inertia_pct)
        # Residuals
        residual_matrix = responses_Y - Y_fitted_values
        residuals_list.append(residual_matrix)
        # MSE and RMSE
        MSE = Inertia(residual_matrix)
        RMSE = np.sqrt(MSE)
        print(f"MSE = {np.round(MSE, 3)}, RMSE = {np.round(RMSE, 3)}")
        MSE_list.append(MSE)
        RMSE_list.append(RMSE)
        # Regression coefficients for the X predictors
        Beta = np.array(pls.coef_)
        tBeta = Beta.transpose()
        tBeta = pan.DataFrame(tBeta, columns = [j + f"{i}" for j in responses_Y.columns.to_list()])
        B = pan.concat([B, tBeta], axis = 1)
        Beta_list.append(tBeta)
    # Naming the rows of B
    B.index = predictors_X.columns
    # Gathering the different results
    output = {'Model dimensions': list(range(1, predictors_X.shape[1] + 1)),
              'Y_shape': responses_Y.shape,
              'Y_variables': responses_Y.columns,
              'Y_individuals': responses_Y.index,
              'Y_inertia': Y_inertia,
              'X_shape': predictors_X.shape,
              'X_variables': predictors_X.columns,
              'X_individuals': predictors_X.index,
              'X_inertia': X_inertia,
              'Y_fitted_list': Y_fitted_list,
              'X_fitted_list': X_fitted_list,
              'X_fitted_inertia_list': X_fitted_inertia_list,
              'X_fitted_inertia_pct_list': X_fitted_inertia_pct_list,
              'Y_fitted_inertia_list': Y_fitted_inertia_list,
              'Y_fitted_inertia_pct_list': Y_fitted_inertia_pct_list,
              'residuals_list': residuals_list,
              'MSE_list': MSE_list,
              'RMSE_list': RMSE_list,
              'B_coefficients_matrix': B,
              'Beta_list': Beta_list}
    return output
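# End-to-end sketch (synthetic data; names are illustrative): fit every
# dimension, then read the results back from the output dictionary.
def _demo_fit_PLS_by_dimension():
    rng = np.random.default_rng(6)
    X_df = pan.DataFrame(scale(rng.normal(size = (15, 4))), columns = ["x1", "x2", "x3", "x4"])
    Y_df = pan.DataFrame({"y1": X_df["x1"] + 0.1 * rng.normal(size = 15)})
    out = fit_PLS_by_dimension(Y_df, X_df)
    print(out['B_coefficients_matrix'])
    return out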
def MSE_plot(MSE_list):
    # function : MSE_plot
    # Description : plot MSE and RMSE vs model dimension
    # Input : MSE_list, list of the MSE for the models of dimension 1 to the number of predictors
    # Return : plot of MSE and RMSE vs model dimension
    fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))
    x = range(1, len(MSE_list) + 1)
    ax[0].bar(x, MSE_list)
    ax[1].bar(x, np.sqrt(MSE_list))
    ax[0].tick_params(axis = 'x', labelrotation = 90)
    ax[1].tick_params(axis = 'x', labelrotation = 90)
    ax[0].set(xlabel = 'Model dimension', ylabel = "MSE", title = "MSE vs Model dimension")
    ax[1].set(xlabel = 'Model dimension', ylabel = "RMSE", title = "RMSE vs Model dimension")
    # Show the graph
    plt.show()
def Beta_vs_dim_plot(Beta_data_frame, response_names):
    # function : Beta_vs_dim_plot
    # Description : plot the Beta coefficients vs the dimension of the PLS models
    # Input : Beta_data_frame, numeric pandas data frame of the Beta coefficients
    #         response_names, list of the response variable names
    # Return : for each response variable, plot of the Beta values vs the dimensions of the PLS models
    # Plotting the coefficients
    t = list(range(1, Beta_data_frame.shape[0] + 1))
    predictors_names = Beta_data_frame.index.tolist()
    for j in response_names:
        selection = [f"{j}" + f"{k}" for k in t]
        Beta_sel = Beta_data_frame[selection]
        Beta_sel_ctrd = scale(Beta_sel, with_mean = True, with_std = False, axis = 0)
        Beta_sel_ctrd = pan.DataFrame(Beta_sel_ctrd, columns = Beta_sel.columns, index = Beta_sel.index)
        fig, ax = plt.subplots()
        for k in range(0, Beta_sel_ctrd.shape[0]):
            # Row k holds the coefficients of predictor k across the model dimensions
            ax.plot(t, Beta_sel_ctrd.iloc[k, :], label = predictors_names[k])
        ax.set(xlabel = 'Number of axes', ylabel = 'Beta coefficients', title = f"Coefficients for variable {j}")
        plt.legend()
        ax.grid()
        plt.show()
def cumulated_inertia_plot(inertia_list, total_inertia, df = "Y"):
    # function : cumulated_inertia_plot
    # Description : plot the explained inertia vs the model dimension
    # Input : inertia_list, list of the explained inertias
    #         total_inertia, inertia of the original data frame
    #         df, character specifying for which data, X or Y, the explained inertias are plotted
    # Return : plot of the explained inertia (raw and in percent) vs the model dimension
    fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))
    x = range(1, len(inertia_list) + 1)
    # Graph of the explained inertia
    ax[0].bar(x, inertia_list)
    ax[0].tick_params(axis = 'x', labelrotation = 90)
    ax[0].set(xlabel = 'Model dimension', ylabel = f"{df} variance explained", title = f"{df} variance explained vs Model dimension")
    ax[0].hlines(y = 0.8 * total_inertia, xmin = min(x) - 1, xmax = max(x) + 1, color = "black", linestyles = "dashed")
    # Graph of the explained inertia in percent
    inertia_pct_list = np.array(inertia_list) / total_inertia * 100
    ax[1].bar(x, inertia_pct_list)
    ax[1].tick_params(axis = 'x', labelrotation = 90)
    ax[1].set(xlabel = 'Model dimension', ylabel = f"{df} variance explained in %", title = f"{df} variance explained in % vs Model dimension")
    ax[1].hlines(y = 80, xmin = min(x) - 1, xmax = max(x) + 1, color = "black", linestyles = "dashed")
    # Show the graph
    plt.show()
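# Illustrative chaining of the two diagnostic plots above from the dictionary
# returned by fit_PLS_by_dimension (the _demo_* helper is the sketch above).
def _demo_dimension_diagnostics():
    out = _demo_fit_PLS_by_dimension()
    MSE_plot(out['MSE_list'])
    cumulated_inertia_plot(out['Y_fitted_inertia_list'], out['Y_inertia'], df = "Y")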
def coefficients_plot(pls_model, responses_names, predictors_names):
    # function : coefficients_plot
    # Description : plot the regression coefficients of a fitted PLS model
    # Input : pls_model, fitted model of a given dimension whose beta coefficients are to be displayed
    #         responses_names, index of the names of the response variables
    #         predictors_names, index of the names of the predictor variables
    # Return : for each response variable, barplots of the Beta values per predictor variable, sorted by absolute value of the coefficients
    Coefs = np.array(pls_model.coef_)
    # Beta coefficients for all response variables
    Beta = Coefs.transpose()
    Beta = pan.DataFrame(Beta, columns = responses_names, index = predictors_names)
    print("Predictors regression coefficients")
    print(Beta)
    # Display the barplots
    for j in responses_names:
        predictors_names_tuple = tuple(predictors_names)
        values = Beta[j].tolist()
        abs_values = np.abs(values)
        coefs_j = pan.DataFrame({'predictors': predictors_names_tuple, 'coef': values, 'abs_coef': abs_values})
        coefs_j = coefs_j.sort_values(by = ["abs_coef"], ascending = False)
        # Create the bars
        fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))
        ax[0].bar(coefs_j["predictors"], coefs_j["coef"])
        ax[1].bar(coefs_j["predictors"], coefs_j["abs_coef"])
        ax[0].tick_params(axis = 'x', labelrotation = 90)
        ax[1].tick_params(axis = 'x', labelrotation = 90)
        ax[0].set(xlabel = 'Predictors', ylabel = "Regression coefficients", title = f"Regression coefficients for {j}")
        ax[1].set(xlabel = 'Predictors', ylabel = "Absolute value of coefficients", title = f"Absolute value of coefficients for {j}")
        # Show the graph
        plt.show()
def Y_vs_xscores_plot(responses_data_frame, x_scores_data_frame):
    # function : Y_vs_xscores_plot
    # Description : plot the response variables vs the X score axes
    # Input : responses_data_frame, numeric pandas data frame for the observed response variables
    #         x_scores_data_frame, data frame of the scores for the predictor variables
    # Return : plot of each response variable vs each predictor score axis
    for j in responses_data_frame.columns.tolist():
        for k in range(0, x_scores_data_frame.shape[1]):
            fig, ax = plt.subplots()
            x = x_scores_data_frame.iloc[:, k].tolist()
            y = responses_data_frame[j].tolist()
            r = np.corrcoef(x, y)[0, 1]
            rsquared = r ** 2
            ax.scatter(x = x, y = y)
            for i, txt in enumerate(x_scores_data_frame.index):
                ax.annotate(txt, (x[i] + 0.05, y[i] + 0.05))
            # Regression fit with polyfit (coefficients in increasing degree order: intercept b, slope m)
            b, m = polyfit(x, y, 1)
            # Draw the regression line
            x_reg = np.linspace(start = min(x), stop = max(x), num = 50)
            ax.plot(x_reg, m * x_reg + b)
            ax.set(xlabel = f"t axis n° {k+1}", ylabel = f"Variable {j}", title = f"Variable {j} vs X score t{k+1}, corr = {np.round(r, 3)}, rsquared = {np.round(rsquared, 3)}")
            ax.grid()
            plt.show()
def yscores_vs_xscores_plot(x_scores_data_frame, y_scores_data_frame):
    # function : yscores_vs_xscores_plot
    # Description : plot the Y scores (u) vs the X scores (t), dimension by dimension
    # Input : x_scores_data_frame, data frame of the scores for the predictor variables
    #         y_scores_data_frame, data frame of the scores for the response variables
    # Return : plot of the response scores vs the predictor scores for each dimension
    for k in range(0, x_scores_data_frame.shape[1]):
        fig, ax = plt.subplots()
        x = x_scores_data_frame.iloc[:, k].tolist()
        y = y_scores_data_frame.iloc[:, k].tolist()
        std_x = np.std(x, ddof = 1)
        std_y = np.std(y, ddof = 1)
        cov_xy = np.cov(x, y, ddof = 1)[0, 1]
        r = np.corrcoef(x, y)[0, 1]
        rsquared = r ** 2
        print(f"Dimension {k + 1} : cov(t,u) = {np.round(cov_xy, 3)}, corr(t,u) = {np.round(r, 3)}, stdev(t) = {np.round(std_x, 3)}, stdev(u) = {np.round(std_y, 3)}, rsquared(t,u) = {np.round(rsquared, 3)}")
        ax.scatter(x = x, y = y)
        # Annotate the points
        for i, txt in enumerate(x_scores_data_frame.index):
            ax.annotate(txt, (x[i] + 0.05, y[i] + 0.05))
        # Regression fit with polyfit (coefficients in increasing degree order: intercept b, slope m)
        b, m = polyfit(x, y, 1)
        # Draw the regression line
        x_reg = np.linspace(start = min(x), stop = max(x), num = 50)
        ax.plot(x_reg, m * x_reg + b)
        ax.set(xlabel = f"t axis n° {k+1}", ylabel = f"u axis n° {k+1}", title = f"Y scores u{k+1} vs X scores t{k+1}, corr = {np.round(r, 3)}, rsquared = {np.round(rsquared, 3)}")
        ax.grid()
        plt.show()
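# Sketch of building the score frames for the two plotting helpers above
# (synthetic data; the DataFrame wrapping is an assumed convention).
def _demo_scores_plots():
    rng = np.random.default_rng(7)
    X_df = pan.DataFrame(scale(rng.normal(size = (12, 3))), columns = ["x1", "x2", "x3"])
    Y_df = pan.DataFrame({"y1": X_df["x1"] + 0.1 * rng.normal(size = 12)})
    pls = PLSRegression(n_components = 2).fit(X_df, Y_df)
    t_scores = pan.DataFrame(pls.x_scores_, index = X_df.index)
    u_scores = pan.DataFrame(pls.y_scores_, index = Y_df.index)
    yscores_vs_xscores_plot(t_scores, u_scores)
    Y_vs_xscores_plot(Y_df, t_scores)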
def predicted_vs_observed_plot(Y, Y_fitted):
    # function : predicted_vs_observed_plot
    # Description : plot predicted vs observed data
    # Input : Y, numeric pandas data frame for the observed response variables
    #         Y_fitted, numeric pandas data frame for the predicted response variables
    # Return : plot of the predicted values vs the observed values of the response variables
    for j in Y.columns:
        fig, ax = plt.subplots()
        x = Y[j].to_list()
        y = Y_fitted[j].to_list()
        ax.scatter(x, y)
        r = np.corrcoef(x, y)[0, 1]
        rsquared = r ** 2
        for i, txt in enumerate(Y.index):
            ax.annotate(txt, (x[i] + 0.05, y[i] + 0.05))
        # Draw the identity line y = x as a reference
        x_reg = np.linspace(min(Y[j]), max(Y[j]), 50)
        ax.plot(x_reg, x_reg)
        ax.set(xlabel = "Observed values", ylabel = "Predicted values", title = f"Predicted vs observed values for {j}, r = {np.round(r, 3)}, rsquared = {np.round(rsquared, 3)}")
        plt.show()
def residuals_vs_fitted_values_plot(residuals_data_frame, fitted_values_data_frame):
    # function : residuals_vs_fitted_values_plot
    # Description : plot the residuals vs the predicted values
    # Input : residuals_data_frame, data frame of the residuals for each response variable
    #         fitted_values_data_frame, data frame of the predicted values for each response variable
    # Return : for each response variable, plot of the residuals vs the predicted values
    for j in residuals_data_frame.columns.tolist():
        fig, ax = plt.subplots()
        x = fitted_values_data_frame[j].tolist()
        y = residuals_data_frame[j].tolist()
        individuals = residuals_data_frame.index.tolist()
        ax.scatter(x = x, y = y)
        # Annotate the points
        for i, txt in enumerate(individuals):
            ax.annotate(txt, (x[i] + 0.05, y[i] + 0.05))
        ax.set(xlabel = "Fitted values", ylabel = "Residuals", title = f"Residuals vs fitted for {j}")
        # Add a horizontal axis at zero
        plt.hlines(y = 0, xmin = min(x) - 0.5, xmax = max(x) + 0.5, color = "black", linestyles = "dashed")
        # Show the graph
        plt.show()
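# Illustrative call with a direct PLS fit (synthetic data): compare observed
# and fitted responses, then inspect the residuals.
def _demo_fit_diagnostics():
    rng = np.random.default_rng(8)
    X_df = pan.DataFrame(rng.normal(size = (12, 3)), columns = ["x1", "x2", "x3"])
    Y_df = pan.DataFrame({"y1": X_df["x1"] + 0.1 * rng.normal(size = 12)})
    pls = PLSRegression(n_components = 2).fit(X_df, Y_df)
    Y_fit = pan.DataFrame(pls.predict(X_df), index = Y_df.index, columns = Y_df.columns)
    predicted_vs_observed_plot(Y_df, Y_fit)
    residuals_vs_fitted_values_plot(Y_df - Y_fit, Y_fit)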
# Addon functions for PCA analysis
def variance_ratios_plot(variance_ratios):
    # function : variance_ratios_plot
    # Description : plot the variance ratios from a PCA
    # Input : variance_ratios, list of the variance ratios
    # Return : plot of the variance ratios and of the cumulative variance ratios
    cumul_variance_ratios = np.cumsum(variance_ratios)
    x = range(1, len(variance_ratios) + 1)
    fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))
    ax[0].bar(x, variance_ratios)
    ax[1].bar(x, cumul_variance_ratios)
    ax[0].tick_params(axis = 'x', labelrotation = 90)
    ax[1].tick_params(axis = 'x', labelrotation = 90)
    ax[1].hlines(y = 0.80, xmin = min(x) - 1, xmax = max(x) + 1, color = "black", linestyles = "dashed")
    ax[0].set(xlabel = 'Model dimension', ylabel = "Variance ratios", title = "Variance ratios vs dimension")
    ax[1].set(xlabel = 'Model dimension', ylabel = "Cum. variance ratios", title = "Cum. variance ratios vs dimension")
    # Show the graph
    plt.show()
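# Illustrative PCA companion call (synthetic data): the variance ratios come
# straight from sklearn's PCA.
def _demo_variance_ratios_plot():
    rng = np.random.default_rng(9)
    pca = PCA().fit(scale(rng.normal(size = (20, 5))))
    variance_ratios_plot(pca.explained_variance_ratio_)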