diff --git a/model_training/primitive_vs_metastasis_model.py b/model_training/primitive_vs_metastasis_model.py new file mode 100644 index 0000000..3e16cb1 --- /dev/null +++ b/model_training/primitive_vs_metastasis_model.py @@ -0,0 +1,809 @@ +""" +In this file we investigate the training of a model to predict survival in the following cases: + - for primitive patients + - for metastasis patients + - for all patients but with a feature indicating whether the patient is primitive or metastasis + +Args: + --merged-data: Path to the csv file containing the merged data + --output-dir: Path to the directory where the model will be saved + +Returns: + None + +Example: + python model_training/primitif_vs_metastase_model.py --merged-data data/merged_data.csv --output-dir models/ + +Author: Pierre-Louis Benveniste +""" +import pandas as pd +import argparse +import matplotlib.pyplot as plt +from xgboost import XGBClassifier +from sklearn.metrics import roc_auc_score, auc, brier_score_loss, precision_recall_curve, accuracy_score, roc_curve, precision_score, recall_score, confusion_matrix +from sklearn.feature_selection import VarianceThreshold + + +def parse_args(): + """ + This function is used to parse the arguments given to the script + """ + parser = argparse.ArgumentParser(description='Preprocess the data from the lung cancer response dataset') + parser.add_argument('--input', type=str, help='Path to the csv file containing the merged data') + parser.add_argument('--output', type=str, help='Path to the folder where the results will be saved') + return parser.parse_args() + + +def main(): + """ + This is the main function of the script. + It does the training and evaluation of the survival model. + """ + + # We parse the arguments + args = parse_args() + input_data = args.input + output_folder = args.output + + ######################################################################## + #################### DATA PREPROCESSING ################################ + ######################################################################## + + # Load the dataset + data = pd.read_csv(input_data) + + # We remove data which is note useful to make the averaging easier + data = data.drop(columns=['DDD', 'cause_DC', 'Date_R_PTV', 'Date_R_homo', 'Date_R_med', 'Date_R_contro', 'Date_R_horspoum', 'Reponse', 'delai_fin_rechutePTV', 'delai_fin_rechuteHomo', + 'delai_fin_rechuteMed', 'delai_fin_rechuteContro', 'delai_fin_rechuteHorspoum','subject_nodule', 'nodule' ]) + + + # We average the columns for the same patients across the different nodules + data_grouped = data.groupby('subject_id').mean().reset_index() + + # We build the column relapse which takes 1 if either of the relapse occur ('rechute_PTV', 'rechute_homo', 'rechute_med', 'rechute_contro', 'rechute_horspoum') + data_grouped['relapse'] = data_grouped['rechute_PTV'] + data_grouped['rechute_homo'] + data_grouped['rechute_med'] + data_grouped['rechute_contro'] + data_grouped['rechute_horspoum'] + # Modify the 'relapse' column to take value 1 if it is above 0 + data_grouped['relapse'] = data_grouped['relapse'].apply(lambda x: 1 if x > 0 else 0) + + # Print number of primitive patients and number of metastasis + print("Total number of patients", data_grouped.shape[0]) + print("Number of primitive patients", data_grouped[data_grouped['relapse']==0].shape[0]) + print("Number of metastasis patients", data_grouped[data_grouped['relapse']!=0].shape[0]) + print("\n") + + # We extract the site where the subjects are from which is the first letter of the subject_id + data_grouped['site'] = data_grouped['subject_id'].apply(lambda x: x[0]) + + # We build a column which indicates if it is used as training or testing data (testing if the subject is from site 'V') + data_grouped['train_test'] = data_grouped['site'].apply(lambda x: 'train' if x != 'V' else 'test') + + ################################################################################################ + ######## FIRST SCENARIO : MODEL FOR PREDICTION OF SURVIVAL FOR PRIMITIVE PATIENTS ############## + ################################################################################################ + + print(" ------------- Model for prediction of survival for primitive patients -------------") + + # We remove patients that are metastasis (have relapse different from 0) + data_primitive = data_grouped[data_grouped['relapse'] == 0] + + # Split into features and target + y = data_primitive[['DC', 'train_test']] + x = data_primitive.drop(columns=['DC', 'delai_fin_DC', 'subject_id', 'site', 'relapse']) + + # In this case, because we are only interested in the prediction of survival, we extract only the 'DC' + y = y[['DC', 'train_test']] + # We replace all nan values by 0 in 'DC' + y.fillna(0, inplace=True) + + # Describe x and y + print('Feature data shape:', x.shape) + print('Target data shape:', y.shape) + print("Number of subjects which died:", y[y['DC']==1].shape[0]) + + # Split the data into training and testing sets based on column 'train_test' + x_train = x[x['train_test'] == 'train'].drop(columns=['train_test']) + x_test = x[x['train_test'] == 'test'].drop(columns=['train_test']) + y_train = y[y['train_test'] == 'train'].drop(columns=['train_test']) + y_test = y[y['train_test'] == 'test'].drop(columns=['train_test']) + # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1) + print("\nNumber of subject for training:", x_train.shape[0]) + print("Number of subject for testing:", x_test.shape[0]) + print("\n") + + # Plot the distribution of 'delai_fin_DC' + data_primitive['delai_fin_DC'].hist() + plt.title('Distribution of survival time between end of treatment and\n death (in days) for those that died') + plt.xlabel('Survival time') + plt.ylabel('Number of subjects') + # plt.show() + + # Now we consider that every person that die after 3 year is considered as not dead (using the delai_fin_DC column) + y_deadline_3_year = data_primitive[['delai_fin_DC', 'train_test']] + y_deadline_3_year.fillna(1096, inplace=True) + y_deadline_3_year['delai_fin_DC'] = y_deadline_3_year['delai_fin_DC'].apply(lambda x: 0 if x > 1095 else 1) + print("Number of subjects that died within 3 year:", y_deadline_3_year[y_deadline_3_year['delai_fin_DC'] == 1].shape[0]) + + # Split the data into training and testing sets + y_train = y_deadline_3_year[y_deadline_3_year['train_test'] == 'train'].drop(columns=['train_test']) + y_test = y_deadline_3_year[y_deadline_3_year['train_test'] == 'test'].drop(columns=['train_test']) + print("\nNumber of subjects that died within 3 year (train):", y_train[y_train['delai_fin_DC'] == 1].shape[0]) + print("Number of subjects that died within 3 year (test):", y_test[y_test['delai_fin_DC'] == 1].shape[0]) + + # Initialise the model + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the training set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"\nModel performance on the deadline of 3 year with {x.shape[1]} features") + print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score:",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + # We now perform feature selection + + # Now we move on to training the model with no deadline with fewer features based on what we explored in file `data_preprocessing/5_eliminating_radiomics_features.py` + # We keep the clinical features + clinical_features = ['sexe', 'age', 'BMI', 'score_charlson', 'OMS', 'tabac', 'tabac_PA', 'tabac_sevre', 'histo', 'T', 'centrale', 'dose_tot', 'etalement', + 'vol_GTV', 'vol_PTV', 'vol_ITV', 'couv_PTV', 'BED_10'] + # We keep some radiomics features: selected here: + radiomics_features = ['INTENSITY-BASED_MeanIntensity', 'INTENSITY-BASED_IntensitySkewness', 'INTENSITY-BASED_IntensityKurtosis', 'INTENSITY-BASED_10thIntensityPercentile', + 'INTENSITY-BASED_AreaUnderCurveCIVH', 'INTENSITY-BASED_RootMeanSquareIntensity', 'INTENSITY-HISTOGRAM_IntensityHistogramMean', + 'INTENSITY-HISTOGRAM_IntensityHistogramVariance', 'NGTDM_Complexity', 'NGTDM_Strength'] + # Join the two lists + x = data_primitive[clinical_features + radiomics_features + ['train_test']] + y = data_primitive[['train_test', 'delai_fin_DC']] + # We replace all nan values by 0 in 'DC' + y.fillna(0, inplace=True) + + # In column 'delai_fin_DC', we replace all nan values by 1096 (3 years) + y['delai_fin_DC'].fillna(1096, inplace=True) + y['delai_fin_DC'] = y['delai_fin_DC'].apply(lambda x: 0 if x > 1095 else 1) + + # Renane column 'delai_fin_DC' to 'DC' + y = y.rename(columns={'delai_fin_DC': 'DC'}) + + # Split the data into training and testing sets + x_train = x[x['train_test'] == 'train'].drop(columns=['train_test']) + x_test = x[x['train_test'] == 'test'].drop(columns=['train_test']) + y_train = y[y['train_test'] == 'train'].drop(columns=['train_test']) + y_train = y_train['DC'] + y_test = y[y['train_test'] == 'test'].drop(columns=['train_test']) + y_test = y_test['DC'] + print("\n --- Feature selection ---") + print("Initial number of features:", x_train.shape[1]) + print("Number of subject for training:", x_train.shape[0]) + print("Number of subject for testing:", x_test.shape[0]) + print("\n") + + # Initialise the model + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the training set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance with 3 year deadline with {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + #Now we perform variance thresholding + shape_ini = x_train.shape + selector = VarianceThreshold(threshold=0.3) + selector.fit(x_train) + # Extract the names of the columns to remove + columns_to_remove_var_thresh = x_train.columns[~selector.get_support()] + # We remove these columns from the data + x_train = x_train.drop(columns=columns_to_remove_var_thresh) + x_test = x_test.drop(columns=columns_to_remove_var_thresh) + print("Number of features after variance thresholding:", x_train.shape[1]) + print("Number of features removed by variance thresholding:", shape_ini[1]-x_train.shape[1]) + print("\n") + + # We evaluate the model after variance thresholding + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the test set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance after variance thresholding with 3 year deadline and {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + # We now perform feature selection based on the correlation with other features + corr_matrix = x_train.corr() + columns = corr_matrix.columns + columns_to_drop_corr_feat = [] + for i in range(len(columns)): + for j in range(i + 1, len(columns)): + if corr_matrix.loc[columns[i], columns[j]] > 0.80: + columns_to_drop_corr_feat.append(columns[j]) + columns_to_drop_corr_feat = set(columns_to_drop_corr_feat) + print("Number of features after correlation thresholding:", x_train.shape[1]-len(columns_to_drop_corr_feat)) + print("Number of features removed by correlation thresholding:", len(columns_to_drop_corr_feat)) + print("\n") + + # # Remove the columns + x_train = x_train.drop(columns = list(columns_to_drop_corr_feat)) + x_test = x_test.drop(columns = list(columns_to_drop_corr_feat)) + + # We evaluate the model after feature selection based on correlation + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the test set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance after feature selection based on correlation (3-year deadline) with {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + # We now perform feature selection based on the correlation with the target variable + corr_matrix = x_train.corrwith(y_train) + columns = corr_matrix.index + columns_to_drop_corr_target = [] + for i in range(len(columns)): + if abs(corr_matrix.iloc[i]) < 0.014: + columns_to_drop_corr_target.append(columns[i]) + columns_to_drop_corr_target = set(columns_to_drop_corr_target) + print("Number of features after correlation with target thresholding:", x_train.shape[1]-len(columns_to_drop_corr_target)) + print("Number of features removed by correlation with target thresholding:", len(columns_to_drop_corr_target)) + print("\n") + + # Remove the columns + x_train = x_train.drop(columns = list(columns_to_drop_corr_target)) + x_test = x_test.drop(columns = list(columns_to_drop_corr_target)) + + # Print the final features of the model + print("Final features of the model:") + print(list(x_train.columns)) + print("\n") + + # We evaluate the model after feature selection based on correlation with the target variable + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the test set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance after feature selection based on correlation with target (3-year deadline) with {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + # Print confusion matrix + print("Confusion matrix:") + tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() + print("TN:", tn) + print("FP:", fp) + print("FN:", fn) + print("TP:", tp) + print("\n") + + + ################################################################################################ + ######## SECOND SCENARIO : MODEL FOR PREDICTION OF SURVIVAL FOR METASTASIS PATIENTS ############ + ################################################################################################ + + print(" ------------- Model for prediction of survival for metastasis patients -------------") + + # We remove patients that are metastasis (have relapse different from 0) + data_metastasis = data_grouped[data_grouped['relapse'] == 1] + + # Split into features and target + y = data_metastasis[['DC', 'train_test']] + x = data_metastasis.drop(columns=['DC', 'delai_fin_DC', 'subject_id', 'site', 'relapse']) + + # In this case, because we are only interested in the prediction of survival, we extract only the 'DC' + y = y[['DC', 'train_test']] + # We replace all nan values by 0 in 'DC' + y.fillna(0, inplace=True) + + # Describe x and y + print('Feature data shape:', x.shape) + print('Target data shape:', y.shape) + print("Number of subjects which died:", y[y['DC']==1].shape[0]) + + # Split the data into training and testing sets based on column 'train_test' + x_train = x[x['train_test'] == 'train'].drop(columns=['train_test']) + x_test = x[x['train_test'] == 'test'].drop(columns=['train_test']) + y_train = y[y['train_test'] == 'train'].drop(columns=['train_test']) + y_test = y[y['train_test'] == 'test'].drop(columns=['train_test']) + # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1) + print("\nNumber of subject for training:", x_train.shape[0]) + print("Number of subject for testing:", x_test.shape[0]) + print("\n") + + # Plot the distribution of 'delai_fin_DC' + data_metastasis['delai_fin_DC'].hist() + plt.title('Distribution of survival time between end of treatment and\n death (in days) for those that died') + plt.xlabel('Survival time') + plt.ylabel('Number of subjects') + # plt.show() + + # Now we consider that every person that die after 3 year is considered as not dead (using the delai_fin_DC column) + y_deadline_3_year = data_metastasis[['delai_fin_DC', 'train_test']] + y_deadline_3_year.fillna(1096, inplace=True) + y_deadline_3_year['delai_fin_DC'] = y_deadline_3_year['delai_fin_DC'].apply(lambda x: 0 if x > 1095 else 1) + print("Number of subjects that died within 3 year:", y_deadline_3_year[y_deadline_3_year['delai_fin_DC'] == 1].shape[0]) + + # Split the data into training and testing sets + y_train = y_deadline_3_year[y_deadline_3_year['train_test'] == 'train'].drop(columns=['train_test']) + y_test = y_deadline_3_year[y_deadline_3_year['train_test'] == 'test'].drop(columns=['train_test']) + print("\nNumber of subjects that died within 3 year (train):", y_train[y_train['delai_fin_DC'] == 1].shape[0]) + print("Number of subjects that died within 3 year (test):", y_test[y_test['delai_fin_DC'] == 1].shape[0]) + + # Initialise the model + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the training set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"\nModel performance on the deadline of 3 year with {x.shape[1]} features") + print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score:",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + # We now perform feature selection + + # Now we move on to training the model with no deadline with fewer features based on what we explored in file `data_preprocessing/5_eliminating_radiomics_features.py` + # We keep the clinical features + clinical_features = ['sexe', 'age', 'BMI', 'score_charlson', 'OMS', 'tabac', 'tabac_PA', 'tabac_sevre', 'histo', 'T', 'centrale', 'dose_tot', 'etalement', + 'vol_GTV', 'vol_PTV', 'vol_ITV', 'couv_PTV', 'BED_10'] + # We keep some radiomics features: selected here: + radiomics_features = ['INTENSITY-BASED_MeanIntensity', 'INTENSITY-BASED_IntensitySkewness', 'INTENSITY-BASED_IntensityKurtosis', 'INTENSITY-BASED_10thIntensityPercentile', + 'INTENSITY-BASED_AreaUnderCurveCIVH', 'INTENSITY-BASED_RootMeanSquareIntensity', 'INTENSITY-HISTOGRAM_IntensityHistogramMean', + 'INTENSITY-HISTOGRAM_IntensityHistogramVariance', 'NGTDM_Complexity', 'NGTDM_Strength'] + # Join the two lists + x = data_metastasis[clinical_features + radiomics_features + ['train_test']] + y = data_metastasis[['train_test', 'delai_fin_DC']] + # We replace all nan values by 0 in 'DC' + y.fillna(0, inplace=True) + + # In column 'delai_fin_DC', we replace all nan values by 1096 (3 years) + y['delai_fin_DC'].fillna(1096, inplace=True) + y['delai_fin_DC'] = y['delai_fin_DC'].apply(lambda x: 0 if x > 1095 else 1) + + # Renane column 'delai_fin_DC' to 'DC' + y = y.rename(columns={'delai_fin_DC': 'DC'}) + + # Split the data into training and testing sets + x_train = x[x['train_test'] == 'train'].drop(columns=['train_test']) + x_test = x[x['train_test'] == 'test'].drop(columns=['train_test']) + y_train = y[y['train_test'] == 'train'].drop(columns=['train_test']) + y_train = y_train['DC'] + y_test = y[y['train_test'] == 'test'].drop(columns=['train_test']) + y_test = y_test['DC'] + print("\n --- Feature selection ---") + print("Initial number of features:", x_train.shape[1]) + print("Number of subject for training:", x_train.shape[0]) + print("Number of subject for testing:", x_test.shape[0]) + print("\n") + + # Initialise the model + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the training set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance with 3 year deadline with {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + #Now we perform variance thresholding + shape_ini = x_train.shape + selector = VarianceThreshold(threshold=0.3) + selector.fit(x_train) + # Extract the names of the columns to remove + columns_to_remove_var_thresh = x_train.columns[~selector.get_support()] + # We remove these columns from the data + x_train = x_train.drop(columns=columns_to_remove_var_thresh) + x_test = x_test.drop(columns=columns_to_remove_var_thresh) + print("Number of features after variance thresholding:", x_train.shape[1]) + print("Number of features removed by variance thresholding:", shape_ini[1]-x_train.shape[1]) + print("\n") + + # We evaluate the model after variance thresholding + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the test set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance after variance thresholding with 3 year deadline and {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + # We now perform feature selection based on the correlation with other features + corr_matrix = x_train.corr() + columns = corr_matrix.columns + columns_to_drop_corr_feat = [] + for i in range(len(columns)): + for j in range(i + 1, len(columns)): + if corr_matrix.loc[columns[i], columns[j]] > 0.80: + columns_to_drop_corr_feat.append(columns[j]) + columns_to_drop_corr_feat = set(columns_to_drop_corr_feat) + print("Number of features after correlation thresholding:", x_train.shape[1]-len(columns_to_drop_corr_feat)) + print("Number of features removed by correlation thresholding:", len(columns_to_drop_corr_feat)) + print("\n") + + # # Remove the columns + x_train = x_train.drop(columns = list(columns_to_drop_corr_feat)) + x_test = x_test.drop(columns = list(columns_to_drop_corr_feat)) + + # We evaluate the model after feature selection based on correlation + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the test set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance after feature selection based on correlation (3-year deadline) with {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + # We now perform feature selection based on the correlation with the target variable + corr_matrix = x_train.corrwith(y_train) + columns = corr_matrix.index + columns_to_drop_corr_target = [] + for i in range(len(columns)): + if abs(corr_matrix.iloc[i]) < 0.1: + columns_to_drop_corr_target.append(columns[i]) + columns_to_drop_corr_target = set(columns_to_drop_corr_target) + print("Number of features after correlation with target thresholding:", x_train.shape[1]-len(columns_to_drop_corr_target)) + print("Number of features removed by correlation with target thresholding:", len(columns_to_drop_corr_target)) + print("\n") + + # Remove the columns + x_train = x_train.drop(columns = list(columns_to_drop_corr_target)) + x_test = x_test.drop(columns = list(columns_to_drop_corr_target)) + + # Print the final features of the model + print("Final features of the model:") + print(list(x_train.columns)) + print("\n") + + # We evaluate the model after feature selection based on correlation with the target variable + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the test set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance after feature selection based on correlation with target (3-year deadline) with {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + # Print confusion matrix + print("Confusion matrix:") + tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() + print("TN:", tn) + print("FP:", fp) + print("FN:", fn) + print("TP:", tp) + print("\n") + + + ################################################################################################ + ######## THIRD SCENARIO : MODEL FOR PREDICTION OF SURVIVAL FOR ALL PATIENTS #################### + ################################################################################################ + + print(" ------------- Model for prediction of survival for all patients -------------") + + # We don't remove any patient + data_all = data_grouped + + # Split into features and target but keep 'relapse' as a feature + y = data_all[['DC', 'train_test']] + x = data_all.drop(columns=['DC', 'delai_fin_DC', 'subject_id', 'site']) + + # In this case, because we are only interested in the prediction of survival, we extract only the 'DC' + y = y[['DC', 'train_test']] + # We replace all nan values by 0 in 'DC' + y.fillna(0, inplace=True) + + # Describe x and y + print('Feature data shape:', x.shape) + print('Target data shape:', y.shape) + print("Number of subjects which died:", y[y['DC']==1].shape[0]) + + # Split the data into training and testing sets based on column 'train_test' + x_train = x[x['train_test'] == 'train'].drop(columns=['train_test']) + x_test = x[x['train_test'] == 'test'].drop(columns=['train_test']) + y_train = y[y['train_test'] == 'train'].drop(columns=['train_test']) + y_test = y[y['train_test'] == 'test'].drop(columns=['train_test']) + # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1) + print("\nNumber of subject for training:", x_train.shape[0]) + print("Number of subject for testing:", x_test.shape[0]) + print("\n") + + # Plot the distribution of 'delai_fin_DC' + data_all['delai_fin_DC'].hist() + plt.title('Distribution of survival time between end of treatment and\n death (in days) for those that died') + plt.xlabel('Survival time') + plt.ylabel('Number of subjects') + # plt.show() + + # Now we consider that every person that die after 3 year is considered as not dead (using the delai_fin_DC column) + y_deadline_3_year = data_all[['delai_fin_DC', 'train_test']] + y_deadline_3_year.fillna(1096, inplace=True) + y_deadline_3_year['delai_fin_DC'] = y_deadline_3_year['delai_fin_DC'].apply(lambda x: 0 if x > 1095 else 1) + print("Number of subjects that died within 3 year:", y_deadline_3_year[y_deadline_3_year['delai_fin_DC'] == 1].shape[0]) + + # Split the data into training and testing sets + y_train = y_deadline_3_year[y_deadline_3_year['train_test'] == 'train'].drop(columns=['train_test']) + y_test = y_deadline_3_year[y_deadline_3_year['train_test'] == 'test'].drop(columns=['train_test']) + print("\nNumber of subjects that died within 3 year (train):", y_train[y_train['delai_fin_DC'] == 1].shape[0]) + print("Number of subjects that died within 3 year (test):", y_test[y_test['delai_fin_DC'] == 1].shape[0]) + + # Initialise the model + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the training set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"\nModel performance on the deadline of 3 year with {x.shape[1]} features") + print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score:",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + # We now perform feature selection + + # Now we move on to training the model with no deadline with fewer features based on what we explored in file `data_preprocessing/5_eliminating_radiomics_features.py` + # We keep the clinical features + clinical_features = ['sexe', 'age', 'BMI', 'score_charlson', 'OMS', 'tabac', 'tabac_PA', 'tabac_sevre', 'histo', 'T', 'centrale', 'dose_tot', 'etalement', + 'vol_GTV', 'vol_PTV', 'vol_ITV', 'couv_PTV', 'BED_10', 'relapse'] + # We keep some radiomics features: selected here: + radiomics_features = ['INTENSITY-BASED_MeanIntensity', 'INTENSITY-BASED_IntensitySkewness', 'INTENSITY-BASED_IntensityKurtosis', 'INTENSITY-BASED_10thIntensityPercentile', + 'INTENSITY-BASED_AreaUnderCurveCIVH', 'INTENSITY-BASED_RootMeanSquareIntensity', 'INTENSITY-HISTOGRAM_IntensityHistogramMean', + 'INTENSITY-HISTOGRAM_IntensityHistogramVariance', 'NGTDM_Complexity', 'NGTDM_Strength'] + # Join the two lists + x = data_all[clinical_features + radiomics_features + ['train_test']] + y = data_all[['train_test', 'delai_fin_DC']] + # We replace all nan values by 0 in 'DC' + y.fillna(0, inplace=True) + + # In column 'delai_fin_DC', we replace all nan values by 1096 (3 years) + y['delai_fin_DC'].fillna(1096, inplace=True) + y['delai_fin_DC'] = y['delai_fin_DC'].apply(lambda x: 0 if x > 1095 else 1) + + # Renane column 'delai_fin_DC' to 'DC' + y = y.rename(columns={'delai_fin_DC': 'DC'}) + + # Split the data into training and testing sets + x_train = x[x['train_test'] == 'train'].drop(columns=['train_test']) + x_test = x[x['train_test'] == 'test'].drop(columns=['train_test']) + y_train = y[y['train_test'] == 'train'].drop(columns=['train_test']) + y_train = y_train['DC'] + y_test = y[y['train_test'] == 'test'].drop(columns=['train_test']) + y_test = y_test['DC'] + print("\n --- Feature selection ---") + print("Initial number of features:", x_train.shape[1]) + print("Number of subject for training:", x_train.shape[0]) + print("Number of subject for testing:", x_test.shape[0]) + print("\n") + + # Initialise the model + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the training set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance with 3 year deadline with {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + #Now we perform variance thresholding + shape_ini = x_train.shape + selector = VarianceThreshold(threshold=0.3) + selector.fit(x_train) + # Extract the names of the columns to remove + columns_to_remove_var_thresh = x_train.columns[~selector.get_support()] + # We remove these columns from the data + x_train = x_train.drop(columns=columns_to_remove_var_thresh) + x_test = x_test.drop(columns=columns_to_remove_var_thresh) + print("Number of features after variance thresholding:", x_train.shape[1]) + print("Number of features removed by variance thresholding:", shape_ini[1]-x_train.shape[1]) + print("\n") + + # We evaluate the model after variance thresholding + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the test set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance after variance thresholding with 3 year deadline and {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + # We now perform feature selection based on the correlation with other features + corr_matrix = x_train.corr() + columns = corr_matrix.columns + columns_to_drop_corr_feat = [] + for i in range(len(columns)): + for j in range(i + 1, len(columns)): + if corr_matrix.loc[columns[i], columns[j]] > 0.80: + columns_to_drop_corr_feat.append(columns[j]) + columns_to_drop_corr_feat = set(columns_to_drop_corr_feat) + print("Number of features after correlation thresholding:", x_train.shape[1]-len(columns_to_drop_corr_feat)) + print("Number of features removed by correlation thresholding:", len(columns_to_drop_corr_feat)) + print("\n") + + # # Remove the columns + x_train = x_train.drop(columns = list(columns_to_drop_corr_feat)) + x_test = x_test.drop(columns = list(columns_to_drop_corr_feat)) + + # We evaluate the model after feature selection based on correlation + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the test set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance after feature selection based on correlation (3-year deadline) with {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + print("\n") + + # We now perform feature selection based on the correlation with the target variable + corr_matrix = x_train.corrwith(y_train) + columns = corr_matrix.index + columns_to_drop_corr_target = [] + for i in range(len(columns)): + if abs(corr_matrix.iloc[i]) < 0.06: + columns_to_drop_corr_target.append(columns[i]) + columns_to_drop_corr_target = set(columns_to_drop_corr_target) + print("Number of features after correlation with target thresholding:", x_train.shape[1]-len(columns_to_drop_corr_target)) + print("Number of features removed by correlation with target thresholding:", len(columns_to_drop_corr_target)) + print("\n") + + # Remove the columns + x_train = x_train.drop(columns = list(columns_to_drop_corr_target)) + x_test = x_test.drop(columns = list(columns_to_drop_corr_target)) + + # Print the final features of the model + print("Final features of the model:") + print(list(x_train.columns)) + print("\n") + + # We evaluate the model after feature selection based on correlation with the target variable + model = XGBClassifier(seed=42) + model.fit(x_train, y_train) + + # Performance on the test set + y_test_pred = model.predict(x_test) + y_test_proba = model.predict_proba(x_test)[:, 1] + + # Compute ROC-AUC, accuracy score, Brier score and PR-AUC score + print(f"Model performance after feature selection based on correlation with target (3-year deadline) with {x_train.shape[1]} features") + print("ROC AUC Score: ", roc_auc_score(y_test, y_test_proba)) + print("Brier score:", brier_score_loss(y_test, y_test_proba)) + print("Average precision:", precision_score(y_test, y_test_pred)) + print("Average Recall:", recall_score(y_test, y_test_pred)) + print("Accuracy Score: ",accuracy_score(y_test, y_test_pred)) + precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred) + print("AUC-PR score:", auc(recall_test, precision_test)) + # Print confusion matrix + print("Confusion matrix:") + tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() + print("TN:", tn) + print("FP:", fp) + print("FN:", fn) + print("TP:", tp) + print("\n") + + return None + + +if __name__ == '__main__': + main() \ No newline at end of file