-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_shap.py
77 lines (67 loc) · 2.76 KB
/
run_shap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import shap
import autokeras as ak
import json
import numpy as np
from tensorflow.keras.models import load_model
import pandas as pd
import matplotlib.pyplot as plt
# Experiment grid walked by the nested loops further down in this script.
# Issue-tracker repositories to process (also used as keys into the
# best-model config JSON).
repo_list = ["apache", "jenkins", "jira", "spring", "talendforge"]
# Prediction targets; the same two names are the label columns dropped from
# the train/test frames.
task_list = ["productivity", "quality_impact"]
# Feature subsets: every column, manual-only ("no_auto"), auto-only
# ("no_manual").
feature_list = ["all", "no_auto", "no_manual"]
# Compute (or load from cache) Kernel SHAP values for the best saved model of
# every repo/task/feature combination and print the resulting array shape.
#
# Per combination the script:
#   1. looks up the best model name for the repo in the task's config JSON,
#   2. loads that model plus its train/test CSVs (label columns dropped),
#   3. summarises the training matrix with 10 k-means centroids as the
#      KernelExplainer background set,
#   4. loads cached SHAP values if possible, otherwise computes and saves them.
shap.initjs()

# Column-name prefixes that mark automatically extracted features; every
# other column is treated as a manual feature.
auto_prefixes = ("i_text_", "d_rnn_feats_")

for repo in repo_list:
    for task in task_list:
        for feature in feature_list:
            # Load the best model. The config file is opened in a context
            # manager so the handle is closed on every iteration.
            with open("<best_model_config_json>".format(task)) as config_file:
                best_model_name = json.load(config_file)[repo]
            best_model = load_model(
                "<best_model_path>".format(repo, task, best_model_name),
                custom_objects=ak.CUSTOM_OBJECTS,
            )

            # Model name with its last "_"-separated token removed; used to
            # locate the matching train/test CSVs.
            data_tag = "_".join(best_model_name.split("_")[:-1])

            train_df = pd.read_csv("<train_data>".format(task, repo, data_tag))
            train_df = train_df.drop(['productivity', 'quality_impact'], axis=1)
            print(train_df.shape)
            cols = train_df.columns.tolist()

            # Get lists of different types of features. A single pass with
            # enumerate avoids a linear cols.index() lookup per column.
            is_auto = [c.startswith(auto_prefixes) for c in cols]
            manual_index = [i for i, auto in enumerate(is_auto) if not auto]
            auto_index = [i for i, auto in enumerate(is_auto) if auto]
            manual_cols = [cols[i] for i in manual_index]
            auto_cols = [cols[i] for i in auto_index]

            # Select the features to use.
            # NOTE(review): current_cols / current_index are not consumed by
            # anything visible in this script, and the SHAP cache path below
            # is formatted with repo and task only, so all three feature
            # settings currently share one SHAP array. Confirm the intent.
            if feature == "no_auto":
                current_cols = manual_cols
                current_index = manual_index
            elif feature == "no_manual":
                current_cols = auto_cols
                current_index = auto_index
            else:
                current_cols = cols
                current_index = list(range(len(cols)))
            # print(current_cols)

            X_train = train_df.values
            # Create a summary of the training data (10 k-means centroids)
            # to keep the KernelExplainer background set small.
            X_train_summary = shap.kmeans(X_train, 10)

            # Load the test data
            test_df = pd.read_csv("<test_data>".format(task, repo, data_tag))
            test_df = test_df.drop(['productivity', 'quality_impact'], axis=1)
            print(test_df.shape)
            X_test = test_df.values

            shap_path = "<shap_values>".format(repo, task)
            try:
                # NOTE(review): allow_pickle=True will unpickle whatever is
                # in the file; only safe while the cache is written by this
                # script alone.
                shap_values = np.load(shap_path, allow_pickle=True)
            except Exception:
                # Cache missing or unreadable: fall back to computing. The
                # handler is `Exception` rather than a bare `except` so that
                # Ctrl-C / SystemExit stop the script instead of kicking off
                # a long SHAP run.
                # Create the SHAP values
                kernel_explainer = shap.KernelExplainer(best_model.predict, X_train_summary)
                # [0] keeps the first element of the returned value --
                # presumably the attribution array for the model's first
                # output; TODO confirm against the installed shap version.
                shap_values = kernel_explainer.shap_values(X_test)[0]
                # Save the SHAP values.
                # NOTE(review): np.save appends ".npy" when the path lacks
                # that suffix while np.load does not -- confirm the real
                # path ends in ".npy", otherwise the cache never hits.
                np.save(shap_path, shap_values)
            print("shap_values shape: {}".format(shap_values.shape))