Merge pull request #91 from tlapusan/support_other_tree_models_#83
Support other tree models. Fixes #83
parrt authored Jul 11, 2020
2 parents b38b3cc + e40c03e commit 5a122cf
Showing 40 changed files with 122,696 additions and 95,443 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -102,3 +102,4 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+.idea/
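
Note: the substantive change below, in dtreeviz/interpretation.py, replaces direct access to sklearn's tree_model.tree_ internals with a library-agnostic ShadowDecTree wrapper, which is what lets the interpretation code support other tree models. Judging only from the calls made in this diff, the wrapper's interface presumably covers at least the following. This is a hypothetical sketch; the real definition lives in dtreeviz/models/shadow_decision_tree.py, which is not shown on this page.

    from abc import ABC, abstractmethod
    from typing import List, Tuple

    import numpy as np


    class ShadowDecTree(ABC):
        """Library-agnostic adapter around a fitted decision tree (sketch only)."""

        feature_names: List[str]  # column names, used to label split ranges

        @abstractmethod
        def get_features(self) -> np.ndarray:
            """Feature index used for the split at each node id."""

        @abstractmethod
        def get_thresholds(self) -> np.ndarray:
            """Split threshold value at each node id."""

        @abstractmethod
        def predict(self, x) -> Tuple[object, List]:
            """Return (prediction, decision-path nodes); each node exposes .id."""

        @abstractmethod
        def get_feature_path_importance(self, node_list) -> np.ndarray:
            """Impurity-based importance restricted to the nodes of one path."""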
109 changes: 47 additions & 62 deletions dtreeviz/interpretation.py
@@ -2,73 +2,77 @@
 Prediction path interpretation for decision tree models.
 At the moment it contains a "plain English" implementation, but others can be added in the future.
 """
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas
-from sklearn import tree
-import matplotlib.pyplot as plt
 
 from dtreeviz.colors import adjust_colors
+from dtreeviz.models.shadow_decision_tree import ShadowDecTree
 
 
-def explain_prediction_plain_english(tree_model: (tree.DecisionTreeClassifier, tree.DecisionTreeRegressor),
-                                     X: (pandas.core.series.Series, np.ndarray),
-                                     feature_names):
+def explain_prediction_plain_english(shadow_tree: ShadowDecTree,
+                                     x: (pandas.core.series.Series, np.ndarray)):
     """
     Explains the prediction path using the feature value ranges.
     A possible output of this method could be:
-        1.5 <= Pclass(3.0)
-        3.5 <= Age(29.7) < 44.5
-        7.91 <= Fare(8.05) < 54.25
-        0.5 <= Sex_label(1.0)
-        Cabin_label(-1.0) < 3.5
-        0.5 <= Embarked_label(2.0)
+        1.5 <= Pclass
+        3.5 <= Age < 44.5
+        7.91 <= Fare < 54.25
+        0.5 <= Sex_label
+        Cabin_label < 3.5
+        0.5 <= Embarked_label
     Output explanation:
     The model chose to make this prediction because the instance's Pclass feature value is greater than or
     equal to 1.5, Age is between 3.5 and 44.5, Fare is between 7.91 and 54.25, and so on.
-    :param tree_model: tree used to make the prediction
-    :param X: instance example to make the prediction for
-    :param feature_names: feature name list
+    :param shadow_tree: tree used to make the prediction
+    :param x: instance example to make the prediction for
    :return: str
        Prediction path explanation in plain English.
    """
 
-    node_feature_index = tree_model.tree_.feature
-    node_threshold = tree_model.tree_.threshold
+    node_feature_index = shadow_tree.get_features()
+    feature_names = shadow_tree.feature_names
+    node_threshold = shadow_tree.get_thresholds()
+    prediction_value, decision_node_path = shadow_tree.predict(x)
 
-    node_indicator = tree_model.decision_path([X])
-    decision_node_path = node_indicator.indices[node_indicator.indptr[0]:
-                                                node_indicator.indptr[1]]
-    feature_min_range = {}
-    feature_max_range = {}
-    for i, node_id in enumerate(decision_node_path):
+    feature_smaller_values = {}
+    feature_bigger_values = {}
+    for i, node in enumerate(decision_node_path):
         if i == len(decision_node_path) - 1:
             break  # stop at the leaf node
+        node_id = node.id
 
         feature_name = feature_names[node_feature_index[node_id]]
-        feature_value = X[node_feature_index[node_id]]
+        feature_value = x[node_feature_index[node_id]]
        feature_split_value = round(node_threshold[node_id], 2)
 
-        if feature_min_range.get(feature_name, feature_value) >= feature_split_value:
-            feature_min_range[feature_name] = feature_split_value
-        elif feature_max_range.get(feature_name, feature_value) < feature_split_value:
-            feature_max_range[feature_name] = feature_split_value
+        if feature_split_value <= feature_value:
+            if feature_smaller_values.get(feature_name) is None:
+                feature_smaller_values[feature_name] = []
+            feature_smaller_values[feature_name].append(feature_split_value)
+        elif feature_split_value > feature_value:
+            if feature_bigger_values.get(feature_name) is None:
+                feature_bigger_values[feature_name] = []
+            feature_bigger_values[feature_name].append(feature_split_value)
 
     for feature_name in feature_names:
         feature_range = ""
-        if feature_name in feature_min_range:
-            feature_range = f"{feature_min_range[feature_name]} <= {feature_name}"
-        if feature_name in feature_max_range:
+        if feature_name in feature_smaller_values:
+            feature_range = f"{max(feature_smaller_values[feature_name])} <= {feature_name}"
+        if feature_name in feature_bigger_values:
            if feature_range == "":
-                feature_range = f"{feature_name} < {feature_max_range[feature_name]}"
+                feature_range = f"{feature_name} < {min(feature_bigger_values[feature_name])}"
            else:
-                feature_range += f" < {feature_max_range[feature_name]}"
+                feature_range += f" < {min(feature_bigger_values[feature_name])}"
 
        if feature_range != "":
            print(feature_range)
 
 
-def explain_prediction_sklearn_default(tree_model, X, features,
+def explain_prediction_sklearn_default(shadow_tree: ShadowDecTree,
+                                       x: (pandas.core.series.Series, np.ndarray),
                                        figsize: tuple = (10, 5),
                                        colors: dict = None,
                                        fontsize: int = 14,
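
Note: the bookkeeping above works because every split the instance passes on the greater-or-equal side is a candidate lower bound for that feature, and every split it passes on the less-than side is a candidate upper bound; the tightest range is therefore max(lower bounds) <= feature < min(upper bounds). A minimal standalone sketch of that logic, with made-up path values:

    # Toy reproduction of the range merging in explain_prediction_plain_english.
    # Each tuple: (feature name, split threshold, instance value) at one decision node.
    path = [("Age", 3.5, 29.7), ("Age", 44.5, 29.7),
            ("Fare", 7.91, 8.05), ("Fare", 54.25, 8.05)]

    smaller, bigger = {}, {}  # candidate lower / upper bounds per feature
    for name, split, value in path:
        if split <= value:
            smaller.setdefault(name, []).append(split)  # instance went right: lower bound
        else:
            bigger.setdefault(name, []).append(split)   # instance went left: upper bound

    for name in ("Age", "Fare"):
        lo = f"{max(smaller[name])} <= " if name in smaller else ""
        hi = f" < {min(bigger[name])}" if name in bigger else ""
        print(f"{lo}{name}{hi}")  # prints "3.5 <= Age < 44.5" and "7.91 <= Fare < 54.25"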
@@ -81,10 +85,8 @@ def explain_prediction_sklearn_default(tree_model, X, features,
     their number of categories.
     For more details, you can read this article: https://explained.ai/rf-importance/index.html
-    :param tree_model: tree used to make the prediction
-    :param X: instance example to make the prediction for
-    :param features: list
-        Feature name list
+    :param shadow_tree: tree used to make the prediction
+    :param x: instance example to make the prediction for
    :param figsize: tuple of int, optional
        The plot size
    :param colors: dict, optional
@@ -93,15 +95,18 @@ def explain_prediction_sklearn_default(tree_model, X, features,
        Plot labels fontsize
    :param fontname: str, optional
        Plot labels font name
+    :param grid: bool
+        True if we want to display the grid lines on the visualization
    :return:
        Prediction feature importance plot
    """
 
-    node_indicator = tree_model.decision_path([X])
-    decision_node_path = node_indicator.indices[node_indicator.indptr[0]:
-                                                node_indicator.indptr[1]]
-    feature_path_importance = _get_feature_path_importance_sklearn(tree_model, decision_node_path)
-    return _get_feature_path_importance_sklearn_plot(features, feature_path_importance, figsize, colors, fontsize,
+    prediction_value, decision_node_path = shadow_tree.predict(x)
+    decision_node_path = [node.id for node in decision_node_path]
+
+    feature_path_importance = shadow_tree.get_feature_path_importance(decision_node_path)
+    return _get_feature_path_importance_sklearn_plot(shadow_tree.feature_names, feature_path_importance, figsize,
+                                                     colors, fontsize,
                                                      fontname,
                                                      grid)
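
Note: the removed lines above used sklearn's decision_path directly; it returns a CSR indicator matrix, and slicing indices with indptr is the standard way to pull one sample's node ids out of it. That detail is exactly what shadow_tree.predict(x) now hides behind the wrapper. A runnable sklearn-only sketch of the old extraction, on a toy model:

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    iris = load_iris()
    tree_model = DecisionTreeClassifier(max_depth=3).fit(iris.data, iris.target)

    X = iris.data[0]                                # a single instance
    node_indicator = tree_model.decision_path([X])  # CSR matrix of shape (1, n_nodes)
    decision_node_path = node_indicator.indices[node_indicator.indptr[0]:
                                                node_indicator.indptr[1]]
    print(decision_node_path)                       # node ids from root to leaf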

@@ -130,26 +135,6 @@ def _get_feature_path_importance_sklearn_plot(features, feature_path_importance,
    return ax
 
 
-def _get_feature_path_importance_sklearn(tree_model, node_list):
-    gini_importance = np.zeros(tree_model.tree_.n_features)
-    for node in node_list:
-        if tree_model.tree_.children_left[node] != -1:
-            node_left = tree_model.tree_.children_left[node]
-            node_right = tree_model.tree_.children_right[node]
-
-            gini_importance[tree_model.tree_.feature[node]] += tree_model.tree_.weighted_n_node_samples[node] * \
-                                                               tree_model.tree_.impurity[node] \
-                                                               - tree_model.tree_.weighted_n_node_samples[node_left] * \
-                                                               tree_model.tree_.impurity[node_left] \
-                                                               - tree_model.tree_.weighted_n_node_samples[node_right] * \
-                                                               tree_model.tree_.impurity[node_right]
-    normalizer = np.sum(gini_importance)
-    if normalizer > 0.0:
-        gini_importance /= normalizer
-
-    return gini_importance
-
-
 def get_prediction_explainer(explanation_type: str):
    """Factory method responsible to return a prediction path implementation based on argument 'explanation_type'
Empty file added dtreeviz/models/__init__.py
(The remaining 37 changed files are not shown here.)
