From abdc697b7ed655fe24401c35cb3c58c3961f865a Mon Sep 17 00:00:00 2001 From: Matthew Epland Date: Fri, 3 Mar 2023 12:15:01 -0500 Subject: [PATCH 1/6] Add different display types for leaf_distributions x-axis. (#254) * Added xaxis_display_type for ctree_leaf_distributions * added y_sorted option * remove commented out sort * Standardized x-axis title to Leaf IDs * fix for multiclass * added show_leaf_id_list * Added show_leaf_filter * updated doc string --- dtreeviz/trees.py | 69 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/dtreeviz/trees.py b/dtreeviz/trees.py index 1c2cddf..f86c590 100644 --- a/dtreeviz/trees.py +++ b/dtreeviz/trees.py @@ -1,6 +1,6 @@ import os import tempfile -from typing import Mapping, List +from typing import Mapping, List, Callable import matplotlib.patches as patches import matplotlib.pyplot as plt @@ -134,6 +134,9 @@ def leaf_sizes(self, def ctree_leaf_distributions(self, display_type: ("plot", "text") = "plot", + xaxis_display_type: str = "individual", + show_leaf_id_list: list = None, + show_leaf_filter: Callable[[np.ndarray], bool] = None, plot_ylim: int = None, colors: dict = None, fontsize: int = 10, @@ -156,6 +159,16 @@ def ctree_leaf_distributions(self, :param display_type: str, optional 'plot' or 'text' + :param xaxis_display_type: str, optional + 'individual': Displays every node ID individually + 'auto': Let matplotlib automatically manage the node ID ticks + 'y_sorted': Display in y order with no x-axis tick labels + :param show_leaf_id_list: list, optional + The allowed list of node id values to plot + :param show_leaf_filter: Callable[[np.ndarray], bool], optional + The filtering function to apply to leaf values before displaying the leaves. + The function is applied to a numpy array with the class i sample value in row i. + For example, to view only those leaves with more than 100 total samples, and more than 5 class 1 samples, use show_leaf_filter = lambda x: (100 < np.sum(x)) & (5 < x[1]) :param plot_ylim: int, optional The max value for oY. This is useful in case we have few leaves with big sample values which 'shadow' the other leaves values. @@ -181,26 +194,54 @@ def ctree_leaf_distributions(self, else: fig, ax = plt.subplots() - ax.set_xticks(range(0, len(index))) - ax.set_xticklabels(index) - if plot_ylim is not None: - ax.set_ylim(0, plot_ylim) - leaf_samples_hist = [[] for i in range(self.shadow_tree.nclasses())] for leaf_sample in leaf_samples: for i, leaf_count in enumerate(leaf_sample): leaf_samples_hist[i].append(leaf_count) + leaf_samples_hist = np.array(leaf_samples_hist) + + if show_leaf_id_list is not None: + _mask = np.isin(index, show_leaf_id_list) + leaf_samples_hist = leaf_samples_hist[:, _mask] + index = tuple(np.array(index)[_mask]) + if show_leaf_filter is not None: + _mask = np.apply_along_axis(show_leaf_filter, 0, leaf_samples_hist) + leaf_samples_hist = leaf_samples_hist[:, _mask] + index = tuple(np.array(index)[_mask]) + + if xaxis_display_type == 'individual': + x = np.arange(0, len(index)) + ax.set_xticks(x) + ax.set_xticklabels(index) + elif xaxis_display_type == 'auto': + x = np.array(index) + ax.set_xlim(np.min(x)-1, np.max(x)+1) + elif xaxis_display_type == 'y_sorted': + # sort by total y = sum(classes), then class 0, 1, 2, ... + sort_cols = [np.sum(leaf_samples_hist, axis=0)] + for i in range(leaf_samples_hist.shape[0]): + sort_cols.append(leaf_samples_hist[i]) + _sort = np.lexsort(sort_cols[::-1])[::-1] + leaf_samples_hist = leaf_samples_hist[:, _sort] + index = tuple(np.array(index)[_sort]) + + x = np.arange(0, len(index)) + ax.set_xticks(x) + ax.set_xticklabels([]) + ax.tick_params(axis='x', which='both', bottom=False) + else: + raise ValueError(f'Unknown xaxis_display_type = {xaxis_display_type}!') + + if plot_ylim is not None: + ax.set_ylim(0, plot_ylim) - bar_containers = [] - bottom_values = np.full(len(index), 0) - for i, leaf_sample in enumerate(leaf_samples_hist): - bar_container = ax.bar(range(0, len(index)), leaf_sample, bottom=bottom_values, + bottom_values = np.zeros(len(index)) + for i in range(leaf_samples_hist.shape[0]): + bar_container = ax.bar(x, leaf_samples_hist[i], bottom=bottom_values, color=colors_classes[i], lw=.3, align='center', width=1) - bottom_values = bottom_values + np.array(leaf_sample) - bar_containers.append(bar_container) + bottom_values = bottom_values + leaf_samples_hist[i] - for bar_container in bar_containers: for rect in bar_container.patches: rect.set_linewidth(.5) rect.set_edgecolor(colors['rect_edge']) @@ -884,7 +925,7 @@ def rtree_leaf_distributions(self, for i in range(len(means)): ax.plot(means[i], means_range[i], color=colors['split_line'], linewidth=prediction_line_width) - _format_axes(ax, self.shadow_tree.target_name, "Leaf", colors, fontsize=label_fontsize, fontname=fontname, ticks_fontsize=None, grid=grid) + _format_axes(ax, self.shadow_tree.target_name, "Leaf IDs", colors, fontsize=label_fontsize, fontname=fontname, ticks_fontsize=None, grid=grid) def ctree_feature_space(self, fontsize=10, From 1e2e952723432a612bc97f9b6f1917f1d8bb9ae1 Mon Sep 17 00:00:00 2001 From: Tudor Lapusan Date: Sat, 18 Mar 2023 20:07:18 +0200 Subject: [PATCH 2/6] 276-Include string columns in node stats (#277) --- dtreeviz/trees.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dtreeviz/trees.py b/dtreeviz/trees.py index f86c590..1f3835b 100644 --- a/dtreeviz/trees.py +++ b/dtreeviz/trees.py @@ -803,8 +803,8 @@ def node_stats(self, node_id: int) -> pd.DataFrame: """ node_samples = self.shadow_tree.get_node_samples() - df = pd.DataFrame(self.shadow_tree.X_train, columns=self.shadow_tree.feature_names) - return df.iloc[node_samples[node_id]].describe() + df = pd.DataFrame(self.shadow_tree.X_train, columns=self.shadow_tree.feature_names).convert_dtypes() + return df.iloc[node_samples[node_id]].describe(include='all') def instance_feature_importance(self, x, colors: dict = None, From 89255ded4b4448522447f5efe2255a127a59ebd6 Mon Sep 17 00:00:00 2001 From: Terence Parr Date: Tue, 7 Mar 2023 09:54:30 -0800 Subject: [PATCH 3/6] add tf link --- README.md | 2 +- notebooks/dtreeviz_tensorflow_visualisations.ipynb | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 95a5466..593920f 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ See [Installation instructions](README.md#Installation) then take a look at the * [sklearn-based examples](notebooks/dtreeviz_sklearn_visualisations.ipynb) ([colab](https://colab.research.google.com/github/parrt/dtreeviz/blob/master/notebooks/dtreeviz_sklearn_visualisations.ipynb)) * [LightGBM-based examples](notebooks/dtreeviz_lightgbm_visualisations.ipynb) ([colab](https://colab.research.google.com/github/parrt/dtreeviz/blob/master/notebooks/dtreeviz_lightgbm_visualisations.ipynb)) * [Spark-based examples](notebooks/dtreeviz_spark_visualisations.ipynb) ([colab](https://colab.research.google.com/github/parrt/dtreeviz/blob/master/notebooks/dtreeviz_spark_visualisations.ipynb)) -* [TensorFlow-based examples](notebooks/dtreeviz_tensorflow_visualisations.ipynb) ([colab](https://colab.research.google.com/github/parrt/dtreeviz/blob/master/notebooks/dtreeviz_tensorflow_visualisations.ipynb)) +* [TensorFlow-based examples](notebooks/dtreeviz_tensorflow_visualisations.ipynb) ([colab](https://colab.research.google.com/github/parrt/dtreeviz/blob/master/notebooks/dtreeviz_tensorflow_visualisations.ipynb)) Also see blog at tensorflow.org [Visualizing TensorFlow Decision Forest Trees with dtreeviz](https://www.tensorflow.org/decision_forests/tutorials/dtreeviz_colab) * [XGBoost-based examples](notebooks/dtreeviz_xgboost_visualisations.ipynb) ([colab](https://colab.research.google.com/github/parrt/dtreeviz/blob/master/notebooks/dtreeviz_xgboost_visualisations.ipynb)) * [Classifier decision boundaries for any scikit-learn model.ipynb](https://github.com/parrt/dtreeviz/tree/master/notebooks/classifier-decision-boundaries.ipynb) ([colab](https://colab.research.google.com/github/parrt/dtreeviz/blob/master/notebooks/classifier-decision-boundaries.ipynb)) * [Changing colors notebook](notebooks/colors.ipynb) ([colab](https://colab.research.google.com/github/parrt/dtreeviz/blob/master/notebooks/colors.ipynb)) diff --git a/notebooks/dtreeviz_tensorflow_visualisations.ipynb b/notebooks/dtreeviz_tensorflow_visualisations.ipynb index 81cd9c7..c6b7f18 100644 --- a/notebooks/dtreeviz_tensorflow_visualisations.ipynb +++ b/notebooks/dtreeviz_tensorflow_visualisations.ipynb @@ -9,6 +9,8 @@ "\n", "([View this notebook in Colab](https://colab.research.google.com/github/parrt/dtreeviz/blob/master/notebooks/dtreeviz_tensorflow_visualisations.ipynb))\n", "\n", + "See also the blog at tensorflow.org [Visualizing TensorFlow Decision Forest Trees with dtreeviz](https://www.tensorflow.org/decision_forests/tutorials/dtreeviz_colab)\n", + "\n", "The [dtreeviz](https://github.com/parrt/dtreeviz) library is designed to help machine learning practitioners visualize and interpret decision trees and decision-tree-based models, such as gradient boosting machines. \n", "\n", "The purpose of this notebook is to illustrate the main capabilities and functions of the dtreeviz API. To do that, we will use TensorFlow Decision Forests and the toy but well-known Titanic data set for illustrative purposes. Currently, dtreeviz supports the following decision tree libraries:\n", From 8c15cccc5ceb2cb497f3c30fe5a5320666974ebf Mon Sep 17 00:00:00 2001 From: "tudor.lapusan" Date: Sun, 2 Apr 2023 12:43:04 +0300 Subject: [PATCH 4/6] Support visualisations for nodes with samples from only one class. --- dtreeviz/trees.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/dtreeviz/trees.py b/dtreeviz/trees.py index 1f3835b..7f96ee4 100644 --- a/dtreeviz/trees.py +++ b/dtreeviz/trees.py @@ -2,6 +2,7 @@ import tempfile from typing import Mapping, List, Callable +import matplotlib import matplotlib.patches as patches import matplotlib.pyplot as plt import numpy as np @@ -1183,12 +1184,22 @@ def _class_split_viz(node: ShadowDecTreeNode, histtype=histtype, bins=bins, label=class_names) + # Alter appearance of each bar - for patch in barcontainers: - for rect in patch.patches: + if isinstance(barcontainers[0], matplotlib.container.BarContainer): + for patch in barcontainers: + for rect in patch.patches: + rect.set_linewidth(.5) + rect.set_edgecolor(colors['rect_edge']) + ax.set_yticks([0, max([max(h) for h in hist])]) + elif isinstance(barcontainers[0], matplotlib.patches.Rectangle): + # In case a node will contains samples from only one class. + for rect in barcontainers.patches: rect.set_linewidth(.5) rect.set_edgecolor(colors['rect_edge']) - ax.set_yticks([0, max([max(h) for h in hist])]) + ax.set_yticks([0, max(hist)]) + + # set an empty space at the beginning and the end of the node visualisation for better clarity bin_length = bins[1] - bins[0] From 113ec46873275b072df029129b24c291db04afef Mon Sep 17 00:00:00 2001 From: "tudor.lapusan" Date: Sat, 18 Mar 2023 17:30:51 +0200 Subject: [PATCH 5/6] Make sklearn visualisations to support validation datasets. --- dtreeviz/models/sklearn_decision_trees.py | 13 ++++++++++++- dtreeviz/trees.py | 4 ++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/dtreeviz/models/sklearn_decision_trees.py b/dtreeviz/models/sklearn_decision_trees.py index 1d42b25..caf55e1 100644 --- a/dtreeviz/models/sklearn_decision_trees.py +++ b/dtreeviz/models/sklearn_decision_trees.py @@ -94,8 +94,19 @@ def get_node_feature(self, id) -> int: return self.tree_model.tree_.feature[id] def get_node_nsamples_by_class(self, id): + # This is the code to return the nsamples/class from tree metadata. It's faster, but the visualisations cannot + # be made on new datasets. + # if self.is_classifier(): + # return self.tree_model.tree_.value[id][0] + + # This code allows us to return the nsamples/class based on a dataset, train or validation if self.is_classifier(): - return self.tree_model.tree_.value[id][0] + all_nodes = self.internal + self.leaves + node_value = [node.n_sample_classes() for node in all_nodes if node.id == id] + if self.get_class_weights() is None: + return node_value[0] + else: + return node_value[0] * self.get_class_weights() def get_prediction(self, id): if self.is_classifier(): diff --git a/dtreeviz/trees.py b/dtreeviz/trees.py index 7f96ee4..8c5cec9 100644 --- a/dtreeviz/trees.py +++ b/dtreeviz/trees.py @@ -1252,6 +1252,10 @@ def _class_leaf_viz(node: ShadowDecTreeNode, counts = node.class_counts() prediction = node.prediction_name() + # when using another dataset than the training dataset, some leaves could have 0 samples. + # Trying to make a pie chart will raise some deprecation + if sum(counts) == 0: + return if leaftype == 'pie': _draw_piechart(counts, size=size, colors=colors, filename=filename, label=f"n={nsamples}\n{prediction}", graph_colors=graph_colors, fontname=fontname) From d42b2735970cc8a3a3015e6fa1dbd1af9e6028cb Mon Sep 17 00:00:00 2001 From: Alex Moldovan Date: Fri, 7 Apr 2023 16:06:24 +0100 Subject: [PATCH 6/6] Mapped compatibility calls - There was an issue where the colors passed were 1.0. Signed-off-by: Alex Moldovan --- dtreeviz/compatibility.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dtreeviz/compatibility.py b/dtreeviz/compatibility.py index a2c7936..b24d4c5 100644 --- a/dtreeviz/compatibility.py +++ b/dtreeviz/compatibility.py @@ -255,7 +255,7 @@ def dtreeviz(tree_model, instance_orientation, show_root_edge_labels, show_node_labels, show_just_path, fancy, histtype, highlight_path, X, max_X_features_LR, max_X_features_TD, depth_range_to_display, label_fontsize, ticks_fontsize, - fontname, title, title_fontsize, colors, scale) + fontname, title, title_fontsize, colors=colors, scale=scale) def viz_leaf_samples(tree_model,