Merge pull request #3 from systemallica/dev
faster computation times + printable models + publish package
systemallica authored May 15, 2020
2 parents c3e2855 + 3f8d352 commit d34695f
Showing 8 changed files with 145 additions and 64 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -8,12 +8,24 @@ of classification tasks and regression tasks, simultaneously. Morfist's mixed mu
* [Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32](https://link.springer.com/article/10.1023%2FA%3A1010933404324).
* [Linusson, H. (2013). Multi-output random forests](https://pdfs.semanticscholar.org/4219/f87ed41c558d43cf78f63976cf87bcd7ebb0.pdf).

## Installation

With pip:
```
pip install decision-tree-morfist
```
With conda:
```
conda install -c systemallica decision-tree-morfist
```
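
To verify the install, the package's main class should be importable (a quick sanity check, not part of the README diff):
```
python -c "from morfist import MixedRandomForest"
```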
## Usage

### Initialising the model

- Similarly to a scikit-learn [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html), a MixedRandomForest can be initialised in this way:
```
from morfist import MixedRandomForest
mrf = MixedRandomForest(
n_estimators=n_trees,
min_samples_leaf=1,
...
```
163 changes: 102 additions & 61 deletions morfist/core.py
@@ -1,7 +1,58 @@
import numpy as np
import scipy.stats
import copy
from fast_histogram import histogram1d
from numba import njit


# Calculate the impurity value for the classification task
@njit
def impurity_classification(y_classification):
# Cast to integer
y_class = y_classification.astype(np.int16)

# Calculate frequencies
frequency = np.bincount(y_class) / y_class.size

result = 0
for i in range(frequency.size):
if frequency[i]:
result += frequency[i] * np.log2(frequency[i])

return 0 - result
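
As a quick sanity check (illustrative only, not part of this diff), this Shannon-entropy helper should return exactly 1 bit for a balanced binary label vector:
```
import numpy as np

y = np.array([0.0, 0.0, 1.0, 1.0])
# class frequencies are [0.5, 0.5], so -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0
print(impurity_classification(y))  # 1.0
```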


# Calculate the impurity value for the regression task
@njit
def impurity_regression(y, y_regression):
if np.unique(y_regression).size < 2:
return 0

n_bins = 100
bin_width = (y.max() - y.min()) / n_bins

frequency = np.histogram(y_regression, n_bins)[0]
frequency_float = frequency.astype(np.float64)
frequency_float = (frequency_float / len(y)) / bin_width

probability = (frequency_float + 1) / (frequency_float.sum() + n_bins)

return 0 - bin_width * (probability * np.log2(probability)).sum()
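
This estimates the differential entropy of the target over 100 fixed-width histogram bins; the `+ 1` and `+ n_bins` terms are additive smoothing, so empty bins never produce `log2(0)`. As an illustrative check (not part of this diff), widely spread targets should score higher than tightly concentrated ones:
```
import numpy as np

rng = np.random.default_rng(0)
spread = rng.uniform(0.0, 1.0, 1000)   # widely spread targets
tight = rng.normal(0.5, 0.01, 1000)    # tightly concentrated targets
print(impurity_regression(spread, spread) > impurity_regression(tight, tight))  # True
```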


@njit
def unique(x, feature):
return np.unique(x[:, feature])


@njit
def get_gain(imp_n_left, imp_n_right, imp_n, imp_root, n_left, n_right, n_parent):
impurity_left = imp_n_left / imp_root
impurity_right = imp_n_right / imp_root
impurity_parent = imp_n / imp_root

gain_left = (n_left / n_parent) * (impurity_parent - impurity_left)
gain_right = (n_right / n_parent) * (impurity_parent - impurity_right)
return gain_left + gain_right
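
A small worked example (illustrative only, not part of this diff; scalars stand in for the per-target impurity arrays the forest actually passes):
```
# weighted impurity decrease, normalised by the root impurity (1.0 here):
# (60/100) * (1.0 - 0.5) + (40/100) * (1.0 - 0.9) = 0.30 + 0.04 = 0.34
print(get_gain(0.5, 0.9, 1.0, 1.0, 60, 40, 100))  # 0.34
```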


# Class in charge of finding the best split at every given moment
@@ -64,92 +115,70 @@ def __find_best_split(self, x, y):
# Try each of the selected features and find which of them gives the best split (highest impurity gain)
for feature in try_features:
# Get the unique possible values for this particular feature
values = np.unique(x[:, feature])
values = unique(x, feature)

# We ensure that there are at least 2 different values
if values.size < 2:
continue

# Candidate thresholds are the midpoints between consecutive unique values,
# which reduces the size by one element and avoids splitting exactly on an
# observed value (e.g. on 0 for regression): values[i] -> (values[i] + values[i+1]) / 2
values = (values[:-1] + values[1:]) / 2

# Choose a random subset of the candidate values (at most 2 of them)
values = np.random.choice(values, min(2, values.size))

# Try to split with this specific combination of feature and values
# Here lies the computational burden, as we try every possible split
# TODO incrementally compute impurity
for value in values:
impurity = self.__try_split(x, y, feature, value)

left_idx = x[:, feature] <= value
right_idx = x[:, feature] > value

impurity = self.__impurity_split(y, y[left_idx, :], y[right_idx, :])
# If it's better than the previous saved one, save the values
if impurity > best_impurity:
best_feature, best_value, best_impurity = feature, value, impurity

return best_feature, best_value, best_impurity

# Try a specific split
# Parameters
# x: x data
# y: y data
# f: feature
# t: value
def __try_split(self, x, y, feature, value):
left_idx = x[:, feature] <= value
right_idx = x[:, feature] > value

return self.__impurity_split(y, y[left_idx, :], y[right_idx, :])

# Calculate the impurity of a node
def __impurity_node(self, y):
# Calculate the impurity value for the classification task
def impurity_classification(y_classification):
# FIXME: this is one of the bottlenecks
y_classification = y_classification.astype(int)
frequency = np.bincount(y_classification) / y_classification.size
frequency = frequency[frequency != 0]
return 0 - np.array([f * np.log2(f) for f in frequency]).sum()

# Calculate the impurity value for the regression task
def impurity_regression(y_regression):
if np.unique(y_regression).size < 2:
return 0

n_bins = 100
histogram = histogram1d(y, bins=n_bins, range=(y.min(), y.max()))
frequency = histogram / len(y)
probability = (frequency + 1) / (frequency.sum() + n_bins)
bin_width = (y_regression.max() - y_regression.min()) / n_bins

return 0 - bin_width * (probability * np.log2(probability)).sum()

delta = 0.0001
impurity = np.zeros(self.n_targets)
# Calculate the impurity value for each of the targets (classification or regression)
for i in range(self.n_targets):
if i in self.classification_targets:
impurity[i] = impurity_classification(y[:, i]) + delta
else:
impurity[i] = impurity_regression(y[:, i]) + delta
return impurity

# Calculate the impurity of a split
def __impurity_split(self, y, y_left, y_right):
n_parent = y.shape[0]
def __impurity_split(self, y_parent, y_left, y_right):
n_left = y_left.shape[0]
n_right = y_right.shape[0]

if n_left < self.min_samples_leaf or n_right < self.min_samples_leaf:
return np.inf
else:
impurity_left = self.__impurity_node(y_left) / self.root_impurity
impurity_right = self.__impurity_node(y_right) / self.root_impurity
impurity_parent = self.__impurity_node(y) / self.root_impurity
n_parent = y_parent.shape[0]

gain_left = (n_left / n_parent) * (impurity_parent - impurity_left)
gain_right = (n_right / n_parent) * (impurity_parent - impurity_right)
gain = gain_left + gain_right
gain = get_gain(self.__impurity_node(y_left),
self.__impurity_node(y_right),
self.__impurity_node(y_parent),
self.root_impurity,
n_left,
n_right,
n_parent)

if self.choose_split == 'mean':
return gain.mean()
else:
return gain.max()

# Calculate the impurity of a node
def __impurity_node(self, y):
delta = 0.0001
impurity = np.zeros(self.n_targets)
# Calculate the impurity value for each of the targets (classification or regression)
for i in range(self.n_targets):
if i in self.classification_targets:
impurity[i] = impurity_classification(y[:, i]) + delta
else:
impurity[i] = impurity_regression(y, y[:, i]) + delta
return impurity


# Build a Random Tree
# Parameters:
@@ -211,16 +240,15 @@ def fit(self, x, y):
if feature:
left_children.append(i + len(split_queue) + 1)
right_children.append(i + len(split_queue) + 2)
else:
left_children.append(None)
right_children.append(None)

if feature:
l_idx = next_x[:, feature] <= value
r_idx = next_x[:, feature] > value

split_queue.append((next_x[l_idx, :], next_y[l_idx, :]))
split_queue.append((next_x[r_idx, :], next_y[r_idx, :]))
else:
left_children.append(None)
right_children.append(None)

i += 1

@@ -280,6 +308,19 @@ def print_level(level, i):
# choose_split: method used to find the best split
# classification_targets: features that are part of the classification task
class MixedRandomForest:
def __str__(self):
params = "("
i = 0
for key in self.__dict__:
if i == 0:
params += str(key) + "=" + str(self.__dict__[key]) + ", \n"
elif i == len(self.__dict__) - 1:
params += "\t\t\t" + str(key) + "=" + str(self.__dict__[key]) + ")"
else:
params += "\t\t\t" + str(key) + "=" + str(self.__dict__[key]) + ", \n"
i += 1
return self.__class__.__name__ + params
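
This `__str__` is the "printable models" part of the commit message: printing a forest lists its parameters. Illustrative usage (not part of this diff):
```
from morfist import MixedRandomForest

mrf = MixedRandomForest(n_estimators=10)
print(mrf)  # MixedRandomForest(n_estimators=10, ..., one parameter per line)
```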

def __init__(self,
n_estimators=10,
max_features='sqrt',
28 changes: 28 additions & 0 deletions setup.py
@@ -0,0 +1,28 @@
import pathlib
from setuptools import setup

# The directory containing this file
HERE = pathlib.Path(__file__).parent

# The text of the README file
README = (HERE / "README.md").read_text()

# This call to setup() does all the work
setup(
name="decision-tree-morfist",
version="0.1.1",
description="Multi-target Random Forest implementation that can mix both classification and regression tasks.",
long_description=README,
long_description_content_type="text/markdown",
url="https://github.com/systemallica/morfist",
author="Andrés Reverón Molina",
author_email="andres@reveronmolina.me",
license="MIT",
classifiers=[
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
],
packages=["morfist"],
install_requires=["numpy", "numba", "scipy"],
)
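
The commit message also mentions publishing the package. One common build-and-upload flow for a `setup.py` project like this (illustrative; the exact commands used are not shown in this commit):
```
python -m pip install build twine
python -m build                  # builds the sdist and wheel into dist/
python -m twine upload dist/*    # uploads to PyPI
```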
Binary file removed test.cprof
6 changes: 3 additions & 3 deletions test/test.py
@@ -67,7 +67,7 @@ def test_reg():
t1_start = perf_counter()
reg_morfist = MixedRandomForest(
n_estimators=n_trees,
min_samples_leaf=5
min_samples_leaf=1
)

# Calculate morfist scores using cross-validation
@@ -110,7 +110,7 @@ def test_mix_1():

mix_rf = MixedRandomForest(
n_estimators=n_trees,
min_samples_leaf=5,
min_samples_leaf=1,
classification_targets=[1]
)

@@ -136,7 +136,7 @@ def test_mix_2():

mix_rf = MixedRandomForest(
n_estimators=n_trees,
min_samples_leaf=5,
min_samples_leaf=1,
classification_targets=[0]
)

Binary file removed test_new.cprof
Binary file removed time_profile.png
Binary file removed time_profile_new.png
