Merge pull request #3 from systemallica/dev
faster computation times + printable models + publish package
systemallica authored May 15, 2020
2 parents c3e2855 + 3f8d352 commit d34695f
Showing 8 changed files with 145 additions and 64 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -8,12 +8,24 @@ of classification tasks and regression tasks, simultaneously. Morfist's mixed mu
* [Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32](https://link.springer.com/article/10.1023%2FA%3A1010933404324).
* [Linusson, H. (2013). Multi-output random forests](https://pdfs.semanticscholar.org/4219/f87ed41c558d43cf78f63976cf87bcd7ebb0.pdf).

## Installation

With pip:
```
pip install decision-tree-morfist
```
With conda:
```
conda install -c systemallica decision-tree-morfist
```
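
To verify the install, the package's main class should be importable (a quick sanity check, not part of the README diff):
```
python -c "from morfist import MixedRandomForest"
```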
## Usage

### Initialising the model

- Similarly to a scikit-learn [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html), a MixedRandomForest can be initialised in this way:
```
from morfist import MixedRandomForest
mrf = MixedRandomForest(
n_estimators=n_trees,
min_samples_leaf=1,
...
```
163 changes: 102 additions & 61 deletions morfist/core.py
@@ -1,7 +1,58 @@
import numpy as np
import scipy.stats
import copy
from fast_histogram import histogram1d
from numba import njit


# Calculate the impurity value for the classification task
@njit
def impurity_classification(y_classification):
# Cast to integer
y_class = y_classification.astype(np.int16)

# Calculate frequencies
frequency = np.bincount(y_class) / y_class.size

result = 0
for i in range(frequency.size):
if frequency[i]:
result += frequency[i] * np.log2(frequency[i])

return 0 - result
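
As a quick sanity check (illustrative only, not part of this diff), this Shannon-entropy helper should return exactly 1 bit for a balanced binary label vector:
```
import numpy as np

y = np.array([0.0, 0.0, 1.0, 1.0])
# class frequencies are [0.5, 0.5], so -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0
print(impurity_classification(y))  # 1.0
```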


# Calculate the impurity value for the regression task
@njit
def impurity_regression(y, y_regression):
if np.unique(y_regression).size < 2:
return 0

n_bins = 100
bin_width = (y.max() - y.min()) / n_bins

frequency = np.histogram(y_regression, n_bins)[0]
frequency_float = frequency.astype(np.float64)
frequency_float = (frequency_float / len(y)) / bin_width

probability = (frequency_float + 1) / (frequency_float.sum() + n_bins)

return 0 - bin_width * (probability * np.log2(probability)).sum()
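
This estimates the differential entropy of the target over 100 fixed-width histogram bins; the `+ 1` and `+ n_bins` terms are additive smoothing, so empty bins never produce `log2(0)`. As an illustrative check (not part of this diff), widely spread targets should score higher than tightly concentrated ones:
```
import numpy as np

rng = np.random.default_rng(0)
spread = rng.uniform(0.0, 1.0, 1000)   # widely spread targets
tight = rng.normal(0.5, 0.01, 1000)    # tightly concentrated targets
print(impurity_regression(spread, spread) > impurity_regression(tight, tight))  # True
```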


@njit
def unique(x, feature):
return np.unique(x[:, feature])


@njit
def get_gain(imp_n_left, imp_n_right, imp_n, imp_root, n_left, n_right, n_parent):
impurity_left = imp_n_left / imp_root
impurity_right = imp_n_right / imp_root
impurity_parent = imp_n / imp_root

gain_left = (n_left / n_parent) * (impurity_parent - impurity_left)
gain_right = (n_right / n_parent) * (impurity_parent - impurity_right)
return gain_left + gain_right
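
A small worked example (illustrative only, not part of this diff; scalars stand in for the per-target impurity arrays the forest actually passes):
```
# weighted impurity decrease, normalised by the root impurity (1.0 here):
# (60/100) * (1.0 - 0.5) + (40/100) * (1.0 - 0.9) = 0.30 + 0.04 = 0.34
print(get_gain(0.5, 0.9, 1.0, 1.0, 60, 40, 100))  # 0.34
```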


# Class in charge of finding the best split at every given moment
@@ -64,92 +115,70 @@ def __find_best_split(self, x, y):
# Try each of the selected features and find which of them gives the best split (highest impurity gain)
for feature in try_features:
# Get the unique possible values for this particular feature
values = np.unique(x[:, feature])
values = unique(x, feature)

# We ensure that there are at least 2 different values
if values.size < 2:
continue

# Candidate thresholds are the midpoints between consecutive unique values,
# which reduces the size by one element and avoids splitting exactly on an
# observed value (e.g. on 0 for regression): values[i] -> (values[i] + values[i+1]) / 2
values = (values[:-1] + values[1:]) / 2

# Choose a random subset of the candidate values (at most 2 of them)
values = np.random.choice(values, min(2, values.size))

# Try to split with this specific combination of feature and values
# Here lies the computational burden, as we try every possible split
# TODO incrementally compute impurity
for value in values:
impurity = self.__try_split(x, y, feature, value)

left_idx = x[:, feature] <= value
right_idx = x[:, feature] > value

impurity = self.__impurity_split(y, y[left_idx, :], y[right_idx, :])
# If it's better than the previous saved one, save the values
if impurity > best_impurity:
best_feature, best_value, best_impurity = feature, value, impurity

return best_feature, best_value, best_impurity

# Try a specific split
# Parameters
# x: x data
# y: y data
# f: feature
# t: value
def __try_split(self, x, y, feature, value):
left_idx = x[:, feature] <= value
right_idx = x[:, feature] > value

return self.__impurity_split(y, y[left_idx, :], y[right_idx, :])

# Calculate the impurity of a node
def __impurity_node(self, y):
# Calculate the impurity value for the classification task
def impurity_classification(y_classification):
# FIXME: this is one of the bottlenecks
y_classification = y_classification.astype(int)
frequency = np.bincount(y_classification) / y_classification.size
frequency = frequency[frequency != 0]
return 0 - np.array([f * np.log2(f) for f in frequency]).sum()

# Calculate the impurity value for the regression task
def impurity_regression(y_regression):
if np.unique(y_regression).size < 2:
return 0

n_bins = 100
histogram = histogram1d(y, bins=n_bins, range=(y.min(), y.max()))
frequency = histogram / len(y)
probability = (frequency + 1) / (frequency.sum() + n_bins)
bin_width = (y_regression.max() - y_regression.min()) / n_bins

return 0 - bin_width * (probability * np.log2(probability)).sum()

delta = 0.0001
impurity = np.zeros(self.n_targets)
# Calculate the impurity value for each of the targets (classification or regression)
for i in range(self.n_targets):
if i in self.classification_targets:
impurity[i] = impurity_classification(y[:, i]) + delta
else:
impurity[i] = impurity_regression(y[:, i]) + delta
return impurity

# Calculate the impurity of a split
def __impurity_split(self, y, y_left, y_right):
n_parent = y.shape[0]
def __impurity_split(self, y_parent, y_left, y_right):
n_left = y_left.shape[0]
n_right = y_right.shape[0]

if n_left < self.min_samples_leaf or n_right < self.min_samples_leaf:
return np.inf
else:
impurity_left = self.__impurity_node(y_left) / self.root_impurity
impurity_right = self.__impurity_node(y_right) / self.root_impurity
impurity_parent = self.__impurity_node(y) / self.root_impurity
n_parent = y_parent.shape[0]

gain_left = (n_left / n_parent) * (impurity_parent - impurity_left)
gain_right = (n_right / n_parent) * (impurity_parent - impurity_right)
gain = gain_left + gain_right
gain = get_gain(self.__impurity_node(y_left),
self.__impurity_node(y_right),
self.__impurity_node(y_parent),
self.root_impurity,
n_left,
n_right,
n_parent)

if self.choose_split == 'mean':
return gain.mean()
else:
return gain.max()

# Calculate the impurity of a node
def __impurity_node(self, y):
delta = 0.0001
impurity = np.zeros(self.n_targets)
# Calculate the impurity value for each of the targets (classification or regression)
for i in range(self.n_targets):
if i in self.classification_targets:
impurity[i] = impurity_classification(y[:, i]) + delta
else:
impurity[i] = impurity_regression(y, y[:, i]) + delta
return impurity


# Build a Random Tree
# Parameters:
@@ -211,16 +240,15 @@ def fit(self, x, y):
if feature:
left_children.append(i + len(split_queue) + 1)
right_children.append(i + len(split_queue) + 2)
else:
left_children.append(None)
right_children.append(None)

if feature:
l_idx = next_x[:, feature] <= value
r_idx = next_x[:, feature] > value

split_queue.append((next_x[l_idx, :], next_y[l_idx, :]))
split_queue.append((next_x[r_idx, :], next_y[r_idx, :]))
else:
left_children.append(None)
right_children.append(None)

i += 1

@@ -280,6 +308,19 @@ def print_level(level, i):
# choose_split: method used to find the best split
# classification_targets: features that are part of the classification task
class MixedRandomForest:
def __str__(self):
params = "("
i = 0
for key in self.__dict__:
if i == 0:
params += str(key) + "=" + str(self.__dict__[key]) + ", \n"
elif i == len(self.__dict__) - 1:
params += "\t\t\t" + str(key) + "=" + str(self.__dict__[key]) + ")"
else:
params += "\t\t\t" + str(key) + "=" + str(self.__dict__[key]) + ", \n"
i += 1
return self.__class__.__name__ + params
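
This `__str__` is the "printable models" part of the commit message: printing a forest lists its parameters. Illustrative usage (not part of this diff):
```
from morfist import MixedRandomForest

mrf = MixedRandomForest(n_estimators=10)
print(mrf)  # MixedRandomForest(n_estimators=10, ..., one parameter per line)
```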

def __init__(self,
n_estimators=10,
max_features='sqrt',
28 changes: 28 additions & 0 deletions setup.py
@@ -0,0 +1,28 @@
import pathlib
from setuptools import setup

# The directory containing this file
HERE = pathlib.Path(__file__).parent

# The text of the README file
README = (HERE / "README.md").read_text()

# This call to setup() does all the work
setup(
name="decision-tree-morfist",
version="0.1.1",
description="Multi-target Random Forest implementation that can mix both classification and regression tasks.",
long_description=README,
long_description_content_type="text/markdown",
url="https://github.com/systemallica/morfist",
author="Andrés Reverón Molina",
author_email="andres@reveronmolina.me",
license="MIT",
classifiers=[
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
],
packages=["morfist"],
install_requires=["numpy", "numba", "scipy"],
)
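
The commit message also mentions publishing the package. One common build-and-upload flow for a `setup.py` project like this (illustrative; the exact commands used are not shown in this commit):
```
python -m pip install build twine
python -m build                  # builds the sdist and wheel into dist/
python -m twine upload dist/*    # uploads to PyPI
```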
Binary file removed test.cprof
6 changes: 3 additions & 3 deletions test/test.py
@@ -67,7 +67,7 @@ def test_reg():
t1_start = perf_counter()
reg_morfist = MixedRandomForest(
n_estimators=n_trees,
min_samples_leaf=5
min_samples_leaf=1
)

# Calculate morfist scores using cross-validation
@@ -110,7 +110,7 @@ def test_mix_1():

mix_rf = MixedRandomForest(
n_estimators=n_trees,
min_samples_leaf=5,
min_samples_leaf=1,
classification_targets=[1]
)

@@ -136,7 +136,7 @@ def test_mix_2():

mix_rf = MixedRandomForest(
n_estimators=n_trees,
min_samples_leaf=5,
min_samples_leaf=1,
classification_targets=[0]
)

Binary file removed test_new.cprof
Binary file removed time_profile.png
Binary file removed time_profile_new.png
