diff --git a/docs/methods/neighbors/decision_tree/decision_tree.md b/docs/methods/neighbors/decision_tree/decision_tree.md
new file mode 100644
index 0000000..ab4ab02
--- /dev/null
+++ b/docs/methods/neighbors/decision_tree/decision_tree.md
@@ -0,0 +1,43 @@
+# Decision Tree
+
+Decision Tree model.
+
+Decision Tree builds the tree by choosing, at each node, the split with the best information gain. The tree formed is later used to predict the output values for test data.
+
+
+## Attributes
+
+| Name             | Definition                                                   | Shape      |
+| ---------------- | ------------------------------------------------------------ | ---------- |
+| entropy          | Entropy of the y values remaining at the node                 | 1          |
+| max depth        | Max depth of the decision tree                                | 1          |
+| min samples leaf | Minimum samples a node must hold to form further branches     | 1          |
+| target           | Output class                                                  | 1          |
+| max features     | Max features considered when forming further branches         | 1          |
+| features         | All the features/columns in data                              | n_features |
+| fkey             | Column number on which the next split is made                 | 1          |
+| fval             | Double splitting value                                        | 1          |
+| left             | Left node of decision tree                                    | 1          |
+| right            | Right node of decision tree                                   | 1          |
+
+## Methods
+
+| Name                                              | Definition                                     | Return value                     |
+| ------------------------------------------------- | ---------------------------------------------- | -------------------------------- |
+| `entropy(vector<T>)`                              | Compute entropy                                | `double`                         |
+| `divideData(vector<vector<double>>, int, double)` | Divide the data at the chosen column and value | `vector<vector<vector<double>>>` |
+| `infoGain(vector<vector<double>>, int, double)`   | Compute info gain                              | `double`                         |
+| `train(vector<vector<double>>)`                   | Train the model on training values             | `void`                           |
+| `predict(vector<double>)`                         | Predict the output for a testing value         | `int`                            |
+
+## Example
+
+```
+std::vector<std::vector<double>> x_data{{0,23.76,3,76.56,1},{1,12.76,2,87.45,0},{1,21.86,1,79.98,1},{0,32.64,1,76.87,1},{0,22.76,3,89.90,0},{1,28.64,0,73.87,1},{0,12.87,3,82.86,0}};
+DecisionTree<double> *dt = new DecisionTree<double>(9, 0, 4);
+dt->train(x_data);
+std::vector<double> test{1,38.19,2,81.65};
+std::cout << dt->predict(test);
+```
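+
+## Split criterion
+
+For reference, the quantities computed by `entropy` and `infoGain` are the standard ones (the notation below is illustrative, not taken from the code):
+
+$$H(y) = -\sum_{c} p_c \log_2 p_c$$
+
+$$IG = H(y) - \left(\frac{|y_L|}{|y|} H(y_L) + \frac{|y_R|}{|y|} H(y_R)\right)$$
+
+where $p_c$ is the fraction of samples belonging to class $c$, and $y_L$, $y_R$ are the labels falling to the left and right of the split. The candidate splitting value `fval` for a column is that column's mean; rows with values greater than `fval` go to the right child.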
diff --git a/examples/methods/neighbors/decision_tree.cpp b/examples/methods/neighbors/decision_tree.cpp
new file mode 100644
index 0000000..52ef16f
--- /dev/null
+++ b/examples/methods/neighbors/decision_tree.cpp
@@ -0,0 +1,89 @@
+// #include "../../src/slowmokit/methods/neighbors/decision_tree.hpp"
+// #include "../../src/slowmokit/core.hpp"
+
+// signed main(){
+//   std::vector<std::vector<double>> x_data{
+//       {0,23.76,3,76.56,1},
+//       {1,12.76,2,87.45,0},
+//       {1,21.86,1,79.98,1},
+//       {0,32.64,1,76.87,1},
+//       {0,22.76,3,89.90,0},
+//       {1,28.64,0,73.87,1},
+//       {0,12.87,3,82.86,0},
+//       {0,33.87,2,80.97,1},
+//       {1,39.64,1,70.87,1},
+//       {0,28.90,2,89.86,1},
+//       {0,13.76,3,72.56,0},
+//       {1,19.76,2,88.45,1},
+//       {0,16.86,1,78.98,0},
+//       {0,32.44,1,73.87,1},
+//       {1,22.76,3,80.93,1},
+//       {0,28.64,0,78.87,0},
+//       {1,8.87,2,81.96,0},
+//       {0,31.87,2,75.97,0},
+//       {1,27.64,1,71.89,1},
+//       {0,20.90,2,80.86,0},
+//       {0,23.76,3,76.56,1},
+//       {1,12.76,2,87.45,1},
+//       {1,21.86,1,79.98,1},
+//       {0,32.64,1,76.87,1},
+//       {0,22.76,3,89.90,0},
+//       {1,28.64,0,73.87,1},
+//       {0,12.87,3,82.86,0},
+//       {0,33.87,2,80.97,1},
+//       {1,39.64,1,70.87,1},
+//       {0,28.90,2,89.86,0},
+//       {0,13.76,3,72.56,0},
+//       {1,19.76,2,88.45,1},
+//       {0,16.86,1,78.98,0},
+//       {0,32.44,1,73.87,1},
+//       {1,22.76,3,80.93,1},
+//       {0,28.64,0,78.87,0},
+//       {1,8.87,2,81.96,1},
+//       {0,31.87,2,75.97,0},
+//       {1,27.64,1,71.89,0},
+//       {0,20.90,2,80.86,1},
+//       {0,32.64,1,76.87,1},
+//       {0,22.76,3,89.90,0},
+//       {1,28.64,0,73.87,1},
+//       {0,12.87,3,82.86,0},
+//       {0,33.87,2,80.97,1},
+//       {1,39.64,1,70.87,1},
+//       {0,28.90,2,89.86,1},
+//       {0,13.76,3,72.56,0},
+//       {1,19.76,2,88.45,1},
+//       {0,16.86,1,78.98,0},
+//       {0,32.44,1,73.87,1},
+//       {1,22.76,3,80.93,1},
+//       {0,28.64,0,78.87,0},
+//       {1,8.87,2,81.96,0},
+//       {0,31.87,2,75.97,0},
+//       {1,27.64,1,71.89,1},
+//       {0,20.90,2,80.86,1},
+//       {0,23.76,3,76.56,1},
+//       {1,12.76,2,87.45,0},
+//       {1,21.86,1,79.98,1},
+//       {0,32.64,1,76.87,1},
+//       {0,22.76,3,89.90,0},
+//       {1,28.64,0,73.87,1},
+//       {0,12.87,3,82.86,0},
+//       {0,33.87,2,80.97,0},
+//       {1,39.64,1,70.87,0},
+//       {0,28.90,2,89.86,1},
+//       {0,13.76,3,72.56,1},
+//       {1,19.76,2,88.45,0},
+//       {0,16.86,1,78.98,1},
+//       {0,32.44,1,73.87,1},
+//       {1,22.76,3,80.93,0},
+//       {0,28.64,0,78.87,0},
+//       {1,8.87,2,81.96,1},
+//       {0,31.87,2,75.97,0},
+//       {1,27.64,1,71.89,1},
+//       {0,20.90,2,80.86,0},
+//   };
+//   DecisionTree<double> *dt = new DecisionTree<double>(9, 0, 4);
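+//   // constructor arguments, in the order declared in decision_tree.hpp:
+//   // maxDepth = 9, minSamplesLeaf = 0, maxFeatures = 4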
+//   dt->train(x_data);
+//   std::vector<double> test{1,38.19,2,81.65};
+//   std::cout << dt->predict(test);
+//   return 0;
+// }
\ No newline at end of file
diff --git a/src/slowmokit.hpp b/src/slowmokit.hpp
index 6b8ac82..9dc3c41 100644
--- a/src/slowmokit.hpp
+++ b/src/slowmokit.hpp
@@ -18,6 +18,7 @@
 #include "slowmokit/methods/linear_model/linear_regression.hpp"
 #include "slowmokit/methods/linear_model/logistic_regression.hpp"
 #include "slowmokit/methods/neighbors/bernoulli_nb.hpp"
+#include "slowmokit/methods/neighbors/decision_tree.hpp"
 #include "slowmokit/methods/neighbors/gaussian_nb.hpp"
 #include "slowmokit/methods/neighbors/knn.hpp"
diff --git a/src/slowmokit/ducks/matrix/matrix.cpp b/src/slowmokit/ducks/matrix/matrix.cpp
new file mode 100644
index 0000000..a3311c1
--- /dev/null
+++ b/src/slowmokit/ducks/matrix/matrix.cpp
@@ -0,0 +1,180 @@
+/**
+ * @file ducks/matrix/matrix.cpp
+ *
+ * Implementation of the matrix main program
+ */
+
+#include "matrix.hpp"
+
+template <class T> Matrix<T>::Matrix(int n, int m) : n(n), m(m)
+{
+  if (n <= 0 or m <= 0)
+    throw std::out_of_range("\nCannot have non-positive dimension.");
+
+  mat.resize(n, std::vector<T>(m, T(0)));
+}
+
+template <class T> Matrix<T>::Matrix(const std::vector<std::vector<T>> in)
+{
+  if (std::size(in) <= 0 or std::size(in[0]) <= 0)
+    throw std::out_of_range("\nCannot have non-positive dimension.");
+
+  n = std::size(in);
+  m = std::size(in[0]);
+  mat.resize(n, std::vector<T>(m));
+
+  for (int i = 0; i < n; i++)
+  {
+    if (std::size(in[i]) != m)
+      throw std::invalid_argument("\nAll rows must have same dimension");
+
+    for (int j = 0; j < m; j++)
+      this->mat[i][j] = in[i][j];
+  }
+}
+
+template <class T> Matrix<T> &Matrix<T>::operator*=(const T &scalar)
+{
+  for (int i = 0; i < n; i++)
+  {
+    for (int j = 0; j < m; j++)
+      mat[i][j] *= scalar;
+  }
+
+  return *this;
+}
+
+template <class T> Matrix<T> &Matrix<T>::operator*=(const Matrix<T> &rhs)
+{
+  auto [n2, m2] = rhs.getShape();
+
+  if (n2 <= 0 or m2 <= 0)
+    throw std::out_of_range("\nCannot have non-positive dimension.");
+
+  if (m != n2)
+    throw std::invalid_argument("\nColumn dimension of matrix-1 must be equal "
+                                "to row dimension of matrix-2");
+
+  auto lhs = this->mat;
+  std::vector<std::vector<T>> res(n, std::vector<T>(m2, T(0)));
+
+  for (int i = 0; i < n; i++)
+  {
+    for (int j = 0; j < m2; j++)
+    {
+      for (int k = 0; k < m; k++)
+        res[i][j] += lhs[i][k] * rhs[k][j];
+    }
+  }
+
+  this->mat = res;
+  updateShape();
+
+  return *this;
+}
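+
+// Usage sketch (illustrative only; assumes the interface declared in
+// matrix.hpp):
+//
+//   Matrix<int> a(2, 3), b(3, 2);
+//   a(0, 0) = 5;             // bounds-checked element access
+//   a *= 2;                  // scalar multiplication
+//   a *= b;                  // (2x3) * (3x2) -> (2x2) product
+//   std::cout << a << "\n";  // row-by-row printing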
+
+template <class T> Matrix<T> &Matrix<T>::operator+=(const Matrix<T> &rhs)
+{
+  auto [n2, m2] = rhs.getShape();
+
+  if (n2 <= 0 or m2 <= 0)
+    throw std::out_of_range("\nCannot have non-positive dimension.");
+
+  if (n != n2 or m != m2)
+    throw std::invalid_argument(
+        "\nBoth Dimension of matrix-1 must be equal to that of matrix-2");
+
+  for (int i = 0; i < n; i++)
+  {
+    for (int j = 0; j < m; j++)
+      this->mat[i][j] += rhs[i][j];
+  }
+
+  return *this;
+}
+
+template <class T> Matrix<T> &Matrix<T>::operator-=(const Matrix<T> &rhs)
+{
+  auto [n2, m2] = rhs.getShape();
+
+  if (n2 <= 0 or m2 <= 0)
+    throw std::out_of_range("\nCannot have non-positive dimension.");
+
+  if (n != n2 or m != m2)
+    throw std::invalid_argument(
+        "\nBoth Dimension of matrix-1 must be equal to that of matrix-2");
+
+  for (int i = 0; i < n; i++)
+  {
+    for (int j = 0; j < m; j++)
+      this->mat[i][j] -= rhs[i][j];
+  }
+
+  return *this;
+}
+
+template <class T> std::array<int, 2> Matrix<T>::getShape() const
+{
+  return {this->n, this->m};
+}
+
+template <class T> T &Matrix<T>::operator()(int i, int j)
+{
+  if (i >= n or i < 0)
+    throw std::out_of_range("\ni should be between 0 and " +
+                            std::to_string(n - 1) + " inclusive");
+  if (j >= m or j < 0)
+    throw std::out_of_range("\nj should be between 0 and " +
+                            std::to_string(m - 1) + " inclusive");
+
+  return mat[i][j];
+}
+
+template <class T> const std::vector<T> &Matrix<T>::operator[](int i) const
+{
+  if (i >= n or i < 0)
+    throw std::out_of_range("\ni should be between 0 and " +
+                            std::to_string(n - 1) + " inclusive");
+
+  return this->mat[i];
+}
+
+template <class T>
+std::ostream &operator<<(std::ostream &os, const Matrix<T> &matrix)
+{
+  int n = std::size(matrix);
+  int m = std::size(matrix[0]);
+
+  for (int i = 0; i < n; i++)
+  {
+    for (int j = 0; j < m; j++)
+    {
+      if (j > 0)
+        os << " ";
+      os << matrix[i][j];
+    }
+
+    if (i != n - 1)
+      os << "\n";
+  }
+
+  return os;
+}
+
+template <class T> Matrix<T> operator*(Matrix<T> lhs, const Matrix<T> &rhs)
+{
+  lhs *= rhs;
+  return lhs;
+}
+
+template <class T> Matrix<T> operator+(Matrix<T> lhs, const Matrix<T> &rhs)
+{
+  lhs += rhs;
+  return lhs;
+}
+
+template <class T> Matrix<T> operator-(Matrix<T> lhs, const Matrix<T> &rhs)
+{
+  lhs -= rhs;
+  return lhs;
+}
\ No newline at end of file
diff --git a/src/slowmokit/methods/neighbors/decision_tree.hpp b/src/slowmokit/methods/neighbors/decision_tree.hpp
new file mode 100644
index 0000000..3ea935f
--- /dev/null
+++ b/src/slowmokit/methods/neighbors/decision_tree.hpp
@@ -0,0 +1,13 @@
+/**
+ * @file methods/neighbors/decision_tree.hpp
+ *
+ * Easy include for Decision Tree algorithm
+ */
+
+
+#ifndef SLOWMOKIT_DECISION_TREE_HPP
+#define SLOWMOKIT_DECISION_TREE_HPP
+
+#include "decision_tree/decision_tree.hpp"
+
+#endif // SLOWMOKIT_DECISION_TREE_HPP
diff --git a/src/slowmokit/methods/neighbors/decision_tree/decision_tree.cpp b/src/slowmokit/methods/neighbors/decision_tree/decision_tree.cpp
new file mode 100644
index 0000000..60adaf7
--- /dev/null
+++ b/src/slowmokit/methods/neighbors/decision_tree/decision_tree.cpp
@@ -0,0 +1,250 @@
+/**
+ * @file methods/neighbors/decision_tree/decision_tree.cpp
+ *
+ * Implementation of the Decision Tree main program
+ */
+
+#include "decision_tree.hpp"
+
+template <class T> double DecisionTree<T>::entropy(std::vector<T> col)
+{ // computes the entropy of the data: entropy = Sum(-p * log2(p));
+  // p = probability of each output class
+  std::set<T> unique;
+  for (int i = 0; i < col.size(); i++)
+  {
+    unique.insert(col[i]); // computing the unique classes in output
+  }
+  double ent = 0.0;
+  typename std::set<T>::iterator it;
+  for (it = unique.begin(); it != unique.end(); it++)
+  {
+    double p = std::count(col.begin(), col.end(), *it) /
+               double(col.size()); // probability of this class
+    ent += -p * log2(p);          // accumulate -p * log2(p) for each class
+  }
+  return ent;
+}
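+
+// Worked example: for col = {1, 1, 0, 0} each class has p = 0.5, so
+// entropy = -(0.5 * log2(0.5)) - (0.5 * log2(0.5)) = 1.0 (maximum impurity);
+// for a pure column such as {1, 1, 1, 1}, entropy = 0.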
+
+template <class T>
+std::vector<std::vector<std::vector<double>>> DecisionTree<T>::divideData(
+    std::vector<std::vector<double>> xData, int fkey,
+    double fval) // Divide the data on the basis of the column (fkey) and
+                 // value (fval) provided.
+{
+  std::vector<std::vector<double>> xLeft;  // Left data
+  std::vector<std::vector<double>> xRight; // Right data
+  for (int i = 0; i < xData.size(); i++)
+  { // Now in this loop we divide the data according to fval, fkey
+    double val = xData[i][fkey];
+    std::vector<double> temp = xData[i];
+    if (val > fval) // Comparing fval with the value of that column
+    {
+      xRight.push_back(temp);
+    }
+    else
+    {
+      xLeft.push_back(temp);
+    }
+  }
+  std::vector<std::vector<std::vector<double>>> output{xLeft, xRight};
+  return output;
+}
+
+template <class T>
+double DecisionTree<T>::infoGain(std::vector<std::vector<double>> xData,
+                                 int fkey, double fval)
+{ // fkey refers to the feature/column index; fval refers to the mean
+  // value/splitting value
+  // Splitting data
+  std::vector<std::vector<double>> left, right;
+  std::vector<std::vector<std::vector<double>>> temp =
+      divideData(xData, fkey, fval); // first dividing data on the basis of
+                                     // the considered splitting column and
+                                     // value
+  left = temp[0];
+  right = temp[1];
+  double l = left.size() / double(xData.size());
+  double r = right.size() / double(xData.size());
+  if (left.size() == 0 or
+      right.size() == 0) // Checks that left and right nodes are not empty
+  {
+    return -1;
+  }
+  std::vector<T> y_data, lY, rY; // y_data = all y values; lY = left node
+                                 // y values; rY = right node y values
+  int y = xData[0].size() - 1;
+  for (int i = 0; i < xData.size(); i++)
+  {
+    y_data.push_back(xData[i][y]);
+  }
+  for (int i = 0; i < left.size(); i++)
+  {
+    lY.push_back(left[i][y]);
+  }
+  for (int i = 0; i < right.size(); i++)
+  {
+    rY.push_back(right[i][y]);
+  }
+  double iGain = entropy(y_data) -
+                 (l * entropy(lY) + r * entropy(rY)); // computing infoGain
+  return iGain;
+}
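+
+// Note: a perfect split (both halves pure) yields iGain = entropy(y_data),
+// the largest value possible here, while a split that leaves either half
+// empty returns the sentinel -1, so it loses to any valid split when train()
+// compares the gains.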
+ { + target = 1; + } + else + { + target = 0; + } + return; +} + +template +int DecisionTree::predict( + std::vector test) // predicting ouput class for testing value +{ + if (test[fkey] > fval) // comparing fval to value of testing data column(fkey) + { + if (right == NULL) + { + return target; + } + return right->predict(test); + } + else + { + if (left == NULL) + { + return target; + } + return left->predict(test); + } +} \ No newline at end of file diff --git a/src/slowmokit/methods/neighbors/decision_tree/decision_tree.hpp b/src/slowmokit/methods/neighbors/decision_tree/decision_tree.hpp new file mode 100644 index 0000000..32dd24b --- /dev/null +++ b/src/slowmokit/methods/neighbors/decision_tree/decision_tree.hpp @@ -0,0 +1,76 @@ +/** + * @file methods/linear_models/linear_regression/linear_regression.hpp + * + * The header file including the linear regression algorithm + */ + +#ifndef SLOWMOKIT_DECISION_TREE_HPP +#define SLOWMOKIT_DECISION_TREE_HPP + +#include "../../../core.hpp" + +template class DecisionTree +{ + private: + DecisionTree *left = NULL; + DecisionTree *right = NULL; + int fkey = -1; + double fval = -1; + int maxDepth = -1; + int depth = 0; + int minSamplesLeaf = 1; + int target = -1; + int maxFeatures; + std::set features; + /** + * @brief calculate entropy + * + * @param col + * @return double entropy value + */ + double entropy(std::vector col); + /** + * @brief divides the data according to decided feature + * + * @param x_data training data + * @param fkey column on basis of which data is to be divided + * @param fval value on basis of which data is to be divided + * @return std::vector>> + */ + std::vector>> + divideData(std::vector> x_data, int fkey, int fval); + /** + * @brief information gain + * + * @param x_data training data + * @param fkey column to check highest info gain + * @param fval value to check highest info gain + * @return double value of information gain + */ + double infoGain(std::vector> x_data, int fkey, int fval); + + public: + /** + * @brief Construct a new Decision Tree object + * + * @param max_d maximum depth + * @param minSamplesL minimum samples to be in leaf node by default 1 + * @param max_f maximum samples to be in leaf nodes + */ + DecisionTree(int max_d, int minSamplesL, int max_f); + /** + * @brief prepare decision tree + * + * @param x_data all training values + */ + void train(std::vector> x_data); + /** + * @brief predict the testing values + * + * @param test testing values + * @return int class to which test value belongs + */ + int predict(std::vector test); +}; + +#endif \ No newline at end of file