From 9592182ccb0374fba48b5b870ac65f28d09a62f6 Mon Sep 17 00:00:00 2001 From: rmitsuboshi Date: Mon, 13 Jan 2025 13:51:20 +0900 Subject: [PATCH 1/4] minor updates in gbm.rs --- src/booster/gradient_boost/gbm.rs | 17 +++++++++-------- tests/gbm.rs | 6 +++--- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/booster/gradient_boost/gbm.rs b/src/booster/gradient_boost/gbm.rs index 9f36cc9..5803940 100644 --- a/src/booster/gradient_boost/gbm.rs +++ b/src/booster/gradient_boost/gbm.rs @@ -87,7 +87,7 @@ use std::ops::ControlFlow; /// /// println!("Training Loss is: {training_loss}"); /// ``` -pub struct GBM<'a, F> { +pub struct GBM<'a, F, L> { // Training data sample: &'a Sample, @@ -103,7 +103,7 @@ pub struct GBM<'a, F> { // Some struct that implements `LossFunction` trait - loss: GBMLoss, + loss: L, // Max iteration until GBM guarantees the optimality. @@ -122,11 +122,11 @@ pub struct GBM<'a, F> { -impl<'a, F> GBM<'a, F> +impl<'a, F, L> GBM<'a, F, L> { /// Initialize the `GBM`. /// This method sets some parameters `GBM` holds. - pub fn init(sample: &'a Sample) -> Self { + pub fn init_with_loss(sample: &'a Sample, loss: L) -> Self { let n_sample = sample.shape().0; let predictions = vec![0.0; n_sample]; @@ -138,7 +138,7 @@ impl<'a, F> GBM<'a, F> weights: Vec::new(), hypotheses: Vec::new(), - loss: GBMLoss::L2, + loss, max_iter: 100, @@ -150,7 +150,7 @@ impl<'a, F> GBM<'a, F> } -impl<'a, F> GBM<'a, F> { +impl<'a, F, L> GBM<'a, F, L> { /// Returns the maximum iteration /// of the `GBM` to find a combined hypothesis /// that has error at most `tolerance`. @@ -170,15 +170,16 @@ impl<'a, F> GBM<'a, F> { /// Set the Loss Type. - pub fn loss(mut self, loss_type: GBMLoss) -> Self { + pub fn loss(mut self, loss_type: L) -> Self { self.loss = loss_type; self } } -impl Booster for GBM<'_, F> +impl Booster for GBM<'_, F, L> where F: Regressor + Clone, + L: LossFunction, { type Output = WeightedMajority; diff --git a/tests/gbm.rs b/tests/gbm.rs index 78b5c88..9bfd1b9 100644 --- a/tests/gbm.rs +++ b/tests/gbm.rs @@ -9,19 +9,19 @@ pub mod gbm_boston { #[test] fn boston() { let mut path = env::current_dir().unwrap(); - path.push("tests/dataset/boston_housing.csv"); + path.push("tests/dataset/california-housing.csv"); let sample = SampleReader::new() .file(path) .has_header(true) - .target_feature("target") + .target_feature("MedHouseVal") .read() .unwrap(); let n_sample = sample.shape().0 as f64; - let mut gbm = GBM::init(&sample) + let mut gbm = GBM::init_with_loss(&sample, GBMLoss::L2) .loss(GBMLoss::L2); let tree = RegressionTreeBuilder::new(&sample) .max_depth(3) From 1f44b400cc598d519ce83763615a634dd9dcdc74 Mon Sep 17 00:00:00 2001 From: rmitsuboshi Date: Sun, 12 Jan 2025 14:46:42 +0900 Subject: [PATCH 2/4] minor updates --- src/booster/mlpboost.rs | 2 +- src/booster/mlpboost/mlpboost_algorithm.rs | 4 +- src/booster/mlpboost/perturbed_lp_model.rs | 225 ++++++++++++++++++ src/common/checker.rs | 23 +- src/common/frank_wolfe.rs | 1 - src/common/utils.rs | 89 ++----- src/weak_learner/decision_tree/criterion.rs | 52 ++-- .../decision_tree/decision_tree_algorithm.rs | 18 +- 8 files changed, 296 insertions(+), 118 deletions(-) create mode 100644 src/booster/mlpboost/perturbed_lp_model.rs diff --git a/src/booster/mlpboost.rs b/src/booster/mlpboost.rs index 01fb4b3..a7a23f9 100644 --- a/src/booster/mlpboost.rs +++ b/src/booster/mlpboost.rs @@ -3,7 +3,7 @@ pub mod mlpboost_algorithm; #[cfg(not(feature="gurobi"))] -mod lp_model; +mod perturbed_lp_model; #[cfg(feature="gurobi")] mod gurobi_lp_model; 
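For reference: with the PATCH 1/4 change to `gbm.rs` above, `GBM` is now generic over a loss type `L`, and the loss is supplied at construction time through `init_with_loss` instead of being hard-coded to `GBMLoss::L2` inside `init`. A minimal sketch of the new call site follows; the sample-loading lines mirror the updated `tests/gbm.rs`, while the `miniboosts::prelude` import is an assumption about the crate-level re-exports and is not part of these hunks.

```rust
use miniboosts::prelude::*;   // assumed re-exports: `GBM`, `GBMLoss`, `SampleReader`
use std::env;

// Load the regression sample exactly as the updated test does.
let mut path = env::current_dir().unwrap();
path.push("tests/dataset/california-housing.csv");
let sample = SampleReader::new()
    .file(path)
    .has_header(true)
    .target_feature("MedHouseVal")
    .read()
    .unwrap();

// New construction path: the loss is a constructor argument,
// and `.loss(...)` remains available to swap in another value later.
let booster = GBM::init_with_loss(&sample, GBMLoss::L2);
```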
diff --git a/src/booster/mlpboost/mlpboost_algorithm.rs b/src/booster/mlpboost/mlpboost_algorithm.rs index 625cc45..bf3c70f 100644 --- a/src/booster/mlpboost/mlpboost_algorithm.rs +++ b/src/booster/mlpboost/mlpboost_algorithm.rs @@ -4,7 +4,7 @@ //! #[cfg(not(feature="gurobi"))] -use super::lp_model::LPModel; +use super::perturbed_lp_model::LPModel; #[cfg(feature="gurobi")] use super::gurobi_lp_model::LPModel; @@ -271,7 +271,7 @@ impl<'a, F> MLPBoost<'a, F> { // `ub` is the upper-bound of distribution for each example. let ub = 1.0 / self.nu; - let lp_model = RefCell::new(LPModel::init(self.n_sample, ub)); + let lp_model = RefCell::new(LPModel::init(self.eta, self.n_sample, ub)); self.secondary = Some(lp_model); } diff --git a/src/booster/mlpboost/perturbed_lp_model.rs b/src/booster/mlpboost/perturbed_lp_model.rs new file mode 100644 index 0000000..e288470 --- /dev/null +++ b/src/booster/mlpboost/perturbed_lp_model.rs @@ -0,0 +1,225 @@ +use clarabel::{ + algebra::*, + solver::*, +}; + +use rand; +use rand::rngs::StdRng; +use rand::prelude::*; + +use crate::{ + Sample, + common::utils, +}; + +use crate::hypothesis::Classifier; + +use std::iter; + +const SEED: u64 = 7777; + +/// A linear programming model for edge minimization with perturbation. +/// `LPModel` solves the following: +/// +/// ```txt +/// min γ - (1/η) Σ_i [ r_i * d_i ] +/// γ,d +/// s.t. Σ_i d_i y_i h_j (x_i) ≤ γ, ∀j = 1, 2, ..., t +/// Σ_i d_i = 1, +/// d_1, d_2, ..., d_m ≤ 1/ν +/// d_1, d_2, ..., d_m ≥ 0. +/// ``` +/// +/// To solve the problem we build the constraint matrix +/// ```txt +/// # of +/// rows γ d1 ... dm +/// ┏ ┃ ┓ ┏ ┓ +/// 1 ┃ 0 ┃ 1 ... 1 ┃ = ┃ 1 ┃ +/// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +/// ┃ 0 ┃ ┃ ≤ ┃ 0 ┃ +/// ┃ 0 ┃ ┃ ≤ ┃ 0 ┃ +/// ┃ . ┃ (-1) * Identity matrix ┃ . ┃ . ┃ +/// m ┃ . ┃ m x m ┃ . ┃ . ┃ +/// ┃ . ┃ ┃ . ┃ . ┃ +/// ┃ 0 ┃ ┃ ≤ ┃ 0 ┃ +/// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +/// ┃ 0 ┃ ┃ ≤ ┃ 1/ν ┃ +/// ┃ 0 ┃ ┃ ≤ ┃ 1/ν ┃ +/// m ┃ . ┃ Identity matrix ┃ . ┃ . ┃ +/// ┃ . ┃ m x m ┃ . ┃ . ┃ +/// ┃ . ┃ ┃ . ┃ . ┃ +/// ┃ 0 ┃ ┃ ≤ ┃ 1/ν ┃ +/// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +/// ┃ -1 ┃ y_1 h_1(x_1) ... y_m h_1(x_m) ┃ ≤ ┃ 0 ┃ +/// ┃ -1 ┃ y_1 h_2(x_1) ... y_m h_2(x_m) ┃ ≤ ┃ 0 ┃ +/// ┃ . ┃ . ... . ┃ . ┃ . ┃ +/// H ┃ . ┃ . ... . ┃ . ┃ . ┃ +/// ┃ . ┃ . ... . ┃ . ┃ . ┃ +/// ┃ -1 ┃ y_1 h_T(x_1) ... y_m h_T(x_m) ┃ ≤ ┃ 0 ┃ +/// ┗ ┃ ┛ ┗ ┛ +/// +/// # of +/// cols 1 ┃ m +/// ``` +pub(super) struct LPModel { + pub(self) n_examples: usize, // number of columns + pub(self) n_hypotheses: usize, // number of rows + pub(self) margins: Vec>, // margin vectors + pub(self) weights: Vec, // weight on hypothesis + pub(self) rng: StdRng, // Rng + pub(self) dist: Vec, // distribution over examples + pub(self) cap_inv: f64, // the capping parameter, `1/ν.` + pub(self) eta: f64, // regularization parameter +} + + +impl LPModel { + /// Initialize the LP model. + /// arguments. + /// - `size`: Number of variables (Number of examples). + /// - `upper_bound`: Capping parameter. `[1, size]`. + pub(super) fn init(eta: f64, size: usize, upper_bound: f64) -> Self { + let margins = vec![vec![]; size]; + Self { + n_examples: size, + n_hypotheses: 0usize, + rng: rand::SeedableRng::seed_from_u64(SEED), + margins, + weights: Vec::with_capacity(0usize), + dist: Vec::with_capacity(0usize), + cap_inv: upper_bound, + eta, + } + } + + + /// Solve the edge minimization problem + /// over the hypotheses `h1, ..., ht` + /// and outputs the optimal value. 
+ pub(super) fn update( + &mut self, + sample: &Sample, + opt_h: Option<&F> + ) -> Vec + where F: Classifier + { + if let Some(clf) = opt_h { + self.n_hypotheses += 1; + let margins = utils::margins_of_hypothesis(sample, clf); + self.margins.iter_mut() + .zip(margins) + .for_each(|(mvec, yh)| { mvec.push(yh); }); + let constraint_matrix = self.build_constraint_matrix(); + let sense = self.build_sense(); + let rhs = self.build_rhs(); + + + let settings = DefaultSettingsBuilder::default() + .equilibrate_enable(true) + .verbose(false) + .build() + .unwrap(); + let linear = self.build_linear_part_objective(); + let n_variables = self.n_examples + 1; + let zero_mat = CscMatrix::::zeros((n_variables, n_variables)); + let mut solver = DefaultSolver::new( + &zero_mat, + &linear, + &constraint_matrix, + &rhs, + &sense[..], + settings + ); + + solver.solve(); + + let ix = 2 * self.n_examples + 1; + self.weights = solver.solution.z[ix..].to_vec(); + self.dist = solver.solution.x[1..].to_vec(); + } + + self.weights.clone() + } + + + pub(self) fn build_linear_part_objective(&mut self) -> Vec { + std::iter::once(1f64) + .chain((0..self.n_examples).map(|_| { + let r = self.rng.gen::(); + (1f64 / r).ln().ln() / self.eta + })) + .collect() + } + + + /// Build the constraint matrix in the 0-indexed CSC form. + pub(self) fn build_constraint_matrix(&self) -> CscMatrix:: { + let n_rows = 1 + 2*self.n_examples + self.n_hypotheses; + let n_cols = 1 + self.n_examples; + + let mut col_ptr = Vec::new(); + let mut row_val = Vec::new(); + let mut nonzero = Vec::new(); + + // the first index of margin constraints + let gam = 1 + 2 * self.n_examples; + col_ptr.push(0); + row_val.extend(gam..n_rows); + nonzero.extend(iter::repeat(-1f64).take(n_rows - gam)); + + for (j, margins) in (1..).zip(&self.margins) { + col_ptr.push(row_val.len()); + // the sum constraint: `Σ_i d_i = 1` + row_val.push(0); + nonzero.push(1f64); + + // non-negative constraint: `d_i ≥ 0` + row_val.push(j); + nonzero.push(-1f64); + + // capping constraint: `d_i ≤ 1/ν` + row_val.push(self.n_examples + j); + nonzero.push(self.cap_inv); + + // margin constraints of `i`-th column + for (i, &yh) in (0..).zip(margins) { + row_val.push(gam + i); + nonzero.push(yh); + } + } + col_ptr.push(row_val.len()); + + CscMatrix::new( + n_rows, + n_cols, + col_ptr, + row_val, + nonzero, + ) + } + + + /// Build the vector of constraint sense: `[=, ≤, ≥, ...].` + pub(self) fn build_sense(&self) -> Vec> { + let n_ineq = 2*self.n_examples + self.n_hypotheses; + vec![ + ZeroConeT(1), + NonnegativeConeT(n_ineq), + ] + } + + + /// Build the right-hand-side of the constraints. + pub(self) fn build_rhs(&self) -> Vec { + let n_constraints = 1 + 2*self.n_examples + self.n_hypotheses; + let mut rhs = Vec::with_capacity(n_constraints); + rhs.push(1f64); + rhs.extend(iter::repeat(0f64).take(self.n_examples)); + rhs.extend(iter::repeat(self.cap_inv).take(self.n_examples)); + rhs.extend(iter::repeat(0f64).take(self.n_hypotheses)); + rhs + } +} + + diff --git a/src/common/checker.rs b/src/common/checker.rs index c00c018..9e88e6a 100644 --- a/src/common/checker.rs +++ b/src/common/checker.rs @@ -4,6 +4,9 @@ use crate::Sample; +const SIMPLEX_TOLERANCE: f64 = 1e-5; + + /// Check whether the training sample is valid or not. 
#[inline(always)] pub(crate) fn check_sample(sample: &Sample) @@ -26,13 +29,13 @@ pub(crate) fn check_sample(sample: &Sample) #[inline(always)] pub(crate) fn check_nu(nu: f64, n_sample: usize) { let n_sample = n_sample as f64; - assert!((1.0..=n_sample).contains(&nu)); + assert!((1f64..=n_sample).contains(&nu)); } /// Check the stepsize #[inline(always)] pub(crate) fn check_stepsize(size: f64) { - assert!((0.0..=1.0).contains(&size)); + assert!((0f64..=1f64).contains(&size)); } @@ -46,8 +49,18 @@ pub(crate) fn check_capped_simplex_condition( check_nu(nu, length); let sum = slice.iter().sum::(); - assert!((sum - 1.0).abs() < 1e-6, "Got sum = {sum}"); + let diff = (sum - 1f64).abs(); + if diff > SIMPLEX_TOLERANCE { + println!(">> diff is {diff} > {SIMPLEX_TOLERANCE}"); + println!(">> sum = {sum}"); + assert!((sum - 1f64).abs() < SIMPLEX_TOLERANCE, "sum(dist[..]) = {sum}"); + } + assert!((sum - 1f64).abs() < SIMPLEX_TOLERANCE, "sum(dist[..]) = {sum}"); - let ub = 1.0 / nu; - assert!(slice.iter().all(|s| (0.0..=ub).contains(s))); + let ub = 1f64 / nu; + assert!( + slice.iter().all(|s| (0f64..=ub).contains(s)), + "capping constraint is violated!" + ); } + diff --git a/src/common/frank_wolfe.rs b/src/common/frank_wolfe.rs index f4c54ed..7ef873d 100644 --- a/src/common/frank_wolfe.rs +++ b/src/common/frank_wolfe.rs @@ -330,7 +330,6 @@ impl FrankWolfe { // Pairwise update! let max_stepsize = weights[position_of_worst_one]; - // TODO // Find the best stepsize by line-search let local_best_margins = utils::margins_of_hypothesis( sample, &hypotheses[position_of_local_best_one] diff --git a/src/common/utils.rs b/src/common/utils.rs index 93528f2..98b3475 100644 --- a/src/common/utils.rs +++ b/src/common/utils.rs @@ -6,6 +6,7 @@ use rayon::prelude::*; use crate::{Sample, Classifier}; use crate::common::checker; + /// Returns the edge of a single hypothesis for the given distribution. /// Here `edge` is the weighted training loss. /// @@ -167,53 +168,6 @@ pub fn exp_distribution_from_margins( } -// /// Projects the given distribution onto the capped simplex. -// /// Capped simplex with parameter `ν (nu)` is defined as -// /// -// /// ```txt -// /// Δ_{m, ν} := { d ∈ [0, 1/ν]^m | sum( d[i] ) = 1 } -// /// ``` -// /// -// /// That is, each coordinate takes at most `1/ν`. -// /// Specifying `ν = 1` yields the no-capped simplex. -// #[inline(always)] -// pub fn project_distribution_to_capped_simplex( -// nu: f64, -// iter: I, -// ) -> Vec -// where I: Iterator, -// { -// let mut dist: Vec<_> = iter.collect(); -// let n_sample = dist.len(); -// -// // Construct a vector of indices sorted in descending order of `dist`. -// let mut ix = (0..n_sample).collect::>(); -// ix.sort_by(|&i, &j| dist[j].partial_cmp(&dist[i]).unwrap()); -// -// let mut sum = dist.iter().sum::(); -// -// let ub = 1.0 / nu; -// -// -// let mut ix = ix.into_iter().enumerate(); -// for (k, i) in ix.by_ref() { -// let xi = (1.0 - ub * k as f64) / sum; -// if xi * dist[i] <= ub { -// dist[i] = xi * dist[i]; -// for (_, j) in ix { -// dist[j] = xi * dist[j]; -// } -// break; -// } -// sum -= dist[i]; -// dist[i] = ub; -// } -// -// checker::check_capped_simplex_condition(&dist, nu); -// dist -// } - - /// Projects the given logarithmic distribution onto the capped simplex. /// Capped simplex with parameter `ν (nu)` is defined as /// @@ -233,37 +187,12 @@ pub fn project_log_distribution_to_capped_simplex( let mut dist: Vec<_> = iter.collect(); let n_sample = dist.len(); - // Construct a vector of indices sorted in descending order of `dist`. 
+ // Construct a vector of indices `ix.` let mut ix = (0..n_sample).collect::>(); + // sort `ix` in the descending order of `dist`. ix.sort_by(|&i, &j| dist[j].partial_cmp(&dist[i]).unwrap()); - // let mut ln_z = dist[ix[0]]; - // for &i in &ix[1..] { - // let ln_d = dist[i]; - // let small = ln_z.min(ln_d); - // let large = ln_z.max(ln_d); - // ln_z = large + (1f64 + (small - large).exp()).ln(); - // } - - // let v = 1f64 / nu; - // let ln_v = v.ln(); - - // for i in 0..n_sample { - // let ln_xi = (1f64 - v * i as f64).ln() - ln_z; - // let ln_di = dist[ix[i]]; - // if ln_xi + ln_di <= ln_v { - // for j in i..n_sample { - // let ln_dj = dist[ix[j]]; - // dist[ix[j]] = (ln_xi + ln_dj).exp(); - // } - // break; - // } - // dist[ix[i]] = v; - // ln_z = ln_z + (1f64 - (ln_di - ln_z).exp()).ln(); - // } - - // `logsums[k] = ln( sum_{i=0}^{k-1} exp( -η (Aw)i ) ) let mut logsums: Vec = Vec::with_capacity(n_sample); ix.iter().rev() @@ -288,6 +217,18 @@ pub fn project_log_distribution_to_capped_simplex( let mut ix_with_logsum = ix.into_iter().zip(logsums).enumerate(); + // NOTE: + // The following is the efficient projection + // onto the probability simplex capped by `1/ν.` + // This code comes from the paper: + // + // Shai Shalev-Shwartz and Yoram Singer. + // On the equivalence of weak learnability and linear separability: + // new relaxations and efficient boosting algorithms. + // [Journal of Machine Learning 2010] + // + // Note that the parameter `ν` in the paper corresponds to + // `1/nu` in this code. for (i, (i_sorted, logsum)) in ix_with_logsum.by_ref() { let log_xi = (1.0 - ub * i as f64).ln() - logsum; // TODO replace this line by `get_unchecked` diff --git a/src/weak_learner/decision_tree/criterion.rs b/src/weak_learner/decision_tree/criterion.rs index 52aa27c..a4a91d2 100644 --- a/src/weak_learner/decision_tree/criterion.rs +++ b/src/weak_learner/decision_tree/criterion.rs @@ -186,13 +186,13 @@ fn split_by_entropy(pack: Vec<(Bin, LabelToWeight)>) let mut left_weight = LabelToWeight::new(); - let mut left_weight_sum = 0.0; + let mut left_weight_sum = 0f64; let mut right_weight = LabelToWeight::new(); - let mut right_weight_sum = 0.0; + let mut right_weight_sum = 0f64; for (_, mp) in pack.iter() { for (y, w) in mp.iter() { - let entry = right_weight.entry(*y).or_insert(0.0); + let entry = right_weight.entry(*y).or_insert(0f64); *entry += w; right_weight_sum += w; } @@ -203,7 +203,7 @@ fn split_by_entropy(pack: Vec<(Bin, LabelToWeight)>) for (bin, map) in pack { for (y, w) in map { - let entry = left_weight.entry(y).or_insert(0.0); + let entry = left_weight.entry(y).or_insert(0f64); *entry += w; left_weight_sum += w; let entry = right_weight.get_mut(&y).unwrap(); @@ -211,7 +211,7 @@ fn split_by_entropy(pack: Vec<(Bin, LabelToWeight)>) right_weight_sum -= w; } let lp = left_weight_sum / weight_sum; - let rp = (1.0 - lp).max(0.0); + let rp = (1f64 - lp).max(0f64); let left_impurity = entropic_impurity(&left_weight); let right_impurity = entropic_impurity(&right_weight); @@ -244,7 +244,7 @@ fn split_by_edge(pack: Vec<(Bin, LabelToWeight)>) -> (f64, Score) { for (bin, map) in pack { - edge -= 2.0 * map.into_iter() + edge -= 2f64 * map.into_iter() .map(|(y, d)| y as f64 * d) .sum::(); @@ -266,13 +266,13 @@ fn split_by_gini(pack: Vec<(Bin, LabelToWeight)>) -> (f64, Score) { let mut left_weight = LabelToWeight::new(); - let mut left_weight_sum = 0.0; + let mut left_weight_sum = 0f64; let mut right_weight = LabelToWeight::new(); - let mut right_weight_sum = 0.0; + let mut right_weight_sum = 
0f64; for (_, mp) in pack.iter() { for (y, w) in mp.iter() { - let entry = right_weight.entry(*y).or_insert(0.0); + let entry = right_weight.entry(*y).or_insert(0f64); *entry += w; right_weight_sum += w; } @@ -283,7 +283,7 @@ fn split_by_gini(pack: Vec<(Bin, LabelToWeight)>) -> (f64, Score) { for (bin, map) in pack { for (y, w) in map { - let entry = left_weight.entry(y).or_insert(0.0); + let entry = left_weight.entry(y).or_insert(0f64); *entry += w; left_weight_sum += w; let entry = right_weight.get_mut(&y).unwrap(); @@ -291,10 +291,10 @@ fn split_by_gini(pack: Vec<(Bin, LabelToWeight)>) -> (f64, Score) { right_weight_sum -= w; - if *entry <= 0.0 { right_weight.remove(&y); } + if *entry <= 0f64 { right_weight.remove(&y); } } let lp = left_weight_sum / weight_sum; - let rp = (1.0 - lp).max(0.0); + let rp = (1f64 - lp).max(0f64); let left_impurity = gini_impurity(&left_weight); let right_impurity = gini_impurity(&right_weight); @@ -318,26 +318,26 @@ fn split_by_twoing(pack: Vec<(Bin, LabelToWeight)>) -> (f64, Score) { let mut labels = HashSet::new(); for (_, mp) in pack.iter() { for (y, w) in mp.iter() { - let entry = right_weight.entry(*y).or_insert(0.0); + let entry = right_weight.entry(*y).or_insert(0f64); *entry += w; labels.insert(*y); } } - let mut best_score = 0.0; + let mut best_score = 0f64; let mut best_threshold = f64::MIN; for (bin, map) in pack { // Move the weights in a `pack` from right to left. for (y, w) in map { - let entry = left_weight.entry(y).or_insert(0.0); + let entry = left_weight.entry(y).or_insert(0f64); *entry += w; if let Some(entry) = right_weight.get_mut(&y) { *entry -= w; } - if *entry <= 0.0 { right_weight.remove(&y); } + if *entry <= 0f64 { right_weight.remove(&y); } } @@ -358,12 +358,12 @@ fn split_by_twoing(pack: Vec<(Bin, LabelToWeight)>) -> (f64, Score) { #[inline(always)] pub(self) fn entropic_impurity(map: &HashMap) -> f64 { let total = map.values().sum::(); - if total <= 0.0 || map.is_empty() { return 0.0.into(); } + if total <= 0f64 || map.is_empty() { return 0f64.into(); } map.par_iter() .map(|(_, &p)| { let r = p / total; - if r <= 0.0 { 0.0 } else { -r * r.ln() } + if r <= 0f64 { 0f64 } else { -r * r.ln() } }) .sum::() } @@ -373,13 +373,13 @@ pub(self) fn entropic_impurity(map: &HashMap) -> f64 { #[inline(always)] pub(self) fn gini_impurity(map: &HashMap) -> f64 { let total = map.values().sum::(); - if total <= 0.0 || map.is_empty() { return 0.0.into(); } + if total <= 0f64 || map.is_empty() { return 0f64.into(); } let correct = map.par_iter() .map(|(_, &w)| (w / total).powi(2)) .sum::(); - (1.0 - correct).max(0.0) + (1f64 - correct).max(0f64) } @@ -395,17 +395,17 @@ pub(self) fn twoing_score( let pr = right.values().sum::(); let pt = pl + pr; - if pl == 0.0 || pr == 0.0 { return 0.0; } - assert!(pt > 0.0); + if pl == 0f64 || pr == 0f64 { return 0f64; } + assert!(pt > 0f64); - let mut score = 0.0; + let mut score = 0f64; for y in labels { - let l = left.get(y).unwrap_or(&0.0); - let r = right.get(y).unwrap_or(&0.0); + let l = left.get(y).unwrap_or(&0f64); + let r = right.get(y).unwrap_or(&0f64); score += ((l / pl) - (r / pr)).abs(); } - score = score.powi(2) * pl * pr / (2.0 * pt).powi(2); + score = score.powi(2) * pl * pr / (2f64 * pt).powi(2); score } diff --git a/src/weak_learner/decision_tree/decision_tree_algorithm.rs b/src/weak_learner/decision_tree/decision_tree_algorithm.rs index 23b7fb0..80f69da 100644 --- a/src/weak_learner/decision_tree/decision_tree_algorithm.rs +++ b/src/weak_learner/decision_tree/decision_tree_algorithm.rs @@ -58,7 
+58,7 @@ use std::collections::HashMap; /// .criterion(Criterion::Entropy) /// .build(); /// -/// let n_sample = sample.shape().0; +/// let n_sample = sample.shape()f64; /// let dist = vec![1f64 / n_sample as f64; n_sample]; /// let f = tree.produce(&sample, &dist); /// @@ -67,7 +67,7 @@ use std::collections::HashMap; /// let loss = sample.target() /// .into_iter() /// .zip(predictions) -/// .map(|(ty, py)| if *ty == py as f64 { 0.0 } else { 1.0 }) +/// .map(|(ty, py)| if *ty == py as f64 { 0f64 } else { 1f64 }) /// .sum::() /// / n_sample as f64; /// println!("loss (train) is: {loss}"); @@ -116,7 +116,7 @@ impl<'a> DecisionTree<'a> { // If sum of `dist` over `train` is zero, construct a leaf node. - if loss == 0.0 || depth < 1 { + if loss == 0f64 || depth < 1 { return TrainNode::leaf(conf, total_weight, loss); } @@ -194,7 +194,7 @@ impl<'a> WeakLearner for DecisionTree<'a> { { let n_sample = sample.shape().0; - let indices = (0..n_sample).filter(|&i| dist[i] > 0.0) + let indices = (0..n_sample).filter(|&i| dist[i] > 0f64) .collect::>(); assert_ne!(indices.len(), 0); @@ -238,7 +238,7 @@ fn confidence_and_loss(sample: &Sample, dist: &[f64], indices: &[usize]) for &i in indices { let l = target[i] as i64; - let cnt = counter.entry(l).or_insert(0.0); + let cnt = counter.entry(l).or_insert(0f64); *cnt += dist[i]; } @@ -253,13 +253,13 @@ fn confidence_and_loss(sample: &Sample, dist: &[f64], indices: &[usize]) // From the update rule of boosting algorithm, // the sum of `dist` over `indices` may become zero, - let loss = if total > 0.0 { total * (1.0 - (p / total)) } else { 0.0 }; + let loss = if total > 0f64 { total * (1f64 - (p / total)) } else { 0f64 }; // `label` takes value in `{-1, +1}`. - let confidence = if total > 0.0 { - (label as f64 * (2.0 * (p / total) - 1.0)).clamp(-1.0, 1.0) + let confidence = if total > 0f64 { + (label as f64 * (2f64 * (p / total) - 1f64)).clamp(-1f64, 1f64) } else { - (label as f64).clamp(-1.0, 1.0) + (label as f64).clamp(-1f64, 1f64) }; let confidence = Confidence::from(confidence); From 3347cf5c3ba94c9c2203695c1370c9da8a6b31aa Mon Sep 17 00:00:00 2001 From: rmitsuboshi Date: Mon, 13 Jan 2025 13:40:53 +0900 Subject: [PATCH 3/4] remove unused crate --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7b9b6f9..5c73264 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,6 @@ all-features = true [dependencies] grb = { version = "2.0.1", optional = true } clarabel = { version = "0.9.0" } -plotters = { version = "0.3.5" } rand = { version = "0.8.5" } rand_distr = { version = "0.4.3" } rayon = { version = "1.10.0" } From 4a156aa722ad8a92b542988b7e1a229468230215 Mon Sep 17 00:00:00 2001 From: rmitsuboshi Date: Mon, 13 Jan 2025 16:59:38 +0900 Subject: [PATCH 4/4] introduce LossFunction trait to gradient boosting and regression tree. from now on, you can use custom loss function. 
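In practical terms, the removed `LossType` enum is no longer needed anywhere in the pipeline: both `GBM` and `RegressionTreeBuilder` now accept any value whose type implements `LossFunction`, with the built-in `GBMLoss` being one such type. Below is a hedged sketch of the resulting end-to-end usage, mirroring the updated `tests/gbm.rs` at the end of this patch; `sample` is loaded as in the earlier sketch, and the `run` call is assumed to be the usual `Booster` entry point, which this patch does not touch.

```rust
// Any custom `L: LossFunction` could replace `GBMLoss::L2` in both calls below.
let tree = RegressionTreeBuilder::new(&sample)
    .max_depth(3)
    .loss(GBMLoss::L2)      // was `.loss(LossType::L2)` before this patch
    .build();

let mut gbm = GBM::init_with_loss(&sample, GBMLoss::L2);
// Obtain the combined hypothesis `f` (assumed `Booster::run`, as in the gbm.rs doc example).
let f = gbm.run(&tree);
```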
--- src/booster/gradient_boost/gbm.rs | 7 +- src/common/loss_functions.rs | 82 +++++++- src/lib.rs | 1 - src/prelude.rs | 2 +- src/weak_learner.rs | 1 - src/weak_learner/regression_tree.rs | 4 - src/weak_learner/regression_tree/bin.rs | 66 +++--- src/weak_learner/regression_tree/builder.rs | 25 ++- src/weak_learner/regression_tree/loss.rs | 189 ------------------ .../regression_tree_algorithm.rs | 171 +++++++++++++--- tests/gbm.rs | 18 +- 11 files changed, 272 insertions(+), 294 deletions(-) delete mode 100644 src/weak_learner/regression_tree/loss.rs diff --git a/src/booster/gradient_boost/gbm.rs b/src/booster/gradient_boost/gbm.rs index 5803940..ce042ef 100644 --- a/src/booster/gradient_boost/gbm.rs +++ b/src/booster/gradient_boost/gbm.rs @@ -57,13 +57,12 @@ use std::ops::ControlFlow; /// // Note that the default tolerance parameter is set as `1 / n_sample`, /// // where `n_sample = data.shape().0` is /// // the number of training examples in `data`. -/// let booster = GBM::init(&sample) -/// .loss(GBMLoss::L1); +/// let booster = GBM::init_with_loss(&sample, GBMLoss::L2); /// /// // Set the weak learner with setting parameters. /// let weak_learner = RegressionTreeBuilder::new(&sample) /// .max_depth(2) -/// .loss(LossType::L1) +/// .loss(LossType::L2) /// .build(); /// /// // Run `GBM` and obtain the resulting hypothesis `f`. @@ -80,7 +79,7 @@ use std::ops::ControlFlow; /// let training_loss = sample.target() /// .into_iter() /// .zip(predictions) -/// .map(|(y, fx)| (y - fx).abs()) +/// .map(|(y, fx)| (y - fx).powi(2)) /// .sum::() /// / n_sample; /// diff --git a/src/common/loss_functions.rs b/src/common/loss_functions.rs index 31a9978..792a24b 100644 --- a/src/common/loss_functions.rs +++ b/src/common/loss_functions.rs @@ -20,10 +20,16 @@ pub trait LossFunction { / n_items as f64 } - /// Gradient vector at current point. + /// Gradient vector at the current point. fn gradient(&self, predictions: &[f64], target: &[f64]) -> Vec; + /// Hessian at the current point. + /// Here, this method assumes that the Hessian is diagonal, + /// so that it returns a diagonal vector. + fn hessian(&self, predictions: &[f64], target: &[f64]) -> Vec; + + /// Best coffecient for the newly-attained hypothesis. fn best_coefficient( &self, @@ -45,6 +51,17 @@ pub enum GBMLoss { /// This loss function is also known as /// **Mean Squared Error (MSE)**. L2, + + + // /// Huber loss with parameter `delta`. + // /// Huber loss maps the given scalar `z` to + // /// `0.5 * z.powi(2)` if `z.abs() < delta`, + // /// `delta * (z.abs() - 0.5 * delta)`, otherwise. 
+ // Huber(f64), + + + // /// Quantile loss + // Quantile(f64), } @@ -53,6 +70,7 @@ impl LossFunction for GBMLoss { match self { Self::L1 => "L1 loss", Self::L2 => "L2 loss", + // Self::Huber(_) => "Huber loss", } } @@ -61,6 +79,14 @@ impl LossFunction for GBMLoss { match self { Self::L1 => (prediction - true_value).abs(), Self::L2 => (prediction - true_value).powi(2), + // Self::Huber(delta) => { + // let diff = (prediction - true_value).abs(); + // if diff < *delta { + // 0.5 * diff.powi(2) + // } else { + // delta * (diff - 0.5 * delta) + // } + // }, } } @@ -73,17 +99,59 @@ impl LossFunction for GBMLoss { match self { Self::L1 => { - predictions.iter() - .zip(target) - .map(|(p, y)| (p - y).signum() / n_sample) + target.iter() + .zip(predictions) + .map(|(y, p)| (y - p).signum()) + .collect() + }, + Self::L2 => { + target.iter() + .zip(predictions) + .map(|(y, p)| p - y) + .collect() + }, + // Self::Huber(delta) => { + // target.iter() + // .zip(predictions) + // .map(|(y, p)| { + // let diff = y - p; + // if diff.abs() < *delta { + // -diff + // } else { + // delta * diff.signum() + // } + // }) + // .collect::>() + // }, + } + } + + + fn hessian(&self, predictions: &[f64], target: &[f64]) -> Vec + { + let n_sample = predictions.len(); + assert_eq!(n_sample as usize, target.len()); + + match self { + Self::L1 => { + std::iter::repeat(0f64) + .take(n_sample) .collect() }, Self::L2 => { - predictions.iter() - .zip(target) - .map(|(p, y)| 2.0 * (p - y) / n_sample) + std::iter::repeat(1f64) + .take(n_sample) .collect() }, + // Self::Huber(delta) => { + // target.iter() + // .zip(predictions) + // .map(|(y, p)| { + // let diff = (y - p).abs(); + // if diff < *delta { 1f64 } else { 0f64 } + // }) + // .collect::>() + // }, } } diff --git a/src/lib.rs b/src/lib.rs index bc7b82c..4c9df33 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -216,7 +216,6 @@ pub use weak_learner::{ RegressionTree, RegressionTreeBuilder, RegressionTreeRegressor, - LossType, }; /// Some useful functions / traits diff --git a/src/prelude.rs b/src/prelude.rs index fee9c03..2f0c9ec 100644 --- a/src/prelude.rs +++ b/src/prelude.rs @@ -64,7 +64,6 @@ pub use crate::weak_learner::{ RegressionTree, RegressionTreeBuilder, RegressionTreeRegressor, - LossType, }; @@ -81,6 +80,7 @@ pub use crate::{ pub use crate::common::{ loss_functions::GBMLoss, + loss_functions::LossFunction, frank_wolfe::FWType, }; diff --git a/src/weak_learner.rs b/src/weak_learner.rs index 502aa2e..c851f96 100644 --- a/src/weak_learner.rs +++ b/src/weak_learner.rs @@ -43,7 +43,6 @@ pub use self::naive_bayes::{ pub use self::regression_tree::{ - LossType, RegressionTree, RegressionTreeBuilder, RegressionTreeRegressor, diff --git a/src/weak_learner/regression_tree.rs b/src/weak_learner/regression_tree.rs index 3dfdaa8..9abcc14 100644 --- a/src/weak_learner/regression_tree.rs +++ b/src/weak_learner/regression_tree.rs @@ -3,9 +3,6 @@ mod regression_tree_algorithm; // This file defines the regression tree regressor. mod regression_tree_regressor; -// This file defines the loss type. -mod loss; - // Regression Tree builder. 
mod builder; @@ -18,5 +15,4 @@ mod train_node; pub use regression_tree_algorithm::RegressionTree; pub use regression_tree_regressor::RegressionTreeRegressor; -pub use loss::LossType; pub use builder::RegressionTreeBuilder; diff --git a/src/weak_learner/regression_tree/bin.rs b/src/weak_learner/regression_tree/bin.rs index 4007dd4..5782d36 100644 --- a/src/weak_learner/regression_tree/bin.rs +++ b/src/weak_learner/regression_tree/bin.rs @@ -18,19 +18,8 @@ const EPS: f64 = 0.001; const NUM_TOLERANCE: f64 = 1e-9; -/// A struct that stores the first/second order derivative information. -#[derive(Clone,Default)] -pub(crate) struct GradientHessian { - pub(crate) grad: f64, - pub(crate) hess: f64, -} - - -impl GradientHessian { - pub(super) fn new(grad: f64, hess: f64) -> Self { - Self { grad, hess } - } -} +type Gradient = f64; +type Hessian = f64; /// Binning: A feature processing. @@ -209,11 +198,13 @@ impl Bins { &self, indices: &[usize], feat: &Feature, - gh: &[GradientHessian], - ) -> Vec<(Bin, GradientHessian)> + gradient: &[Gradient], + hessian: &[Hessian], + ) -> Vec<(Bin, Gradient, Hessian)> { let n_bins = self.0.len(); - let mut packed = vec![GradientHessian::default(); n_bins]; + let mut grad_pack = vec![0f64; n_bins]; + let mut hess_pack = vec![0f64; n_bins]; for &i in indices { let xi = feat[i]; @@ -226,10 +217,10 @@ impl Bins { range.0.start.partial_cmp(&xi).unwrap() }) .unwrap(); - packed[pos].grad += gh[i].grad; - packed[pos].hess += gh[i].hess; + grad_pack[pos] += gradient[i]; + hess_pack[pos] += hessian[i]; } - self.remove_zero_weight_pack_and_normalize(packed) + self.remove_zero_weight_pack_and_normalize(grad_pack, hess_pack) } @@ -254,42 +245,45 @@ impl Bins { /// - fn remove_zero_weight_pack_and_normalize( &self, - pack: Vec, - ) -> Vec<(Bin, GradientHessian)> + grad_pack: Vec, + hess_pack: Vec, + ) -> Vec<(Bin, Gradient, Hessian)> { - let mut iter = self.0.iter().zip(pack); + let mut iter = self.0.iter().zip(grad_pack.into_iter().zip(hess_pack)); - let (prev_bin, mut prev_gh) = iter.next().unwrap(); + let (prev_bin, (mut prev_grad, mut prev_hess)) = iter.next().unwrap(); let mut prev_bin = Bin::new(prev_bin.0.clone()); - let mut iter = iter.filter(|(_, gh)| { - gh.grad != 0.0 || gh.hess != 0.0 + let mut iter = iter.filter(|(_, (grad, hess))| { + *grad != 0.0 || *hess != 0.0 }); // The left-most bin might have zero weight. // In this case, find the next non-zero weight bin and merge. - if prev_gh.grad == 0.0 && prev_gh.hess == 0.0 { - let (next_bin, next_gh) = iter.next().unwrap(); + if prev_grad == 0.0 && prev_hess == 0.0 { + let (next_bin, (next_grad, next_hess)) = iter.next().unwrap(); let start = prev_bin.0.start; let end = next_bin.0.end; prev_bin = Bin::new(start..end); - prev_gh = next_gh; + prev_grad = next_grad; + prev_hess = next_hess; } let mut bin_and_gh = Vec::new(); - for (next_bin, next_gh) in iter { + for (next_bin, (next_grad, next_hess)) in iter { let start = prev_bin.0.start; let end = (prev_bin.0.end + next_bin.0.start) / 2.0; let bin = Bin::new(start..end); - bin_and_gh.push((bin, prev_gh)); + bin_and_gh.push((bin, prev_grad, prev_hess)); prev_bin = Bin::new(next_bin.0.clone()); prev_bin.0.start = end; - prev_gh = next_gh; + prev_grad = next_grad; + prev_hess = next_hess; } - bin_and_gh.push((prev_bin, prev_gh)); + bin_and_gh.push((prev_bin, prev_grad, prev_hess)); bin_and_gh } @@ -310,7 +304,7 @@ impl fmt::Display for Bins { let tail = bins.last() .map(|bin| format!("{bin}")) .unwrap(); - write!(f, "{head}, ... , {tail}") + write!(f, "{head}, ... 
, {tail}") } else { let line = bins.iter() .map(|bin| format!("{}", bin)) @@ -335,7 +329,7 @@ impl fmt::Display for Bin { ' ' }; let start = start.abs(); - format!("{sgn}{start: >.2}") + format!("{sgn}{start: >.1}") }; let end = if self.0.end == f64::MAX { String::from("+Inf") @@ -349,9 +343,9 @@ impl fmt::Display for Bin { ' ' }; let end = end.abs(); - format!("{sgn}{end: >.2}") + format!("{sgn}{end: >.1}") }; - write!(f, "[{start}, {end})") + write!(f, "[{start: >8}, {end: >8})") } } diff --git a/src/weak_learner/regression_tree/builder.rs b/src/weak_learner/regression_tree/builder.rs index e295dc9..d0e1d80 100644 --- a/src/weak_learner/regression_tree/builder.rs +++ b/src/weak_learner/regression_tree/builder.rs @@ -1,6 +1,7 @@ use crate::{Sample, RegressionTree}; use super::bin::*; -use super::loss::*; + +use crate::common::loss_functions::LossFunction; use std::collections::HashMap; @@ -28,7 +29,7 @@ pub const DEFAULT_LAMBDA_L2: f64 = 0.01; /// .build(); /// ``` #[derive(Clone)] -pub struct RegressionTreeBuilder<'a> { +pub struct RegressionTreeBuilder<'a, L> { sample: &'a Sample, /// Number of bins per feature. n_bins: HashMap<&'a str, usize>, @@ -41,11 +42,11 @@ pub struct RegressionTreeBuilder<'a> { lambda_l2: f64, /// Loss function - loss: LossType, + loss: Option, } -impl<'a> RegressionTreeBuilder<'a> { +impl<'a, L> RegressionTreeBuilder<'a, L> { /// Construct a new instance of `RegressionTreeBuilder`. /// By default, /// `RegressionTreeBuilder` sets the parameters as follows; @@ -66,15 +67,15 @@ impl<'a> RegressionTreeBuilder<'a> { let lambda_l2 = DEFAULT_LAMBDA_L2; - let loss = LossType::L2; + let loss = None; Self { sample, n_bins, max_depth, loss, lambda_l2, } } /// Specify the loss type. Default is `LossType::L2`. - pub fn loss(mut self, loss: LossType) -> Self { - self.loss = loss; + pub fn loss(mut self, loss: L) -> Self { + self.loss = Some(loss); self } @@ -108,11 +109,15 @@ impl<'a> RegressionTreeBuilder<'a> { }, } } +} +impl<'a, L> RegressionTreeBuilder<'a, L> + where L: LossFunction, +{ /// Build a `RegressionTree`. /// This method consumes `self`. - pub fn build(self) -> RegressionTree<'a> { + pub fn build(self) -> RegressionTree<'a, L> { let bins = self.sample.features() .iter() .map(|feature| { @@ -123,10 +128,12 @@ impl<'a> RegressionTreeBuilder<'a> { }) .collect::>(); + let loss = self.loss + .expect("failed to get loss function. you need to specify a function that implements `LossFunction` trait"); let n_sample = self.sample.shape().0; let regression_tree = RegressionTree::from_components( - bins, n_sample, self.max_depth, self.lambda_l2, self.loss, + bins, n_sample, self.max_depth, self.lambda_l2, loss, ); diff --git a/src/weak_learner/regression_tree/loss.rs b/src/weak_learner/regression_tree/loss.rs deleted file mode 100644 index a3181b6..0000000 --- a/src/weak_learner/regression_tree/loss.rs +++ /dev/null @@ -1,189 +0,0 @@ -//! Defines some criterions for regression tree. -use rayon::prelude::*; -use serde::{ - Serialize, - Deserialize -}; - -use std::fmt; -use crate::Sample; -use crate::weak_learner::common::type_and_struct::*; - -use super::bin::*; - -use std::collections::HashMap; - -/// The type of loss (error) function. -#[derive(Clone, Copy, Debug, Serialize, Deserialize)] -pub enum LossType { - /// Least Absolute Error - L1, - - - /// Least Squared Error - L2, - - - /// Huber loss with parameter `delta`. - /// Huber loss maps the given scalar `z` to - /// `0.5 * z.powi(2)` if `z.abs() < delta`, - /// `delta * (z.abs() - 0.5 * delta)`, otherwise. 
- Huber(f64), - - - // /// Quantile loss - // Quantile(f64), -} - - -impl fmt::Display for LossType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let loss = match self { - Self::L1 => "L1 (Least Absolute) loss".to_string(), - Self::L2 => "L2 (Least Squared) loss".to_string(), - Self::Huber(delta) => format!("Huber loss (delta = {delta})"), - }; - write!(f, "{loss}") - } -} - - -impl LossType { - pub(crate) fn gradient_and_hessian( - &self, - targets: &[f64], - predictions: &[f64], - ) -> Vec - { - match self { - Self::L1 => { - targets.iter() - .zip(predictions) - .map(|(y, p)| { - let grad = (y - p).signum(); - GradientHessian::new(grad, 0.0) - }) - .collect::>() - }, - Self::L2 => { - targets.iter() - .zip(predictions) - .map(|(y, p)| { - let grad = p - y; - GradientHessian::new(grad, 1.0) - }) - .collect::>() - }, - - Self::Huber(delta) => { - targets.iter() - .zip(predictions) - .map(|(y, p)| { - let diff = p - y; - let (grad, hess) = if diff.abs() < *delta { - (diff, 1.0) - } else { - (delta * diff.signum(), 0.0) - }; - - GradientHessian::new(grad, hess) - }) - .collect::>() - }, - } - } - - - /// Returns the best splitting rule based on the loss function. - pub(super) fn best_split<'a>( - &self, - bins_map: &HashMap<&'_ str, Bins>, - sample: &'a Sample, - gh: &[GradientHessian], - idx: &[usize], - lambda_l2: f64, - ) -> (&'a str, Threshold) - { - - sample.features() - .par_iter() - .map(|feature| { - let name = feature.name(); - let bin = bins_map.get(name).unwrap(); - let pack = bin.pack(idx, feature, gh); - let (score, threshold) = self.best_split_at(pack, lambda_l2); - - (score, name, threshold) - }) - .max_by(|x, y| x.0.partial_cmp(&y.0).unwrap()) - .map(|(_, name, threshold)| (name, threshold)) - .expect("No feature that maximizes the score.") - } - - - fn best_split_at( - &self, - pack: Vec<(Bin, GradientHessian)>, - lambda_l2: f64, - ) -> (LossValue, Threshold) - { - let mut right_grad_sum = pack.par_iter() - .map(|(_, gh)| gh.grad) - .sum::(); - let mut right_hess_sum = pack.par_iter() - .map(|(_, gh)| gh.hess) - .sum::(); - - - let mut left_grad_sum = 0.0; - let mut left_hess_sum = 0.0; - - - let mut best_score = f64::MIN; - let mut best_threshold = f64::MIN; - - - for (bin, gh) in pack { - left_grad_sum += gh.grad; - left_hess_sum += gh.hess; - right_grad_sum -= gh.grad; - right_hess_sum -= gh.hess; - - - let score = - left_grad_sum.powi(2) / (left_hess_sum + lambda_l2) - + right_grad_sum.powi(2) / (right_hess_sum + lambda_l2); - if best_score < score { - best_score = score; - best_threshold = bin.0.end; - } - } - - (best_score.into(), best_threshold.into()) - } - - - pub(super) fn prediction_and_loss( - &self, - indices: &[usize], - gh: &[GradientHessian], - lambda_l2: f64, - ) -> (Prediction, LossValue) - { - let grad_sum = indices.par_iter() - .map(|&i| gh[i].grad) - .sum::(); - - let hess_sum = indices.par_iter() - .map(|&i| gh[i].hess) - .sum::(); - - let prediction = - grad_sum / (hess_sum + lambda_l2); - let loss_value = -0.5 * grad_sum.powi(2) / (hess_sum + lambda_l2); - - (prediction.into(), loss_value.into()) - } - -} - - diff --git a/src/weak_learner/regression_tree/regression_tree_algorithm.rs b/src/weak_learner/regression_tree/regression_tree_algorithm.rs index b8b700b..b5774f2 100644 --- a/src/weak_learner/regression_tree/regression_tree_algorithm.rs +++ b/src/weak_learner/regression_tree/regression_tree_algorithm.rs @@ -1,15 +1,19 @@ use crate::{Sample, WeakLearner}; use super::bin::*; +use crate::common::loss_functions::LossFunction; + use 
crate::weak_learner::common::{ split_rule::*, + type_and_struct::*, }; +use rayon::prelude::*; + use super::{ node::*, train_node::*, - loss::LossType, regression_tree_regressor::RegressionTreeRegressor, }; @@ -20,6 +24,10 @@ use std::cell::RefCell; use std::collections::HashMap; +type Gradient = f64; +type Hessian = f64; + + /// `RegressionTree` is the factory that generates /// a `RegressionTreeClassifier` for a given distribution over examples. /// @@ -42,7 +50,7 @@ use std::collections::HashMap; /// // Further, this example uses `Criterion::Edge` for splitting rule. /// let tree = RegressionTreeBuilder::new(&sample) /// .max_depth(2) -/// .loss(LossType::L2) +/// .loss(GBMLoss::L2) /// .build(); /// /// let n_sample = sample.shape().0; @@ -59,7 +67,7 @@ use std::collections::HashMap; /// / n_sample as f64; /// println!("loss (train) is: {loss}"); /// ``` -pub struct RegressionTree<'a> { +pub struct RegressionTree<'a, L> { bins: HashMap<&'a str, Bins>, // The maximal depth of the output trees max_depth: usize, @@ -70,23 +78,22 @@ pub struct RegressionTree<'a> { // Regularization parameter lambda_l2: f64, - - // LossType function - loss_type: LossType, + // Loss function + loss_func: L, } -impl<'a> RegressionTree<'a> { +impl<'a, L> RegressionTree<'a, L> { #[inline] pub(super) fn from_components( bins: HashMap<&'a str, Bins>, n_sample: usize, max_depth: usize, lambda_l2: f64, - loss_type: LossType, + loss_func: L, ) -> Self { - Self { bins, n_sample, max_depth, lambda_l2, loss_type, } + Self { bins, n_sample, max_depth, lambda_l2, loss_func, } } @@ -94,15 +101,16 @@ impl<'a> RegressionTree<'a> { fn full_tree( &self, sample: &Sample, - gh: &[GradientHessian], + gradient: &[Gradient], + hessian: &[Hessian], indices: Vec, max_depth: usize, ) -> Rc> { // Compute the best prediction that minimizes the training error // on this node. - let (pred, loss) = self.loss_type.prediction_and_loss( - &indices, gh, self.lambda_l2, + let (pred, loss) = prediction_and_loss( + &indices[..], gradient, hessian, self.lambda_l2, ); @@ -113,8 +121,8 @@ impl<'a> RegressionTree<'a> { // Find the best splitting rule. - let (feature, threshold) = self.loss_type.best_split( - &self.bins, &sample, gh, &indices[..], self.lambda_l2, + let (feature, threshold) = best_split( + &self.bins, sample, gradient, hessian, &indices[..], self.lambda_l2, ); let rule = Splitter::new(feature, threshold); @@ -139,8 +147,8 @@ impl<'a> RegressionTree<'a> { // ----- // At this point, `max_depth > 1` is guaranteed // so that one can grow the tree. 
- let ltree = self.full_tree(sample, gh, lindices, max_depth-1); - let rtree = self.full_tree(sample, gh, rindices, max_depth-1); + let ltree = self.full_tree(sample, gradient, hessian, lindices, max_depth-1); + let rtree = self.full_tree(sample, gradient, hessian, rindices, max_depth-1); TrainNode::branch(rule, ltree, rtree, pred, loss) @@ -148,7 +156,9 @@ impl<'a> RegressionTree<'a> { } -impl<'a> WeakLearner for RegressionTree<'a> { +impl<'a, L> WeakLearner for RegressionTree<'a, L> + where L: LossFunction, +{ type Hypothesis = RegressionTreeRegressor; @@ -165,30 +175,29 @@ impl<'a> WeakLearner for RegressionTree<'a> { let info = Vec::from([ ("# of bins (max)", format!("{n_bins}")), ("Max depth", format!("{}", self.max_depth)), - ("Split criterion", format!("{}", self.loss_type)), + ("Split criterion", format!("{}", self.loss_func.name())), ("Regularization param.", format!("{}", self.lambda_l2)), ]); Some(info) } - fn produce(&self, sample: &Sample, predictions: &[f64]) -> Self::Hypothesis { - let gh = self.loss_type.gradient_and_hessian( - sample.target(), - predictions, - ); - + let gradient = self.loss_func.gradient(predictions, sample.target()); + let hessian = self.loss_func.hessian(predictions, sample.target()); - let indices = (0..self.n_sample).filter(|&i| { - gh[i].grad != 0.0 || gh[i].hess != 0.0 - }) - .collect::>(); + let indices = (0..self.n_sample).collect::>(); - let tree = self.full_tree(sample, &gh, indices, self.max_depth); + let tree = self.full_tree( + sample, + &gradient[..], + &hessian[..], + indices, + self.max_depth, + ); let root = Node::from( Rc::try_unwrap(tree) @@ -200,9 +209,9 @@ impl<'a> WeakLearner for RegressionTree<'a> { } } - - -impl fmt::Display for RegressionTree<'_> { +impl fmt::Display for RegressionTree<'_, L> + where L: LossFunction, +{ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!( f, @@ -214,7 +223,7 @@ impl fmt::Display for RegressionTree<'_> { - Bins:\ ", self.max_depth, - self.loss_type, + self.loss_func.name(), )?; @@ -242,3 +251,99 @@ impl fmt::Display for RegressionTree<'_> { write!(f, "----------") } } + + +/// Returns the best splitting rule based on the loss function. +fn best_split<'a>( + bins_map: &HashMap<&'_ str, Bins>, + sample: &'a Sample, + gradient: &[Gradient], + hessian: &[Hessian], + idx: &[usize], + lambda_l2: f64, +) -> (&'a str, Threshold) +{ + + sample.features() + .par_iter() + .map(|feature| { + let name = feature.name(); + let bin = bins_map.get(name).unwrap(); + let pack = bin.pack(idx, feature, gradient, hessian); + let (score, threshold) = best_split_at(pack, lambda_l2); + + (score, name, threshold) + }) + .max_by(|x, y| x.0.partial_cmp(&y.0).unwrap()) + .map(|(_, name, threshold)| (name, threshold)) + .expect("No feature that maximizes the score.") +} + +/// this code is implemented based on Algorithm 3 of the following paper: +/// Tianqi Chen and Carlos Guestrin. 
+/// XGBoost: A scalable tree boosting system [KDD '16] +fn best_split_at( + pack: Vec<(Bin, Gradient, Hessian)>, + lambda_l2: f64, +) -> (LossValue, Threshold) +{ + let mut right_grad_sum = pack.par_iter() + .map(|(_, grad, _)| grad) + .sum::(); + let mut right_hess_sum = pack.par_iter() + .map(|(_, _, hess)| hess) + .sum::(); + + + let mut left_grad_sum = 0.0; + let mut left_hess_sum = 0.0; + + + let mut best_score = f64::MIN; + let mut best_threshold = f64::MIN; + + + for (bin, grad, hess) in pack { + left_grad_sum += grad; + left_hess_sum += hess; + right_grad_sum -= grad; + right_hess_sum -= hess; + + + let score = + left_grad_sum.powi(2) / (left_hess_sum + lambda_l2) + + right_grad_sum.powi(2) / (right_hess_sum + lambda_l2); + if best_score < score { + best_score = score; + best_threshold = bin.0.end; + } + } + + (best_score.into(), best_threshold.into()) +} + +/// returns the prediction value and the loss value of a leaf. +/// this function is implemented based on Eqs. (5), (6) of the following paper: +/// Tianqi Chen and Carlos Guestrin. +/// XGBoost: A scalable tree boosting system [KDD '16] +fn prediction_and_loss( + indices: &[usize], + gradient: &[Gradient], + hessian: &[Hessian], + lambda_l2: f64, +) -> (Prediction, LossValue) +{ + let grad_sum = indices.par_iter() + .map(|&i| gradient[i]) + .sum::(); + + let hess_sum = indices.par_iter() + .map(|&i| hessian[i]) + .sum::(); + + let prediction = - grad_sum / (hess_sum + lambda_l2); + let loss_value = -0.5 * grad_sum.powi(2) / (hess_sum + lambda_l2); + + (prediction.into(), loss_value.into()) +} + diff --git a/tests/gbm.rs b/tests/gbm.rs index 9bfd1b9..6b221a5 100644 --- a/tests/gbm.rs +++ b/tests/gbm.rs @@ -8,8 +8,9 @@ pub mod gbm_boston { use super::*; #[test] fn boston() { + let file = "california-housing.csv"; let mut path = env::current_dir().unwrap(); - path.push("tests/dataset/california-housing.csv"); + path.push(format!("tests/dataset/{file}")); let sample = SampleReader::new() .file(path) @@ -21,11 +22,10 @@ pub mod gbm_boston { let n_sample = sample.shape().0 as f64; - let mut gbm = GBM::init_with_loss(&sample, GBMLoss::L2) - .loss(GBMLoss::L2); + let mut gbm = GBM::init_with_loss(&sample, GBMLoss::L2); let tree = RegressionTreeBuilder::new(&sample) .max_depth(3) - .loss(LossType::L2) + .loss(GBMLoss::L2) .build(); println!("{tree}"); @@ -40,24 +40,24 @@ pub mod gbm_boston { .zip(&predictions[..]) .map(|(t, p)| (t - p).abs()) .sum::() / n_sample; - println!("L1-Loss (boston_housing.csv, GBM, RegressionTree): {loss}"); + println!("L1-Loss ({file}, GBM, RegressionTree): {loss}"); let loss = target.iter() .copied() .zip(&predictions[..]) - .map(|(t, p)| (t - p).powi(2)) + .map(|(t, p)| (t - p).abs()) .sum::() / n_sample; - println!("L2-Loss (boston_housing.csv, GBM, RegressionTree): {loss}"); + println!("L2-Loss ({file}, GBM, RegressionTree): {loss}"); let mean = target.iter().sum::() / n_sample; let loss = target.iter() .copied() - .map(|y| (y - mean).powi(2)) + .map(|y| (y - mean).abs()) .sum::() / n_sample; - println!("Loss (mean prediction): {loss}"); + println!("Loss ({file}, mean prediction): {loss}"); assert!(true); }
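For reference, the leaf values returned by `prediction_and_loss` in `regression_tree_algorithm.rs` above are the closed-form optima from Eqs. (5)-(6) of the cited XGBoost paper, written here in the same ASCII style the crate uses in its doc comments; `g_i` and `h_i` are the per-example gradient and Hessian entries and `λ` is the `lambda_l2` regularizer.

```txt
G = Σ_{i in leaf} g_i,    H = Σ_{i in leaf} h_i

prediction:  w* = - G / (H + λ)
loss value:  obj(w*) = - (1/2) · G² / (H + λ)
```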