Skip to content

Commit

Permalink
Merge pull request #250 from Evovest/shuffling
Browse files Browse the repository at this point in the history
Shuffling
  • Loading branch information
jeremiedb authored Aug 12, 2023
2 parents 897ffca + a70d925 commit 46e9caa
Show file tree
Hide file tree
Showing 10 changed files with 202 additions and 54 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <jeremie.db@evovest.com>"]
version = "0.15.2"
version = "0.16.0"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
Expand Down
4 changes: 4 additions & 0 deletions benchmarks/regressor.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ import CUDA
### perf depth
# desktop | 1e6 | depth 11 | cpu: 28s gpu: 73 sec | xgboost: 26s
# desktop | 10e6 | depth 11 | cpu 205s gpu: 109 sec | xgboost 260s

#threads
# laptop depth 6: 12.717845 seconds (2.08 M allocations: 466.228 MiB)

nobs = Int(1e6)
num_feat = Int(100)
nrounds = 200
Expand Down
144 changes: 144 additions & 0 deletions experiments/shuffling.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
using DataFrames
using Distributions
using EvoTrees
using LinearAlgebra
using GLM
using Random

# Synthetic data set: nine features arranged in three blocks of three columns
# (x1-x3, x4-x6, x7-x9) plus a target `y`, sampled jointly from an `MvNormal`
# built from the 10x10 block matrix assembled below.
δ = 1.0e-6
# Within-block matrix: 1.0 on the diagonal and 1.0 - δ everywhere else.
b = fill(1.0 - δ, 3, 3) + δ * I
# Off-diagonal blocks are all zeros.
z = zeros(3, 3)
# Base feature/target entries, scaled by 0.8, 1.0 and 1.2 for the three blocks.
y = fill(0.5, 3)
dist = MvNormal([
    b       z       z       0.8*y
    z       b       z       y
    z       z       b       1.2*y
    0.8*y'  y'      1.2*y'  1.0
])
# Fix the global RNG so that the sampled matrix is reproducible.
Random.seed!(1)
# One column per observation: `mat` has 10 rows (x1..x9, y) and 10_000 columns.
mat = rand(dist, 10_000);
# One row per observation, columns named "x1".."x9" and "y".
df = DataFrame(permutedims(mat), vcat(["x$i" for i in 1:9], "y"));
target_name = "y"

#################################
# Tables API
#################################
# Fit on the same table with two configs that differ only in the `seed`
# keyword, and display the feature importance of each model.
config = EvoTreeRegressor(seed=123)
m1 = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(m1)

config = EvoTreeRegressor(seed=124)
m2 = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(m2)

# Fitting on a table whose columns are in reverse order does not return the
# same result - numerical rounding error?
df2 = df[!, reverse(1:10)]
config = EvoTreeRegressor()
m3 = fit_evotree(config, df2; target_name="y", verbosity=0);
EvoTrees.importance(m3)

# Manual check on column permutations: go through the internal
# `EvoTrees.init` / `EvoTrees.grow_evotree!` calls directly so that the cache
# (binned features, bin edges) and the grown trees can be inspected.

# Reference: original column order.
config = EvoTreeRegressor(max_depth=4)
m1, cache1 = EvoTrees.init(config, df; target_name=target_name);
EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU)
EvoTrees.importance(m1)

# Same steps on the table with its columns reversed.
df2 = df[!, reverse(1:10)];
config = EvoTreeRegressor(max_depth=4)
m2, cache2 = EvoTrees.init(config, df2; target_name=target_name);
EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU)
EvoTrees.importance(m2)

# After undoing the permutation of the 9 feature columns, compare the binned
# data and the bin edges held by the two caches.
all(cache1.x_bin .== cache2.x_bin[:, reverse(1:9)])
all(cache1.edges .== cache2.edges[reverse(1:9)])

# Display the second tree of each model side by side ...
m1.trees[2]
m2.trees[2]

# ... the `feat` field of each ...
m1.trees[2].feat
m2.trees[2].feat

# ... and the `cond_bin` field, converted to Int for display.
map(Int, m1.trees[2].cond_bin)
map(Int, m2.trees[2].cond_bin)


# Fit on the original table with nrounds=100, eta=0.05 and colsample=1.0,
# then display the feature importance (rebinds `config` and `m3`).
config = EvoTreeRegressor(nrounds=100, eta=0.05, colsample=1.0)
m3 = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(m3)

#################################
# Tables API (colsample=0.5)
#################################
# Fit twice in a row with the *same* config object and compare the feature
# importance of the two resulting models.
config = EvoTreeRegressor(colsample=0.5)
m1 = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(m1)

m2 = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(m2)

#################################
# Matrix API
#################################
# Features: rows 1-9 of `mat`, transposed so that each row is an observation.
x_train = permutedims(mat[1:9, :])
# Target: row 10 of `mat`.
y_train = mat[10, :]

# Fit twice in a row with the same config object through the keyword-based
# matrix interface and display the feature importance of each model.
config = EvoTreeRegressor()
m1 = fit_evotree(config; x_train=x_train, y_train=y_train, verbosity=0);
EvoTrees.importance(m1)

m2 = fit_evotree(config; x_train=x_train, y_train=y_train, verbosity=0);
EvoTrees.importance(m2)

using GLM
# Linear-model baseline on the same features/target. The training data is
# rebuilt here so that this section can be run on its own once `mat` exists.
x_train = permutedims(mat[1:9, :])
y_train = mat[10, :]
lm(x_train, y_train)

#################################
# Matrix debug API
#################################
# Same comparison as the Matrix API section, but calling the internal
# `EvoTrees.init` / `EvoTrees.grow_evotree!` functions directly so that the
# model and its cache are available for inspection.
x_train = permutedims(mat[1:9, :])
y_train = mat[10, :]

config = EvoTreeRegressor()

# First model/cache pair.
m1, cache1 = EvoTrees.init(config, x_train, y_train);
EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU)
EvoTrees.importance(m1)

# Second pair, initialised from the same config object.
m2, cache2 = EvoTrees.init(config, x_train, y_train);
EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU)
EvoTrees.importance(m2)

using MLJ
using EvoTrees
using MLJLinearModels
# Use an EvoTreeRegressor as a base learner of an MLJ `Stack` with a linear
# metalearner and 2-fold CV resampling, then fit it on generated regression data.
# Note: this rebinds the global `y` defined at the top of the script.
X, y = make_regression()
model = Stack(;
    metalearner=LinearRegressor(),
    resampling=CV(; nfolds=2),
    tree=EvoTreeRegressor(),
)
mach = machine(model, X, y)
fit!(mach)
20 changes: 10 additions & 10 deletions src/eval.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ function mse(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
eval[i] = w[i] * (p[1, i] - y[i])^2
end
return sum(eval) / sum(w)
Expand All @@ -20,7 +20,7 @@ function mae(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
eval[i] = w[i] * abs(p[1, i] - y[i])
end
return sum(eval) / sum(w)
Expand All @@ -33,7 +33,7 @@ function logloss(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
pred = sigmoid(p[1, i])
eval[i] = w[i] * (-y[i] * log(pred) + (y[i] - 1) * log(1 - pred))
end
Expand All @@ -48,7 +48,7 @@ function mlogloss(
kwargs...
) where {T}
K = size(p, 1)
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
isum = zero(T)
@inbounds for k in 1:K
isum += exp(p[k, i])
Expand All @@ -65,7 +65,7 @@ function poisson(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
pred = exp(p[1, i])
eval[i] = w[i] * 2 * (y[i] * (log(y[i]) - log(pred)) + pred - y[i])
end
Expand All @@ -79,7 +79,7 @@ function gamma(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
pred = exp(p[1, i])
eval[i] = w[i] * 2 * (log(pred / y[i]) + y[i] / pred - 1)
end
Expand All @@ -94,7 +94,7 @@ function tweedie(
kwargs...
) where {T}
rho = T(1.5)
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
pred = exp(p[1, i])
eval[i] =
w[i] *
Expand All @@ -114,7 +114,7 @@ function gaussian_mle(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
eval[i] = -w[i] * (p[2, i] + (y[i] - p[1, i])^2 / (2 * exp(2 * p[2, i])))
end
return sum(eval) / sum(w)
Expand All @@ -127,7 +127,7 @@ function logistic_mle(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
eval[i] = w[i] * (log(1 / 4 * sech(exp(-p[2, i]) * (y[i] - p[1, i]))^2) - p[2, i])
end
return sum(eval) / sum(w)
Expand All @@ -141,7 +141,7 @@ function wmae(
alpha=0.5,
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
eval[i] =
w[i] * (
alpha * max(y[i] - p[1, i], zero(T)) +
Expand Down
18 changes: 9 additions & 9 deletions src/fit-utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ Get the breaking points of the feature data.
"""
function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.TaskLocalRNG()) where {T}
nobs = min(size(X, 1), 1000 * nbins)
idx = rand(rng, 1:size(X, 1), nobs)
idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true)
nfeats = size(X, 2)
edges = Vector{Vector{T}}(undef, nfeats)
featbins = Vector{UInt8}(undef, nfeats)
feattypes = Vector{Bool}(undef, nfeats)
@threads :static for j in 1:size(X, 2)
@threads for j in 1:size(X, 2)
edges[j] = quantile(view(X, idx, j), (1:nbins-1) / nbins)
if length(edges[j]) == 1
edges[j] = [minimum(view(X, idx, j))]
Expand All @@ -25,12 +25,12 @@ end
function get_edges(df; fnames, nbins, rng=Random.TaskLocalRNG())
_nobs = length(Tables.getcolumn(df, 1))
nobs = min(_nobs, 1000 * nbins)
idx = rand(rng, 1:_nobs, nobs)
idx = sample(rng, 1:_nobs, nobs, replace=false, ordered=true)
edges = Vector{Any}([Vector{eltype(Tables.getcolumn(df, col))}() for col in fnames])
nfeats = length(fnames)
featbins = Vector{UInt8}(undef, nfeats)
feattypes = Vector{Bool}(undef, nfeats)
@threads :static for j in eachindex(fnames)
@threads for j in eachindex(fnames)
col = view(Tables.getcolumn(df, fnames[j]), idx)
if eltype(col) <: Bool
edges[j] = [false, true]
Expand Down Expand Up @@ -63,7 +63,7 @@ Transform feature data into a UInt8 binarized matrix.
"""
function binarize(X::AbstractMatrix; fnames, edges)
x_bin = zeros(UInt8, size(X))
@threads :static for j in axes(X, 2)
@threads for j in axes(X, 2)
x_bin[:, j] .= searchsortedfirst.(Ref(edges[j]), view(X, :, j))
end
return x_bin
Expand All @@ -72,7 +72,7 @@ end
function binarize(df; fnames, edges)
nobs = length(Tables.getcolumn(df, 1))
x_bin = zeros(UInt8, nobs, length(fnames))
@threads :static for j in eachindex(fnames)
@threads for j in eachindex(fnames)
col = Tables.getcolumn(df, fnames[j])
if eltype(col) <: Bool
x_bin[:, j] .= col .+ 1
Expand Down Expand Up @@ -232,7 +232,7 @@ function update_hist!(
is::AbstractVector,
js::AbstractVector,
) where {L<:GradientRegression}
@threads :static for j in js
@threads for j in js
@inbounds @simd for i in is
bin = x_bin[i, j]
hist[j][1, bin] += ∇[1, i]
Expand All @@ -255,7 +255,7 @@ function update_hist!(
is::AbstractVector,
js::AbstractVector,
) where {L<:MLE2P}
@threads :static for j in js
@threads for j in js
@inbounds @simd for i in is
bin = x_bin[i, j]
hist[j][1, bin] += ∇[1, i]
Expand All @@ -280,7 +280,7 @@ function update_hist!(
is::AbstractVector,
js::AbstractVector,
) where {L}
@threads :static for j in js
@threads for j in js
@inbounds for i in is
bin = x_bin[i, j]
@inbounds @simd for k in axes(∇, 1)
Expand Down
4 changes: 2 additions & 2 deletions src/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ function grow_tree!(
update_hist!(L, nodes[n].h, ∇, x_bin, nodes[n].is, js)
end
end
@threads :static for n ∈ sort(n_current)
@threads for n ∈ sort(n_current)
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end
end
Expand Down Expand Up @@ -215,7 +215,7 @@ function grow_otree!(
update_hist!(L, nodes[n].h, ∇, x_bin, nodes[n].is, js)
end
end
@threads :static for n ∈ n_current
@threads for n ∈ n_current
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end

Expand Down
4 changes: 2 additions & 2 deletions src/gpu/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ function grow_tree!(
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
@threads :static for n ∈ sort(n_current)
@threads for n ∈ sort(n_current)
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end
end
Expand Down Expand Up @@ -217,7 +217,7 @@ function grow_otree!(
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
@threads :static for n ∈ n_current
@threads for n ∈ n_current
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end

Expand Down
Loading

2 comments on commit 46e9caa

@jeremiedb
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/89502

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.16.0 -m "<description of version>" 46e9caa681bacc6b3f38e18545334f3ca2a8230c
git push origin v0.16.0

Please sign in to comment.