Skip to content

Commit

Permalink
Merge pull request #31 from gdkrmr/ND-Histograms
Browse files Browse the repository at this point in the history
Nd histograms
  • Loading branch information
meggart authored Nov 28, 2019
2 parents d98b9c2 + 69db6aa commit f4c9b58
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 40 deletions.
6 changes: 5 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
name = "WeightedOnlineStats"
uuid = "bbac0a1f-7c9d-5672-960b-c6ca726e5d5d"
authors = ["Guido Kraemer <gdkrmr@users.noreply.github.com>", "Martin Gutwin <twinGu@users.noreply.github.com>"]
version = "0.3.2"
version = "0.4.0"

[compat]
julia = "1"
MultivariateStats = "0.7"
OnlineStats = "1"
OnlineStatsBase = "1"
StatsBase = "0.30,0.31,0.32"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
149 changes: 120 additions & 29 deletions src/histogram.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
# Modifying it to work with WeightedOnlineStats
##############################################################

import LinearAlgebra
abstract type WeightedHistogramStat{T} <: WeightedOnlineStat{T} end
abstract type WeightedHist{T} <: WeightedHistogramStat{T} end
split_candidates(o::WeightedHistogramStat) = midpoints(o)
Statistics.mean(o::WeightedHistogramStat) = mean(midpoints(o), fweights(counts(o)))
Statistics.var(o::WeightedHistogramStat) = var(midpoints(o), fweights(counts(o)); corrected=true)
Statistics.std(o::WeightedHistogramStat) = sqrt(var(o))
Statistics.std(o::WeightedHistogramStat) = sqrt.(var(o))
Statistics.median(o::WeightedHistogramStat) = quantile(o, .5)

function Base.show(io::IO, o::WeightedHistogramStat)
Expand All @@ -25,7 +27,12 @@ Create a histogram with bin partition defined by `edges`.
- If `left`, the bins will be left-closed.
- If `closed`, the bin on the end will be closed.
- E.g. for a two bin histogram ``[a, b), [b, c)`` vs. ``[a, b), [b, c]``
# Example
If `edges` is a tuple instead of an array, a multidimensional histogram will be
generated that behaves like a `WeightedOnlineStat{VectorOb}`.
# Examples
o = fit!(WeightedHist(-5:.1:5), randn(10^6))
# approximate statistics
Expand All @@ -38,68 +45,152 @@ Create a histogram with bin partition defined by `edges`.
extrema(o)
area(o)
pdf(o)
## 2d Histogram
hist2d = fit!(WeightedHist((-5:1:5, -5:1:5) ), randn(10000,2), rand(10000))
value(hist2d).y
"""
struct WeightedHist{T, R} <: WeightedHistogramStat{T}
struct WeightedHist1D{R} <: WeightedHist{Float64}
edges::R
counts::Vector{Int}
meanw::Vector{Float64}
outcount::Vector{Int}
meanwout::Vector{Float64}
left::Bool
closed::Bool
end
struct WeightedHistND{R, N} <: WeightedHist{OnlineStats.VectorOb}
edges::R
counts::Vector{Float64}
out::Vector{Float64}
counts::Array{Int,N}
meanw::Array{Float64,N}
outcount::Array{Int,N}
meanwout::Array{Float64,N}
left::Bool
closed::Bool
end

function WeightedHist(edges::R, T::Type = eltype(edges); left::Bool=true, closed::Bool = true) where {R<:AbstractVector}
new{T,R}(edges, zeros(Int, length(edges) - 1), [0,0], left, closed)
function WeightedHist(edges; left::Bool=true, closed::Bool = true)
edges = isa(edges,Tuple) ? edges : (edges,)
counts = zeros(Int, map(i->length(i)-1, edges))
meanw = zeros(Float64, map(i->length(i)-1, edges))
outcount = zeros(Int,ntuple(_->3,length(edges)))
meanwout = zeros(Float64,ntuple(_->3,length(edges)))
if length(edges) == 1
WeightedHist1D(edges[1],counts,meanw,outcount,meanwout,left,closed)
else
WeightedHistND{typeof(edges),length(edges)}(edges, counts, meanw,outcount,meanwout, left, closed)
end
end
nobs(o::WeightedHist) = sum(o.counts) + sum(o.out)
weightsum(o::WeightedHist) = nobs(o)
value(o::WeightedHist) = (x=o.edges, y=o.counts)

midpoints(o::WeightedHist) = midpoints(o.edges)
# Special case for 1D Histogram
nobs(o::WeightedHist) = sum(o.counts) + sum(o.outcount)
weightsum(o::WeightedHist) = LinearAlgebra.dot(o.counts, o.meanw) + LinearAlgebra.dot(o.outcount,o.meanwout)
value(o::WeightedHist) = (x=edges(o), y=o.counts .* o.meanw)
binindices(o::WeightedHistND{<:Any,N}, x::AbstractVector) where N = binindices(o, ntuple(i->x[i],N))
binindices(o::WeightedHist1D,x) = OnlineStats.binindex(o.edges, x, o.left, o.closed)
binindices(o::WeightedHistND,x) = CartesianIndex(map((e,ix)->OnlineStats.binindex(e, ix, o.left, o.closed), o.edges, x))
midpoints(o::WeightedHistND) = Iterators.product(map(midpoints,o.edges)...)
midpoints(o::WeightedHist1D) = midpoints(edges(o))
counts(o::WeightedHist) = o.counts
edges(o::WeightedHist) = o.edges
function Statistics.mean(o::WeightedHist)
weights = value(o).y
N = ndims(o.counts)
r = ntuple(N) do idim
a = map(i->i[idim],midpoints(o))
mean(a,fweights(weights))
end
N==1 ? r[1] : r
end
function Statistics.var(o::WeightedHist)
weights = value(o).y
N = ndims(o.counts)
r = ntuple(N) do idim
a = map(i->i[idim],midpoints(o))
var(a,fweights(weights),corrected=true)
end
N==1 ? r[1] : r
end
Statistics.std(o::WeightedHist) = sqrt.(var(o))
Statistics.median(o::WeightedHist) = quantile(o, .5)

function Base.extrema(o::WeightedHist)
function Base.extrema(o::WeightedHist1D)
x, y = midpoints(o), counts(o)
x[findfirst(!iszero,y)],x[findlast(!iszero,y)]
end
function Base.extrema(o::WeightedHistND{<:Any,N}) where N
x, y = midpoints(o), counts(o)
x[findfirst(x -> x > 0, y)], x[findlast(x -> x > 0, y)]
ntuple(N) do idim
avalue = any(!iszero, y, dims = setdiff(1:N,idim))[:]
x.iterators[idim][findfirst(avalue)],x.iterators[idim][findlast(avalue)]
end
end

function Statistics.quantile(o::WeightedHist, p = [0, .25, .5, .75, 1])
x, y = midpoints(o), counts(o)
inds = findall(x -> x != 0, y)
quantile(x[inds], fweights(y[inds]), p)
N = ndims(y)
inds = findall(!iszero, y)
yweights = fweights(y[inds])
subset = collect(x)[inds]
r = ntuple(N) do idim
data = map(i->i[idim],subset)
quantile(data, fweights(y[inds]), p)
end
if N==1
return r[1]
else
return r
end
end

function area(o::WeightedHist)
c = o.counts
e = o.edges
if isa(e, AbstractRange)
return step(e) * sum(c)
else
return sum((e[i+1] - e[i]) * c[i] for i in 1:length(c))
return mapreduce(+, CartesianIndices(c)) do I
ar = prod(map((ed,i)->ed[i+1]-ed[i],e,I.I))
c[I]*ar
end
end

outindex(o, ci::CartesianIndex) = CartesianIndex(map((i,l)->i < 1 ? 1 : i > l ? 3 : 2, ci.I, size(o.counts)))
outindex(o, ci::Int) = CartesianIndex(ci < 1 ? 1 : ci > length(o.counts) ? 3 : 2)
function pdf(o::WeightedHist, y)
i = OnlineStats.binindex(o.edges, y, o.left, o.closed)
if i < 1 || i > length(o.counts)
return 0.0
ci = binindices(o, y)
if all(isequal(2),outindex(o,ci).I)
return o.counts[ci]*o.meanw[ci] / area(o) / weightsum(o)
else
return o.counts[i] / area(o)
return 0.0
end
end

function _fit!(o::WeightedHist, x, wt)
i = OnlineStats.binindex(o.edges, x, o.left, o.closed)
if 1 i < length(o.edges)
o.counts[i] += wt
#length(x) == N || error("You must provide $(N) values for the histogram")
ci = binindices(o, x)
oi = outindex(o,ci)
if all(isequal(2),oi.I)
o.counts[ci] += 1
o.meanw[ci] = smooth(o.meanw[ci], wt, 1.0 / o.counts[ci])
else
o.out[1 + (i > 0)] += wt
o.outcount[oi] += 1
o.meanwout[oi] = smooth(o.meanwout[oi], wt, 1.0 / o.outcount[oi])
end
end

function _merge!(o::WeightedHist, o2::WeightedHist)
if o.edges == o2.edges
for j in eachindex(o.counts)
o.counts[j] += o2.counts[j]
newcount = o.counts[j] + o2.counts[j]
if newcount > 0
o.meanw[j] = (o.meanw[j]*o.counts[j] + o2.meanw[j]*o2.counts[j])/newcount
end
o.counts[j] = newcount
end
for j in eachindex(o.outcount)
newcount = o.outcount[j] + o2.outcount[j]
if newcount > 0
o.meanwout[j] = (o.meanwout[j]*o.outcount[j] + o2.meanwout[j]*o2.outcount[j])/newcount
end
o.outcount[j] = newcount
end
else
@warn("WeightedHistogram edges do not align. Merging is approximate.")
Expand Down
59 changes: 49 additions & 10 deletions test/test_hist.jl
Original file line number Diff line number Diff line change
Expand Up @@ -86,23 +86,28 @@ end
h = WeightedHist(-3:1:1)

fit!(h, -2.5,1.3)
@test h.counts == [1.3, 0,0,0]
@test h.counts == [1,0,0,0]
@test h.meanw == [1.3,0,0,0]

fit!(h, (-2.1, 1.0))
@test h.counts == [2.3, 0,0,0]
@test value(h).y == [2.3, 0,0,0]
@test h.counts == [2,0,0,0]
@test h.meanw == [1.15,0,0,0]

fit!(h, (-20, 2.0))
@test h.counts == [2.3, 0,0,0]
@test h.out == [2.0, 0]
@test value(h).y == [2.3, 0,0,0]
@test h.outcount == [1, 0, 0]
@test h.meanwout == [2.0,0,0]

fit!(h, 20, 1.7)
@test h.counts == [2.3, 0,0,0]
@test h.out == [2.0, 1.7]
@test value(h).y == [2.3, 0,0,0]
@test h.meanwout == [2.0, 0.0, 1.7]
@test h.outcount == [1, 0, 1]


fit!(h, -0.1, 1.1)
@test h.counts == [2.3, 0,1.1,0]
@test value(h).y == [2.3, 0,1.1,0]
@test h.edges === -3:1:1
@test h.out == [2.0, 1.7]
end

@testset "merge!" begin
Expand All @@ -111,7 +116,7 @@ end
fit!(h1, (1, 10))
h1_copy = deepcopy(h1)
@test merge!(h1, WeightedHist([-2,0,2])) == h1_copy
@test merge!(h1_copy, h1).counts == [10, 20.]
@test value(merge!(h1_copy, h1)).y == [10, 20.]
end

@testset "stats" begin
Expand All @@ -122,12 +127,46 @@ end
fit!(h, x, 1.0)
fit!(ho, x)
end

@test mean(h) mean(ho)
@test std(h) std(ho)
@test median(h) median(ho)
@test nobs(h) nobs(ho)
@test var(h) var(ho)
@test all(extrema(h) .≈ extrema(ho))
end

@testset "N-dimensional Hist" begin
h = WeightedHist((-2:2:2,0:3:6))
fit!(h,(-1.5,1.5),1.5)
@test h.counts == [1 0; 0 0]
@test h.meanw == [1.5 0; 0 0]

fit!(h,(-0.5,0.2),1.1)
@test h.counts == [2 0; 0 0]
@test h.meanw == [1.3 0;0 0]

fit!(h,(-3.0,0.0),1.5)
@test h.counts == [2 0; 0 0]
@test h.meanw == [1.3 0;0 0]
@test h.outcount == [0 1 0; 0 0 0; 0 0 0]

fit!(h,(-10,-10),1.1)
@test h.counts == [2 0; 0 0]
@test h.meanw == [1.3 0;0 0]
@test h.outcount == [1 1 0; 0 0 0; 0 0 0]

fit!(h,(1.5,4.5),2.6)
@test h.counts == [2 0; 0 1]
@test h.meanw == [1.3 0; 0 2.6]
@test value(h) == (x=(-2:2:2, 0:3:6),y=[2.6 0;0 2.6])

@test mean(h) == (0.0,3.0)
@test var(h) == (1.2380952380952381, 2.785714285714286)
@test std(h) == (1.1126972805283737, 1.6690459207925605)
@test median(h) == (-1.0, 1.5)
@test nobs(h) == 5
@test extrema(h) == ((-1,1),(1.5,4.5))
end


end

2 comments on commit f4c9b58

@meggart
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/5979

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if Julia TagBot is installed, or can be done manually through the github interface, or via:

git tag -a v0.4.0 -m "<description of version>" f4c9b58d9a0ad745f2c45d99017f0abc298ce6c9
git push origin v0.4.0

Please sign in to comment.