Skip to content

Commit

Permalink
Merge pull request #7 from yufongpeng/release-0.2.0
Browse files Browse the repository at this point in the history
Release 0.2.0
  • Loading branch information
yufongpeng authored Jan 19, 2023
2 parents fe647e4 + 6dfa186 commit c2b1d27
Show file tree
Hide file tree
Showing 6 changed files with 192 additions and 302 deletions.
10 changes: 4 additions & 6 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
name = "AnovaGLM"
uuid = "0a47a8e3-ec57-451e-bddb-e0be9d22772b"
authors = ["Yu-Fong Peng <sciphypar@gmail.com>"]
version = "0.1.4"
version = "0.2.0"

[deps]
AnovaBase = "946dddda-6a23-4b48-8e70-8e60d9b8d680"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand All @@ -16,11 +15,10 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"

[compat]
AnovaBase = "0.6.3"
DataFrames = "1"
AnovaBase = "0.7"
Distributions = "0.23, 0.24, 0.25"
GLM = "1.5, 1.6, 1.7, 1.8"
GLM = "1.8"
Reexport = "0.2, 1"
StatsBase = "0.33"
StatsModels = "0.6"
julia = "1.6, 1.7"
julia = "1.6, 1.7, 1.8"
9 changes: 4 additions & 5 deletions src/AnovaGLM.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
module AnovaGLM

using Statistics, StatsBase, LinearAlgebra, Distributions, Reexport, Printf
using Statistics, StatsBase, LinearAlgebra, Distributions, Reexport, Printf, AnovaBase
@reexport using GLM, AnovaBase
import StatsBase: fit!, fit
import StatsModels: TableRegressionModel, vectorize, ModelFrame, ModelMatrix, response
import StatsModels: TableRegressionModel, vectorize, ModelFrame, ModelMatrix, response, asgn
import GLM: glm,
# Model
LinPredModel, AbstractGLM, GeneralizedLinearModel, LinearModel,
Expand All @@ -16,9 +16,8 @@ import GLM: glm,
updateμ!, cholfactors,
# other
FP, BlasReal, Link, dispersion, deviance, dof, dof_residual, nobs
import AnovaBase: lrt_nested, ftest_nested, formula, anova, nestedmodels, _diff, _diffn, subformula, selectcoef, dof, dof_residual, deviance, nobs, coefnames, extract_contrasts
using DataFrames: DataFrame, ByRow, combine

using AnovaBase: select_super_interaction, extract_contrasts, canonicalgoodnessoffit, subformula, predictors, dof_asgn, lrt_nested, ftest_nested, _diff, _diffn
import AnovaBase: anova, nestedmodels, anovatable, prednames
export anova_lm, anova_glm

include("anova.jl")
Expand Down
203 changes: 79 additions & 124 deletions src/anova.jl
Original file line number Diff line number Diff line change
@@ -1,169 +1,136 @@
# ===========================================================================================
# Main API
@doc """
anova(<models>...; test::Type{<: GoodnessOfFit}, <keyword arguments>)
anova(test::Type{<: GoodnessOfFit}, <models>...; <keyword arguments>)
"""
anova(<glmmodels>...; test::Type{<: GoodnessOfFit}, <keyword arguments>)
anova(test::Type{<: GoodnessOfFit}, <glmmodels>...; <keyword arguments>)
Analysis of variance.
Return `AnovaResult{M, test, N}`. See [`AnovaResult`](@ref) for details.
* `models`: model objects
# Arguments
* `glmmodels`: model objects
1. `TableRegressionModel{<: LinearModel}` fitted by `GLM.lm`
2. `TableRegressionModel{<: GeneralizedLinearModel}` fitted by `GLM.glm`
If mutiple models are provided, they should be nested and the last one is the most saturated.
If mutiple models are provided, they should be nested and the last one is the most complex.
* `test`: test statistics for goodness of fit. Available tests are [`LikelihoodRatioTest`](@ref) ([`LRT`](@ref)) and [`FTest`](@ref). The default is based on the model type.
1. `TableRegressionModel{<: LinearModel}`: `FTest`.
2. `TableRegressionModel{<: GeneralizedLinearModel}`: based on distribution function, see `canonicalgoodnessoffit`.
Other keyword arguments:
## Other keyword arguments
* When one model is provided:
1. `type` specifies type of anova (1, 2 or 3). Default value is 1.
* When multiple models are provided:
1. `check`: allows to check if models are nested. Defalut value is true. Some checkers are not implemented now.
2. `isnested`: true when models are checked as nested (manually or automatically). Defalut value is false.
Algorithm:
For the ith model, devᵢ is defined as the sum of [squared deviance residuals (unit deviance)](https://en.wikipedia.org/wiki/Deviance_(statistics)).
It is equivalent to the residual sum of squares for ordinary linear regression.
* F-test:
The attribute `deviance` is Δdevᵢ = devᵢ₋₁ - devᵢ.
F-statistic is then defined as Δdevᵢ/(squared dispersion × degree of freedom).
For type I and III ANOVA, F-statistic is computed directly by the variAnce-covariance matrix(vcov) of the saturated model; the deviance is calculated backward.
1. Type I:
First, calculate f as the upper factor of Cholesky factorization of vcov⁻¹ * β.
Then, for a factor that starts at ith row/column of vcov with n degree of freedom, the f-statistic is Σᵢⁱ⁺ⁿ⁻¹ fₖ²/n.
2. Type III:
For a factor occupying ith to jth row/column of vcov with n degree of freedom, f-statistic is (β[i:j]' * vcov[i:j, i:j]⁻¹ * β[i:j])/n.
* LRT:
The attribute `deviance` is devᵢ.
The likelihood ratio is defined as (devᵢ₋₁ - devᵢ)/squared dispersion.
For fitting new models and conducting anova at the same time, see [`anova_lm`](@ref) for `LinearModel`, [`anova_glm`](@ref) for `GeneralizedLinearModel`.
!!! note
For fitting new models and conducting anova at the same time, see [`anova_lm`](@ref) for `LinearModel`, [`anova_glm`](@ref) for `GeneralizedLinearModel`.
"""
anova(::Val{:AnovaGLM})
anova(::Type{<: GoodnessOfFit}, ::Vararg{TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}}})

anova(models::Vararg{TableRegressionModel{<: LinearModel, <: AbstractArray}, N};
test::Type{T} = FTest,
kwargs...) where {N, T <: GoodnessOfFit} =
anova(models::Vararg{TableRegressionModel{<: LinearModel}};
test::Type{<: GoodnessOfFit} = FTest,
kwargs...) =
anova(test, models...; kwargs...)

anova(models::Vararg{TableRegressionModel{<: GeneralizedLinearModel, <: AbstractArray}, N};
test::Type{T} = canonicalgoodnessoffit(models[1].model.rr.d),
kwargs...) where {N, T <: GoodnessOfFit} =
anova(models::Vararg{TableRegressionModel{<: GeneralizedLinearModel}};
test::Type{<: GoodnessOfFit} = canonicalgoodnessoffit(models[1].model.rr.d),
kwargs...) =
anova(test, models...; kwargs...)

# ==================================================================================================================
# ANOVA by F test
# LinearModels
const TRM_LM = TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel{<: GLM.GlmResp{T, <: Normal, IdentityLink}}}} where T

anova(::Type{FTest},
trm::TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel{<: GLM.GlmResp{T, <: Normal, IdentityLink}}}};
type::Int = 1,
kwargs...) where T = _anova_vcov(trm; type, kwargs...)

function _anova_vcov(trm::TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}};
type::Int = 1, kwargs...)
type in [1, 2, 3] || throw(ArgumentError("Invalid type"))

assign = trm.mm.assign
df = dof(assign)
filter!(>(0), df)
# May exist some floating point error from dof_residual
push!(df, round(Int, dof_residual(trm)))
df = tuple(df...)
if type in [1, 3]
# vcov methods
varβ = vcov(trm.model)
β = trm.model.pp.beta0
if type == 1
fs = abs2.(cholesky(Hermitian(inv(varβ))).U * β)
offset = first(assign) == 1 ? 0 : 1
fstat = ntuple(last(assign) - offset) do fix
sum(fs[findall(==(fix + offset), assign)]) / df[fix]
end
else
# calculate block by block
offset = first(assign) == 1 ? 0 : 1
fstat = ntuple(last(assign) - offset) do fix
select = findall(==(fix + offset), assign)
β[select]' * inv(varβ[select, select]) * β[select] / df[fix]
end
trm::TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}};
type::Int = 1, kwargs...) = anova(FTest, FullModel(trm, type, isnullable(trm.model), true); kwargs...)

function anova(::Type{FTest}, aovm::FullModel{<: TRM_LM})
f = formula(aovm.model)
assign = asgn(predictors(aovm))
fullasgn = asgn(f)
df = dof_asgn(assign)
varβ = vcov(aovm.model.model)
β = aovm.model.model.pp.beta0
offset = first(assign) + last(fullasgn) - last(assign) - 1
if aovm.type == 1
fs = abs2.(cholesky(Hermitian(inv(varβ))).U * β)
fstat = ntuple(last(fullasgn) - offset) do fix
sum(fs[findall(==(fix + offset), fullasgn)]) / df[fix]
end
elseif aovm.type == 2
fstat = ntuple(last(fullasgn) - offset) do fix
select1 = sort!(collect(select_super_interaction(f.rhs, fix + offset)))
select2 = setdiff(select1, fix + offset)
select1 = findall(in(select1), fullasgn)
select2 = findall(in(select2), fullasgn)
(β[select1]' * inv(varβ[select1, select1]) * β[select1] - β[select2]' * inv(varβ[select2, select2]) * β[select2]) / df[fix]
end
σ² = dispersion(trm.model, true)
devs = (fstat .* σ²..., σ²) .* df
else
# refit methods
devs = deviances(trm; type, kwargs...)
msr = devs ./ df
fstat = msr[1:end - 1] ./ dispersion(trm.model, true)
# calculate block by block
fstat = ntuple(last(fullasgn) - offset) do fix
select = findall(==(fix + offset), fullasgn)
β[select]' * inv(varβ[select, select]) * β[select] / df[fix]
end
end
pvalue = (ccdf.(FDist.(df[1:end - 1], last(df)), abs.(fstat))..., NaN)
AnovaResult{FTest}(trm, type, df, devs, (fstat..., NaN), pvalue, NamedTuple())
σ² = dispersion(aovm.model.model, true)
devs = @. fstat * σ² * df
dfr = round(Int, dof_residual(aovm.model))
pvalue = @. ccdf(FDist(df, dfr), abs(fstat))
AnovaResult{FTest}(aovm, df, devs, fstat, pvalue, NamedTuple())
end


function anova(::Type{FTest},
trm::TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}};
type::Int = 1, kwargs...)
type in [1, 2, 3] || throw(ArgumentError("Invalid type"))

assign = trm.mm.assign
devs = deviances(trm; type, kwargs...)
df = dof(assign)
filter!(>(0), df)
# May exist some floating point error from dof_residual
push!(df, round(Int, dof_residual(trm)))
length(df) == length(devs) + 1 && popfirst!(df)
df = tuple(df...)
aovm::FullModel{<: TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}}}; kwargs...)
devs = deviances(aovm; kwargs...)
assign = asgn(collect(predictors(aovm)))
#length(vdf) ≡ length(devs) + 1 && popfirst!(vdf)
df = dof_asgn(assign)
msr = devs ./ df
fstat = msr[1:end - 1] ./ dispersion(trm.model, true)
pvalue = (ccdf.(FDist.(df[1:end - 1], last(df)), abs.(fstat))..., NaN)
AnovaResult{FTest}(trm, type, df, devs, (fstat..., NaN), pvalue, NamedTuple())
fstat = msr ./ dispersion(aovm.model.model, true)
dfr = round(Int, dof_residual(aovm.model))
pvalue = @. ccdf(FDist(df, dfr), abs(fstat))
AnovaResult{FTest}(aovm, df, devs, fstat, pvalue, NamedTuple())
end

# ----------------------------------------------------------------------------------------
# ANOVA for genaralized linear models
# λ = -2ln(𝓛(̂θ₀)/𝓛(θ)) ~ χ²ₙ , n = difference of predictors
anova(::Type{LRT},
trm::TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}};
type::Int = 1, kwargs...) = anova(LRT, FullModel(trm, type, isnullable(trm.model), true); kwargs...)

function anova(::Type{LRT},
trm::TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}})
Δdev = deviances(trm, type = 1)
df = dof(trm.mm.assign)
filter!(>(0), df)
isnullable(trm.model) || popfirst!(df)
df = tuple(df...)
aovm::FullModel{<: TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}}})
Δdev = deviances(aovm)
assign = asgn(collect(predictors(aovm)))
#isnullable(trm.model) || popfirst!(vdf)
df = dof_asgn(assign)
# den = last(ss) / (nobs(trm) - dof(trm) + 1)
# lrstat = ss[1:end - 1] ./ den
σ² = dispersion(trm.model, true)
lrstat = Δdev[1:end - 1] ./ σ²
σ² = dispersion(aovm.model.model, true)
lrstat = Δdev ./ σ²
n = length(lrstat)
dev = collect(Δdev)
dev = push!(collect(Δdev), deviance(aovm.model))
i = n
while i > 0
dev[i] += dev[i + 1]
i -= 1
end
pval = ccdf.(Chisq.(df), abs.(lrstat))
AnovaResult{LRT}(trm, 1, df, tuple(dev[2:end]...), lrstat, pval, NamedTuple())
pval = @. ccdf(Chisq(df), abs(lrstat))
AnovaResult{LRT}(aovm, df, tuple(dev[2:end]...), lrstat, pval, NamedTuple())
end

# =================================================================================================================
# Nested models

function anova(::Type{FTest},
trms::Vararg{TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}}};
trms::Vararg{M};
check::Bool = true,
isnested::Bool = false)
isnested::Bool = false) where {M <: TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}}}
df = dof.(trms)
ord = sortperm(collect(df))
df = df[ord]
Expand All @@ -172,20 +139,20 @@ function anova(::Type{FTest},
# May exist some floating point error from dof_residual
# check comparable and nested
check && @warn "Could not check whether models are nested: results may not be meaningful"
ftest_nested(trms, df, dfr, deviance.(trms), dispersion(last(trms).model, true))
ftest_nested(NestedModels{M}(trms), df, dfr, deviance.(trms), dispersion(last(trms).model, true))
end

function anova(::Type{LRT},
trms::Vararg{<: TableRegressionModel};
trms::Vararg{M};
check::Bool = true,
isnested::Bool = false)
isnested::Bool = false) where {M <: TableRegressionModel{<: Union{LinearModel, GeneralizedLinearModel}}}
df = dof.(trms)
ord = sortperm(collect(df))
trms = trms[ord]
df = df[ord]
# check comparable and nested
check && @warn "Could not check whether models are nested: results may not be meaningful"
lrt_nested(trms, df, deviance.(trms), dispersion(last(trms).model, true))
lrt_nested(NestedModels{M}(trms), df, deviance.(trms), dispersion(last(trms).model, true))
end


Expand Down Expand Up @@ -257,16 +224,4 @@ function anova(test::Type{<: GoodnessOfFit}, ::Type{GeneralizedLinearModel}, X,
kwargs...)
trm = glm(X, y, d, l; kwargs...)
anova(test, trm; type, kwargs... )
end


"""
GLM.glm(f::FormulaTerm, df::DataFrame, d::Binomial, l::GLM.Link, args...; kwargs...)
Automatically transform dependent variable into 0/1 for family `Binomial`.
"""
GLM.glm(f::FormulaTerm, df::DataFrame, d::Binomial, l::Link, args...; kwargs...) =
fit(GeneralizedLinearModel, f,
combine(df, :, f.lhs.sym => ByRow(==(last(unique(df[!, f.lhs.sym])))) => f.lhs.sym),
d, l, args...; kwargs...)

end
Loading

0 comments on commit c2b1d27

Please sign in to comment.