Skip to content

Commit

Permalink
merged 0.7+ syntax upgrade
Browse files Browse the repository at this point in the history
  • Loading branch information
AsafManela committed Oct 26, 2018
2 parents c17f22c + c3600b0 commit 3819340
Show file tree
Hide file tree
Showing 30 changed files with 822 additions and 553 deletions.
17 changes: 5 additions & 12 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ os:
- linux
- osx
julia:
- 0.6
- 0.7
- 1.0
- nightly
notifications:
email: false
Expand All @@ -17,20 +18,12 @@ matrix:
allow_failures:
- julia: nightly

## uncomment and modify the following lines to manually install system packages
#addons:
# apt: # apt-get for linux
# packages:
# - gfortran
#before_script: # homebrew for mac
# - if [ $TRAVIS_OS_NAME = osx ]; then brew install gcc; fi

script:
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
- julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.build("HurdleDMR"); Pkg.test("HurdleDMR", coverage=true)'
- julia -e 'using Pkg; Pkg.clone(pwd()); Pkg.build("HurdleDMR"); Pkg.test("HurdleDMR"; coverage=true)'

after_success:
- julia -e 'Pkg.add("Documenter")'
- julia -e 'cd(Pkg.dir("HurdleDMR")); include(joinpath("docs", "make.jl"))'
- julia -e 'cd(Pkg.dir("HurdleDMR")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
- julia -e 'using Pkg; cd(Pkg.dir("HurdleDMR")); include(joinpath("docs", "make.jl"))'
- julia -e 'using Pkg; cd(Pkg.dir("HurdleDMR")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
# - julia -e 'cd(Pkg.dir("HurdleDMR")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ It includes a Julia implementation of the Distributed Multinomial Regression (DM

Install the HurdleDMR package
```julia
Pkg.clone("https://github.com/AsafManela/Lasso.jl")
Pkg.clone("https://github.com/AsafManela/HurdleDMR.jl")
pkg> add HurdleDMR
```

Add parallel workers and make package available to workers
Expand Down
5 changes: 4 additions & 1 deletion REQUIRE
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
julia 0.6 0.7-
julia 0.7
Lasso
StatsBase
StatsModels
DataFrames
LambertW
GLM
11 changes: 7 additions & 4 deletions appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
environment:
matrix:
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.7/julia-0.7-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.7/julia-0.7-latest-win64.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/1.0/julia-1.0-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/1.0/julia-1.0-latest-win64.exe"
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"

Expand Down Expand Up @@ -40,8 +42,9 @@ install:
build_script:
# Need to convert from shallow to complete for Pkg.clone to work
- IF EXIST .git\shallow (git fetch --unshallow)
- C:\projects\julia\bin\julia -e "versioninfo();
- C:\projects\julia\bin\julia -e "using InteractiveUtils; versioninfo();
using Pkg;
Pkg.clone(pwd(), \"HurdleDMR\"); Pkg.build(\"HurdleDMR\")"

test_script:
- C:\projects\julia\bin\julia -e "Pkg.test(\"HurdleDMR\")"
- C:\projects\julia\bin\julia -e "using Pkg; Pkg.test(\"HurdleDMR\")"
7 changes: 1 addition & 6 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,9 @@ It includes a Julia implementation of the Distributed Multinomial Regression (DM

## Setup

Install my fork of the Lasso package (will hopefully not be needed in the future)
```julia
Pkg.clone("https://github.com/AsafManela/Lasso.jl")
```

Install the HurdleDMR package
```julia
Pkg.clone("https://github.com/AsafManela/HurdleDMR.jl")
pkg> add HurdleDMR
```

Add parallel workers and make package available to workers
Expand Down
4 changes: 3 additions & 1 deletion src/HurdleDMR.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
module HurdleDMR

using Lasso, StatsBase, StatsModels, DataFrames, LambertW, GLM.FPVector, GLM.FP
using Lasso, StatsBase, StatsModels, DataFrames, LambertW,
SparseArrays, Distributed, SharedArrays, LinearAlgebra
using GLM: FPVector, FP

export dmr, dmrpaths, hdmr, hdmrpaths, fit, coef, srproj, srprojX, @~, mcdmr, posindic,
DCR, DMR, HDMR, DMRCoefs, DMRPaths, HDMRCoefs, HDMRPaths, @model, Model,
Expand Down
79 changes: 37 additions & 42 deletions src/dmr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ abstract type DMR <: DCR end
Relatively heavy object used to return DMR results when we care about the regularization paths.
"""
struct DMRPaths <: DMR
nlpaths::Vector{Nullable{GammaLassoPath}} # independent Poisson GammaLassoPath for each phrase
nlpaths::Vector{Union{Missing,GammaLassoPath}} # independent Poisson GammaLassoPath for each phrase
intercept::Bool # whether to include an intercept in each Poisson regression
# (only kept with remote cluster, not with local cluster)
n::Int # number of observations. May be lower than provided after removing all zero obs.
d::Int # number of categories (terms/words/phrases)
p::Int # number of covariates

DMRPaths(nlpaths::Vector{Nullable{GammaLassoPath}}, intercept::Bool,
DMRPaths(nlpaths::Vector{Union{Missing,GammaLassoPath}}, intercept::Bool,
n::Int, d::Int, p::Int) =
new(nlpaths, intercept, n, d, p)
end
Expand Down Expand Up @@ -157,16 +157,16 @@ function StatsBase.coef(m::DMRPaths; select=:AICc)
d = length(m.nlpaths)
d < 1 && return nothing

# drop null paths
nonnullpaths = dropnull(m.nlpaths)
# drop missing paths
nonmsngpaths = skipmissing(m.nlpaths)

# get number of coefs from paths object
p = ncoefs(m)

# establish maximum path lengths
nλ = 0
if size(nonnullpaths,1) > 0
nλ = maximum(broadcast(nlpath->size(nlpath.value)[2],nonnullpaths))
if !isempty(nonmsngpaths)
nλ = maximum(size(nlpath)[2] for nlpath in nonmsngpaths)
end

# allocate space
Expand All @@ -178,9 +178,8 @@ function StatsBase.coef(m::DMRPaths; select=:AICc)

# iterate over paths
for j=1:d
nlpath = m.nlpaths[j]
if !Base.isnull(nlpath)
path = nlpath.value
path = m.nlpaths[j]
if !ismissing(path)
cj = coef(path;select=select)
if select==:all
for i=1:p
Expand Down Expand Up @@ -215,7 +214,7 @@ ncovars(m::DMR) = m.p
ncoefs(m::DMR) = ncovars(m) + (hasintercept(m) ? 1 : 0)

# some helpers for converting to SharedArray
Base.convert(::Type{SharedArray}, A::SubArray) = (S = SharedArray{eltype(A)}(size(A)); copy!(S, A))
Base.convert(::Type{SharedArray}, A::SubArray) = (S = SharedArray{eltype(A)}(size(A)); copyto!(S, A))
function Base.convert(::Type{SharedArray}, A::SparseMatrixCSC{T,N}) where {T,N}
S = SharedArray{T}(size(A))
fill!(S,zero(T))
Expand All @@ -239,16 +238,16 @@ fpcounts(counts::M) where {V<:GLM.FP, N, M<:SparseMatrixCSC{V,N}} = counts
fpcounts(counts::M) where {V<:GLM.FP, M<:AbstractMatrix{V}} = counts

"Computes DMR shifters (μ=log(m)) and removes all zero observations"
function shifters(covars::AbstractMatrix{T}, counts::AbstractMatrix, showwarnings::Bool) where {T<:AbstractFloat}
function shifters(covars::AbstractMatrix{T}, counts::AbstractMatrix{C}, showwarnings::Bool) where {T<:AbstractFloat,C}
# standardize counts matrix to conform to GLM.FP
counts = fpcounts(counts)

m = vec(sum(counts,2))
m = vec(sum(counts, dims=2))

if any(iszero,m)
# omit observations with no counts
ixposm = find(m)
showwarnings && warn("omitting $(length(m)-length(ixposm)) observations with no counts")
ixposm = findall(x->x!=zero(C), m)
showwarnings && @warn("omitting $(length(m)-length(ixposm)) observations with no counts")
m = m[ixposm]
counts = counts[ixposm,:]
covars = covars[ixposm,:]
Expand All @@ -264,13 +263,13 @@ end
"""
This version is built for local clusters and shares memory used by both inputs and outputs if run in parallel mode.
"""
function dmr_local_cluster{T<:AbstractFloat,V}(covars::AbstractMatrix{T},counts::AbstractMatrix{V},
parallel,verbose,showwarnings,intercept; kwargs...)
function dmr_local_cluster(covars::AbstractMatrix{T},counts::AbstractMatrix{V},
parallel,verbose,showwarnings,intercept; kwargs...) where {T<:AbstractFloat,V}
# get dimensions
n, d = size(counts)
n1,p = size(covars)
@assert n==n1 "counts and covars should have the same number of observations"
verbose && info("fitting $n observations on $d categories, $p covariates ")
verbose && @info("fitting $n observations on $d categories, $p covariates ")

# add one coef for intercept
ncoef = p + (intercept ? 1 : 0)
Expand All @@ -279,18 +278,18 @@ function dmr_local_cluster{T<:AbstractFloat,V}(covars::AbstractMatrix{T},counts:

# fit a separate GammaLassoPath to each dimension of counts j=1:d and pick its min AICc segment
if parallel
verbose && info("distributed poisson run on local cluster with $(nworkers()) nodes")
verbose && @info("distributed poisson run on local cluster with $(nworkers()) nodes")
counts = convert(SharedArray,counts)
coefs = SharedMatrix{T}(ncoef,d)
covars = convert(SharedArray,covars)
# μ = convert(SharedArray,μ) incompatible with GLM

@sync @parallel for j=1:d
@sync @distributed for j=1:d
tryfitgl!(coefs, j, covars, counts; offset=μ, verbose=false, showwarnings=showwarnings, intercept=intercept, kwargs...)
end
else
verbose && info("serial poisson run on a single node")
coefs = Matrix{T}(ncoef,d)
verbose && @info("serial poisson run on a single node")
coefs = Matrix{T}(undef,ncoef,d)
for j=1:d
tryfitgl!(coefs, j, covars, counts; offset=μ, verbose=false, showwarnings=showwarnings, intercept=intercept, kwargs...)
end
Expand All @@ -300,8 +299,8 @@ function dmr_local_cluster{T<:AbstractFloat,V}(covars::AbstractMatrix{T},counts:
end

"This version does not share memory across workers, so may be more efficient for small problems, or on remote clusters."
function dmr_remote_cluster{T<:AbstractFloat,V}(covars::AbstractMatrix{T},counts::AbstractMatrix{V},
parallel,verbose,showwarnings,intercept; kwargs...)
function dmr_remote_cluster(covars::AbstractMatrix{T},counts::AbstractMatrix{V},
parallel,verbose,showwarnings,intercept; kwargs...) where {T<:AbstractFloat,V}
paths = dmrpaths(covars, counts; parallel=parallel, verbose=verbose, showwarnings=showwarnings, intercept=intercept, kwargs...)
DMRCoefs(paths)
end
Expand All @@ -316,39 +315,40 @@ function dmrpaths(covars::AbstractMatrix{T},counts::AbstractMatrix;
n, d = size(counts)
n1,p = size(covars)
@assert n==n1 "counts and covars should have the same number of observations"
verbose && info("fitting $n observations on $d categories, $p covariates ")
verbose && @info("fitting $n observations on $d categories, $p covariates ")

covars, counts, μ, n = shifters(covars, counts, showwarnings)

function tryfitgl(countsj::AbstractVector)
try
# we make it dense remotely to reduce communication costs
Nullable{GammaLassoPath}(fit(GammaLassoPath,covars,full(countsj),Poisson(),LogLink(); offset=μ, verbose=false, kwargs...))
fit(GammaLassoPath,covars,Vector(countsj),Poisson(),LogLink(); offset=μ, verbose=false, kwargs...)
catch e
showwarnings && warn("fitgl failed for countsj with frequencies $(sort(countmap(countsj))) and will return null path ($e)")
Nullable{GammaLassoPath}()
showwarnings && @warn("fitgl failed for countsj with frequencies $(sort(countmap(countsj))) and will return missing path ($e)")
missing
end
end

# counts generator
countscols = (counts[:,j] for j=1:d)

if parallel
verbose && info("distributed poisson run on remote cluster with $(nworkers()) nodes")
verbose && @info("distributed poisson run on remote cluster with $(nworkers()) nodes")
mapfn = pmap
else
verbose && info("serial poisson run on a single node")
verbose && @info("serial poisson run on a single node")
mapfn = map
end

nlpaths = convert(Vector{Nullable{GammaLassoPath}},mapfn(tryfitgl,countscols))
# TODO: the conversion here may be redundant
nlpaths = convert(Vector{Union{Missing,GammaLassoPath}},mapfn(tryfitgl,countscols))

DMRPaths(nlpaths, intercept, n, d, p)
end

"Fits a regularized poisson regression counts[:,j] ~ covars saving the coefficients in coefs[:,j]"
function poisson_regression!(coefs::AbstractMatrix{T}, j::Int, covars::AbstractMatrix{T},counts::AbstractMatrix{V}; kwargs...) where {T<:AbstractFloat,V}
cj = vec(full(counts[:,j]))
cj = Vector(counts[:,j])
path = fit(GammaLassoPath,covars,cj,Poisson(),LogLink(); kwargs...)
coefs[:,j] = coef(path;select=:AICc)
nothing
Expand All @@ -361,7 +361,7 @@ function tryfitgl!(coefs::AbstractMatrix{T}, j::Int, covars::AbstractMatrix{T},c
try
poisson_regression!(coefs, j, covars, counts; kwargs...)
catch e
showwarnings && warn("fitgl! failed on count dimension $j with frequencies $(sort(countmap(counts[:,j]))) and will return zero coefs ($e)")
showwarnings && @warn("fitgl! failed on count dimension $j with frequencies $(sort(countmap(counts[:,j]))) and will return zero coefs ($e)")
# redundant ASSUMING COEFS ARRAY INITIALLY FILLED WITH ZEROS, but can be uninitialized in serial case
for i=1:size(coefs,1)
coefs[i,j] = zero(T)
Expand All @@ -370,23 +370,18 @@ function tryfitgl!(coefs::AbstractMatrix{T}, j::Int, covars::AbstractMatrix{T},c
end

"Shorthand for fit(DMR,covars,counts). See also [`fit(::DMR)`](@ref)"
function dmr{T<:AbstractFloat,V}(covars::AbstractMatrix{T},counts::AbstractMatrix{V};
function dmr(covars::AbstractMatrix{T},counts::AbstractMatrix{V};
intercept=true,
parallel=true, local_cluster=true,
verbose=true, showwarnings=false,
kwargs...)
kwargs...) where {T<:AbstractFloat,V}
if local_cluster || !parallel
dmr_local_cluster(covars,counts,parallel,verbose,showwarnings,intercept; kwargs...)
else
dmr_remote_cluster(covars,counts,parallel,verbose,showwarnings,intercept; kwargs...)
end
end

"Drops null elements from a Nullable vector"
dropnull{T<:Nullable}(v::Vector{T}) = v[.!isnull.(v)]

# aicc{R<:RegularizationPath}(paths::Vector{R}; k=2) = broadcast(path->Lasso.aicc(path;k=k),paths)

# We take care of the intercept ourselves, without relying on StatsModels, because
# it is unregularized, so we drop it from the formula
StatsModels.drop_intercept(::Type{T}) where {T<:DCR} = true
Expand Down Expand Up @@ -436,12 +431,12 @@ function _predict(m, newcovars::AbstractMatrix{T};
η = zeros(T,newn,m.d)
for j=1:m.d
path = m.nlpaths[j]
if !isnull(path)
η[:,j] = predict(path.value,newcovars;offset=newoffset, select=select, kwargs...)
if !ismissing(path)
η[:,j] = predict(path, newcovars;offset=newoffset, select=select, kwargs...)
end
end

scale!(one(T)./vec(sum(η,2)),η)
lmul!(Diagonal(one(T)./vec(sum(η, dims=2))),η)

η
end
Expand Down
Loading

0 comments on commit 3819340

Please sign in to comment.