Skip to content

Commit

Permalink
upload code for additional experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
luotuoqingshan committed Feb 20, 2024
1 parent 7d5b65f commit 480e354
Show file tree
Hide file tree
Showing 9 changed files with 716 additions and 0 deletions.
5 changes: 5 additions & 0 deletions src/Exp5-BS-vs-DI-DSG/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Experiment 5: Binary Search vs Density Improvement on ordinary graphs
Run
```
julia --project di-vs-bs.jl
```
22 changes: 22 additions & 0 deletions src/Exp5-BS-vs-DI-DSG/di-vs-bs.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
include("../header.jl")

# Tolerance forwarded to both DSG solvers below through their `flowtol` keyword.
flowtol = 1e-8

# For every listed dataset, run the density-improvement solver and the
# binary-search solver on the same matrix and print their results side by side.
for dataset in [
"ca-AstroPh",
"ca-HepPh",
"email-Enron",
"com-amazon",
"com-youtube",
]
# NOTE(review): preprocess_graph and load_graph are not defined in this file
# (presumably they come from ../header.jl) — confirm what preprocessing does.
preprocess_graph(dataset)
data = load_graph(dataset)
A = data["A"]
@show dataset
# Prints size(A, 1) and nnz(A) ÷ 2; presumably the vertex count and the number
# of undirected edges if A is a symmetric adjacency matrix — TODO confirm.
@show size(A, 1), div(nnz(A), 2)

# Density-improvement run: print its "total_dt", "niter" and "optval" entries.
di_res = DSG_flow_density_improvement(A; flowtol=flowtol)
@show di_res["total_dt"], di_res["niter"], di_res["optval"]
# Binary-search run on the same matrix, reporting the same three entries.
bs_res = DSG_flow_binary_search(A; flowtol=flowtol)
@show bs_res["total_dt"], bs_res["niter"], bs_res["optval"]
end
10 changes: 10 additions & 0 deletions src/Exp6-Flow-vs-Greedy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Experiment 6: Comparison with Greedy Peeling on Detecting Planted Dense Structures
Run
```
julia -p X --project exp-synthetic.jl
```
where ```X``` is the number of workers you want to spawn, e.g. 10.

The generated hypergraph data are saved in ```~/local-DHSG/data/HSBM/```,
the results are saved in ```~/local-DHSG/results/HSBM/```, and
the plot is saved in ```~/local-DHSG/figs/HSBM/```.
148 changes: 148 additions & 0 deletions src/Exp6-Flow-vs-Greedy/bulkeval-synthetic.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""
    make_jobs(params, jobs)

Fill the `jobs` channel with work for the parallel workers.

Every tuple in `params` is enqueued unchanged, in order. Afterwards one
sentinel tuple `(-1, -1, -1, "")` is enqueued per entry of `workers()`, so
that each worker can read one sentinel and stop.
"""
function make_jobs(
    params::Vector{Tuple{Ti, Tf, Vector{Ti}, String}},
    jobs::RemoteChannel,
) where {Ti <: Integer, Tf}
    # Real work items first.
    foreach(job -> put!(jobs, job), params)
    # Then one stop marker for each worker process.
    for _ in workers()
        put!(jobs, (-1, -1, -1, ""))
    end
    return nothing
end


"""
    do_jobs(jobs, results, H, Ht)

Worker loop: repeatedly take a job `(i, ϵ, R, penaltyType)` from `jobs`, solve
it with the flow-based density-improvement solver and with greedy peeling, and
put `(i, flow_res, greedy_res)` on `results`.

The loop stops as soon as a job whose first entry is `-1` is taken. A job whose
`penaltyType` is neither `"fracvol"` nor `"vol"` raises an error.
"""
function do_jobs(
    jobs::RemoteChannel,
    results::RemoteChannel,
    H::SparseMatrixCSC{Tf, Ti},
    Ht::SparseMatrixCSC{Tf, Ti},
) where {Ti <: Integer, Tf}
    while true
        # jobid is the index used by the caller to file the result.
        jobid, ϵ, R, penaltyType = take!(jobs)
        jobid == -1 && break

        # The flow solver is run in "local" mode when ϵ >= 1.0, "global" otherwise.
        mode = ϵ >= 1.0 ? "local" : "global"

        if !(penaltyType in ("fracvol", "vol"))
            error("For this experiment, we use fracvol/vol penalty only.")
        end

        flow_res = ADHSG_flow_density_improvement(
            H, Ht, R, ϵ, penaltyType; flowtol=1e-8, type=mode,
        )
        # NOTE(review): the greedy baseline uses frac_volume_penalty for both
        # penalty types — confirm this is intended for "vol".
        ncol = size(H, 2)
        penalty = frac_volume_penalty(H, Ht, R, ncol, ϵ)
        greedy_res = greedy_peeling(H, Ht, penalty)

        put!(results, (jobid, flow_res, greedy_res))
    end
    return nothing
end


"""
    bulkeval_ADHSG(H, Ht, ϵ, cluster, Rs, penaltyType)

Solve the anchored densest subhypergraph problem for every reference set in
`Rs` in parallel, using both the flow-based solver and greedy peeling.

One job `(j, ϵ, Rs[j], penaltyType)` is created per reference set and handed to
the worker processes returned by `workers()`; results are collected in whatever
order they arrive and stored at index `j`.

# Arguments
- `H`, `Ht`: sparse matrices forwarded unchanged to the workers.
- `ϵ`: parameter forwarded to the solvers.
- `cluster`: ground-truth planted dense structure, used for the F1 scores.
- `Rs`: reference sets, one per trial.
- `penaltyType`: penalty name forwarded to the workers.

# Returns
A `Dict` with, for each of the prefixes `flow_` and `greedy_`, the per-trial
vectors `objs`, `sizes`, `Rprecisions`, `F1scores` and `dts`.
"""
function bulkeval_ADHSG(
    H::SparseMatrixCSC{Tf, Ti},
    Ht::SparseMatrixCSC{Tf, Ti},
    ϵ::Tf,
    cluster::Vector{Ti},
    Rs::Vector{Vector{Ti}},
    penaltyType::String,
) where {Ti <: Integer, Tf}
    ntrial = length(Rs)
    # Size the channels from the local trial count. The previous code read an
    # undefined `ntrials` here and only worked when the calling script happened
    # to define a global of that name with a matching value.
    # The job channel also needs room for one stop sentinel per worker.
    jobs = RemoteChannel(() -> Channel{Tuple}(ntrial + length(workers())))
    results = RemoteChannel(() -> Channel{Tuple}(ntrial))
    tasks = Tuple{Ti, Tf, Vector{Ti}, String}[
        (j, ϵ, Rs[j], penaltyType) for j in eachindex(Rs)
    ]
    make_jobs(tasks, jobs)
    for p in workers()
        remote_do(do_jobs, p, jobs, results, H, Ht)
    end

    flow_objs = zeros(Tf, ntrial)
    flow_sizes = zeros(Ti, ntrial)
    flow_Rprecisions = zeros(Tf, ntrial)
    flow_F1scores = zeros(Tf, ntrial)
    flow_dts = zeros(Tf, ntrial)
    greedy_objs = zeros(Tf, ntrial)
    greedy_sizes = zeros(Ti, ntrial)
    greedy_Rprecisions = zeros(Tf, ntrial)
    greedy_F1scores = zeros(Tf, ntrial)
    greedy_dts = zeros(Tf, ntrial)

    # Results arrive in completion order; the job index i says where to file them.
    njob = length(tasks)
    while njob > 0
        i, flow_res, greedy_res = take!(results)
        njob -= 1
        println("$njob left.")
        flow_objs[i] = flow_res["optval"]
        flow_sizes[i] = length(flow_res["optsol"])
        flow_Rprecisions[i] = precision(flow_res["optsol"], Rs[i])
        flow_F1scores[i] = F1score(cluster, flow_res["optsol"])
        flow_dts[i] = flow_res["total_dt"]
        greedy_objs[i] = greedy_res["optval"]
        greedy_sizes[i] = length(greedy_res["optsol"])
        greedy_Rprecisions[i] = precision(greedy_res["optsol"], Rs[i])
        greedy_F1scores[i] = F1score(cluster, greedy_res["optsol"])
        greedy_dts[i] = greedy_res["total_dt"]
    end
    return Dict(
        "flow_objs" => flow_objs,
        "flow_sizes" => flow_sizes,
        "flow_Rprecisions" => flow_Rprecisions,
        "flow_F1scores" => flow_F1scores,
        "flow_dts" => flow_dts,
        "greedy_objs" => greedy_objs,
        "greedy_sizes" => greedy_sizes,
        "greedy_Rprecisions" => greedy_Rprecisions,
        "greedy_F1scores" => greedy_F1scores,
        "greedy_dts" => greedy_dts,
    )
end


"""
    plot_ADHSG!(axis, ϵs, result, label, col; yscale = identity)

Draw one curve on `axis`: the row-wise mean of `result` against `ϵs`, as a line
with scatter markers, surrounded by a band of one standard error (`sem` of each
row) on either side. The band is clamped to the interval [0, 1].

`label` is attached to the line; `col` is used for the line and markers at full
opacity and for the band at opacity 0.4.
"""
function plot_ADHSG!(
    axis,
    ϵs::Vector{Tf},
    result,
    label,
    col;
    yscale = identity,
) where {Tf}
    rowmeans = mean(result, dims=2)[:, 1]
    lines!(axis, ϵs, rowmeans, label=label, color=(col, 1), yscale=yscale)

    nrows = size(result, 1)
    # Standard error of each row, then mean ∓ error clamped into [0, 1].
    rowerrs = [sem(result[r, :]) for r in 1:nrows]
    lower = Tf[max(rowmeans[r] - rowerrs[r], 0.0) for r in 1:nrows]
    upper = Tf[min(rowmeans[r] + rowerrs[r], 1.0) for r in 1:nrows]

    band!(axis, ϵs, upper, lower, color = (col, 0.4), yscale=yscale)
    return scatter!(axis, ϵs, rowmeans, color=(col, 1), yscale=yscale)
end

109 changes: 109 additions & 0 deletions src/Exp6-Flow-vs-Greedy/exp-synthetic.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
using Distributed
@everywhere begin
include("../header.jl")
include("gen-synthetic.jl")
include("bulkeval-synthetic.jl")
end

# Experiment parameters.
# n: vertices are indexed 1:n when the clusters are extracted from the labels.
n = 1000
# Number of clusters; one group of reference sets is generated per cluster.
ncluster = 30
# Passed to gen_cluster and used in the dataset name.
# NOTE(review): presumably an edge count — confirm against gen_cluster.
m2 = 50000
# Passed to hypergraph_SBM; NOTE(review): meaning not visible here — confirm.
hep1 = 0.8
hep2 = 0.8
# Seed-set size is round(seedratio * cluster size).
seedratio = 0.05
# round(Rratio * cluster size) is passed to generate_R;
# presumably the target size of R — TODO confirm.
Rratio = 1.5
# Number of reference sets R generated per cluster.
ntrials = 10
# Generation mode string handed to generate_R.
genType = "RW"

# The choice of ϵ is based on a few non-exhaustive observations,
# i.e. we looked at a small set of hypergraphs and decided
# which ϵ is best for each method.
# ϵs and penaltyTypes are paired by index: ϵs[i] is used with penaltyTypes[i].
ϵs = [1.0, 0.3]
penaltyTypes = ["fracvol", "vol"]

# Create the output directories up front, so that a missing directory cannot
# make matwrite fail after the expensive generation/solve work has been done.
mkpath(homedir()*"/local-DHSG/data/HSBM/")
mkpath(homedir()*"/local-DHSG/results/HSBM/")

# One synthetic hypergraph per value of m1: generate and save it, generate and
# save the reference sets, then evaluate every (ϵ, penalty type) pair per cluster.
for m1 = 5000:5000:70000

    # generate the hypergraph and save it
    # (the first value returned by gen_cluster is not needed here)
    _, cluster_edges, vlabel = gen_cluster(n, m2, ncluster)
    H_edges = hypergraph_SBM(n, ncluster, vlabel, m1, cluster_edges, hep1, hep2)
    # clusters[i] holds the vertices whose label is i
    clusters = Vector{Vector{Int64}}()
    for i = 1:ncluster
        push!(clusters, findall(x -> vlabel[x] == i, 1:n))
    end

    H = elist2inc(H_edges, n)
    Ht = sparse(H')
    stats(H, Ht)
    dataset = "HSBM-$n-$ncluster-$m1-$m2-$hep1-$hep2"
    matwrite(homedir()*"/local-DHSG/data/HSBM/"*dataset*".mat",
        Dict(
            "H" => H,
            "clusters" => clusters,
        )
    )

    # generate the Rs and save: ntrials reference sets per cluster, each grown
    # from a random seed set sampled without replacement from that cluster
    Rs_list = Vector{Vector{Int64}}[]
    for i = 1:ncluster
        C = clusters[i]
        Csz = length(C)
        Rs = Vector{Int64}[]
        for j = 1:ntrials
            seed = sample(C, round(Int64, seedratio*Csz), replace=false)
            R = generate_R(H, Ht, seed, round(Int64, Rratio*Csz), genType)
            push!(Rs, R)
        end
        push!(Rs_list, Rs)
    end

    matwrite(homedir()*"/local-DHSG/data/HSBM/"*dataset*"-Rs-$seedratio-$Rratio-$genType.mat",
        Dict(
            "Rs_list" => Rs_list,
        )
    )

    for i = eachindex(ϵs)
        ϵ = ϵs[i]
        penaltyType = penaltyTypes[i]
        for j = 1:ncluster
            res = bulkeval_ADHSG(H, Ht, ϵ, clusters[j], Rs_list[j], penaltyType)
            # The file name contains the literal characters "ϵ-" (no
            # interpolation); the plotting code reads the same name back, so
            # it must stay unchanged.
            matwrite(homedir()*"/local-DHSG/results/HSBM/"*dataset*"-$penaltyType-ϵ--cluster-$j.mat", res)
        end
    end
end

# Plot
# Four curves: the flow results and the greedy results for each of the two
# penalty types. Entry i of each array below describes curve i.
ϵs = [1.0, 0.3, 1.0, 0.3]
penaltyTypes = ["fracvol", "vol", "fracvol", "vol"]
penaltyLabels = ["FracVol", "Vol", "FracVol(Greedy)", "Vol(Greedy)"]
colors = [:red, :blue, :green, :orange]
type_prefix = ["flow_", "flow_", "greedy_", "greedy_"]
fig = Figure(figure_padding=1.5, resolution=(530,265))
axis = Axis(fig[1,1],
    xtickalign=1, ytickalign=1, ygridvisible=false, xgridvisible=false,
    xlabelpadding=10, ylabelpadding=10, ylabel="F1 Score",
    topspinevisible=false, rightspinevisible=false)

# x-axis values; entry j corresponds to the hypergraph generated with m1 = j * 5000.
ratio = Vector(0.1:0.1:1.4)
savepath=homedir()*"/local-DHSG/results/HSBM/"
for i in eachindex(colors)
    # One row per x-axis value, one column per (cluster, trial) pair.
    result = zeros(Float64, length(ratio), ncluster * ntrials)
    penaltyType = penaltyTypes[i]
    penaltyLabel = penaltyLabels[i]
    # Iterate over the x-axis values themselves instead of a hard-coded 1:14,
    # so the loop always matches the number of rows of `result`.
    for j in eachindex(ratio)
        m1 = j * 5000
        dataset = "HSBM-$n-$ncluster-$m1-$m2-$hep1-$hep2"
        for clusterid = 1:ncluster
            # The literal "ϵ-" in the file name matches the name used when the
            # results were written.
            res = matread(savepath*dataset*"-$penaltyType-ϵ--cluster-$clusterid.mat")
            f1score = res[type_prefix[i]*"F1scores"]
            result[j, (clusterid-1)*ntrials+1:clusterid * ntrials] .= f1score
        end
    end
    plot_ADHSG!(axis, ratio, result, penaltyLabel, colors[i]; yscale=identity)
end
axis.xticks = 0.0:0.25:1.5
axis.yticks = 0.0:0.2:0.8
axislegend(axis, position=(0.02, 0.7), merge = true, unique = true)
# Make sure the figure directory exists before saving.
mkpath(homedir()*"/local-DHSG/figs/HSBM/")
save(homedir()*"/local-DHSG/figs/HSBM/flow_vs_greedy_f1score-comparison.pdf", fig, pt_per_unit=1)
Loading

0 comments on commit 480e354

Please sign in to comment.