upload code for additional experiments

luotuoqingshan · Feb 20, 2024 · 480e354 · 480e354
1 parent 7d5b65f
commit 480e354
Show file tree

Hide file tree

Showing 9 changed files with 716 additions and 0 deletions.
diff --git a/src/Exp5-BS-vs-DI-DSG/README.md b/src/Exp5-BS-vs-DI-DSG/README.md
@@ -0,0 +1,5 @@
+# Experiment 5: Binary Search vs Density Improvement on ordinary graphs
+Run
+```
+julia --project di-vs-bs.jl
+```
diff --git a/src/Exp5-BS-vs-DI-DSG/di-vs-bs.jl b/src/Exp5-BS-vs-DI-DSG/di-vs-bs.jl
@@ -0,0 +1,22 @@
+include("../header.jl")
+# Experiment 5 driver: run both DSG flow solvers on each dataset and print time, iterations and optimum.
+flowtol = 1e-8  # forwarded as the `flowtol` keyword to both solvers below
+
+for dataset in [
+    "ca-AstroPh",
+    "ca-HepPh",
+    "email-Enron",
+    "com-amazon",
+    "com-youtube",
+]
+    preprocess_graph(dataset)  # NOTE(review): presumably prepares the data read by load_graph — confirm
+    data = load_graph(dataset)
+    A = data["A"]  # presumably the sparse adjacency matrix — confirm
+    @show dataset
+    @show size(A, 1), div(nnz(A), 2)  # rows of A and half of its stored entries
+
+    di_res = DSG_flow_density_improvement(A; flowtol=flowtol)  # density-improvement solver
+    @show di_res["total_dt"], di_res["niter"], di_res["optval"]
+    bs_res = DSG_flow_binary_search(A; flowtol=flowtol)  # binary-search solver
+    @show bs_res["total_dt"], bs_res["niter"], bs_res["optval"] 
+end
diff --git a/src/Exp6-Flow-vs-Greedy/README.md b/src/Exp6-Flow-vs-Greedy/README.md
@@ -0,0 +1,10 @@
+# Experiment 6: Comparison with Greedy Peeling on Detecting Planted Dense Structures
+Run
+```
+julia -p X --project exp-synthetic.jl 
+```
+where ```X``` is the number of workers you want to spawn, e.g. 10.
+
+The generated hypergraph data are saved in ```~/local-DHSG/data/HSBM/```,
+the results in ```~/local-DHSG/results/HSBM/```, and
+the plot in ```~/local-DHSG/figs/HSBM/``` (all paths are relative to your home directory).
diff --git a/src/Exp6-Flow-vs-Greedy/bulkeval-synthetic.jl b/src/Exp6-Flow-vs-Greedy/bulkeval-synthetic.jl
@@ -0,0 +1,148 @@
+"""
+    make_jobs(params, jobs)
+
+Enqueue every parameter tuple of `params` on `jobs`, then append one stop marker
+`(-1, -1, -1, "")` per worker so that each worker loop can terminate.
+"""
+function make_jobs(
+    params::Vector{Tuple{Ti, Tf, Vector{Ti}, String}},
+    jobs::RemoteChannel,
+) where {Ti <: Integer, Tf}
+    foreach(job -> put!(jobs, job), params)
+    # Stop markers are enqueued after all real jobs, one for each worker.
+    stop_marker = (-1, -1, -1, "")
+    for _ in workers()
+        put!(jobs, stop_marker)
+    end
+end
+
+
+"""
+    do_jobs(jobs, results, H, Ht)
+
+Worker loop: take `(i, ϵ, R, penaltyType)` from `jobs` until a stop marker (`i == -1`) arrives,
+run the flow solver and greedy peeling, and put `(i, flow_res, greedy_res)` on `results`.
+"""
+function do_jobs(
+    jobs::RemoteChannel,
+    results::RemoteChannel,
+    H::SparseMatrixCSC{Tf, Ti},
+    Ht::SparseMatrixCSC{Tf, Ti},
+) where {Ti <: Integer, Tf}
+    while true
+        # `i` is the job index; it is sent back so the caller can match results to jobs
+        i, ϵ, R, penaltyType = take!(jobs)
+        i == -1 && break  # stop marker: no more work for this worker
+        # ϵ ≥ 1 passes type="local" to the flow solver, anything smaller type="global"
+        type = ϵ >= 1.0 ? "local" : "global"
+        if !(penaltyType in ["fracvol", "vol"])
+            error("For this experiment, we use fracvol/vol penalty only.")
+        end
+        # flow-based density improvement for the requested penalty type
+        flow_res = ADHSG_flow_density_improvement(
+            H, Ht, R, ϵ, penaltyType; flowtol=1e-8, type=type
+        )
+        # greedy peeling uses the penalty from frac_volume_penalty for both penalty types
+        n = size(H, 2)
+        p = frac_volume_penalty(H, Ht, R, n, ϵ)
+        greedy_res = greedy_peeling(H, Ht, p)
+        put!(results, (i, flow_res, greedy_res))
+    end
+end
+
+
+"""
+    bulkeval_ADHSG(H, Ht, ϵ, cluster, Rs, penaltyType)
+
+Solve the anchored densest subhypergraph problem for every `R` in `Rs` in parallel (flow solver
+and greedy peeling) and return a `Dict` of per-trial vectors keyed "flow_*" and "greedy_*".
+"""
+function bulkeval_ADHSG(
+    H::SparseMatrixCSC{Tf, Ti},
+    Ht::SparseMatrixCSC{Tf, Ti},
+    ϵ::Tf,
+    cluster::Vector{Ti},  # ground-truth planted cluster, used only for the F1 scores
+    Rs::Vector{Vector{Ti}},  # one job (trial) per entry
+    penaltyType::String,
+) where {Ti <: Integer, Tf}
+    ntrial = length(Rs)
+    # Capacities come from the local `ntrial`: all jobs plus one stop marker per worker fit.
+    jobs = RemoteChannel(() -> Channel{Tuple}(ntrial + length(workers())))
+    results = RemoteChannel(() -> Channel{Tuple}(ntrial))
+    tasks = Tuple{Ti, Tf, Vector{Ti}, String}[
+        (j, ϵ, Rs[j], penaltyType) for j in eachindex(Rs)
+    ]
+    make_jobs(tasks, jobs)
+    for p in workers()
+        remote_do(do_jobs, p, jobs, results, H, Ht)  # returns at once; output arrives via `results`
+    end
+    njob = length(tasks)  # number of results still outstanding
+    flow_objs = zeros(Tf, ntrial)
+    flow_sizes = zeros(Ti, ntrial)
+    flow_Rprecisions = zeros(Tf, ntrial)
+    flow_F1scores = zeros(Tf, ntrial)
+    flow_dts = zeros(Tf, ntrial)
+    greedy_objs = zeros(Tf, ntrial)
+    greedy_sizes = zeros(Ti, ntrial)
+    greedy_Rprecisions = zeros(Tf, ntrial)
+    greedy_F1scores = zeros(Tf, ntrial)
+    greedy_dts = zeros(Tf, ntrial)
+    while njob > 0
+        i, flow_res, greedy_res = take!(results)  # `i` is the trial the results belong to
+        njob -= 1
+        println("$njob left.")
+        flow_objs[i] = flow_res["optval"]
+        flow_sizes[i] = length(flow_res["optsol"])
+        flow_Rprecisions[i] = precision(flow_res["optsol"], Rs[i])
+        flow_F1scores[i] = F1score(cluster, flow_res["optsol"])
+        flow_dts[i] = flow_res["total_dt"]
+        greedy_objs[i] = greedy_res["optval"]
+        greedy_sizes[i] = length(greedy_res["optsol"])
+        greedy_Rprecisions[i] = precision(greedy_res["optsol"], Rs[i])
+        greedy_F1scores[i] = F1score(cluster, greedy_res["optsol"])
+        greedy_dts[i] = greedy_res["total_dt"]
+    end
+    return Dict(
+        "flow_objs" => flow_objs,
+        "flow_sizes" => flow_sizes,
+        "flow_Rprecisions" => flow_Rprecisions,
+        "flow_F1scores" => flow_F1scores,
+        "flow_dts" => flow_dts,
+        "greedy_objs" => greedy_objs,
+        "greedy_sizes" => greedy_sizes,
+        "greedy_Rprecisions" => greedy_Rprecisions,
+        "greedy_F1scores" => greedy_F1scores,
+        "greedy_dts" => greedy_dts,
+    )
+end
+
+
+"""
+    plot_ADHSG!(axis, ϵs, result, label, col; yscale = identity)
+
+Draw onto `axis` the row means of `result` against `ϵs` as a line with markers, plus a
+band of mean ± standard error clipped to [0, 1].
+Each row of `result` holds the samples for one x value.
+"""
+function plot_ADHSG!(
+    axis,
+    ϵs::Vector{Tf},
+    result,
+    label,
+    col;
+    yscale = identity,
+) where{Tf}
+    row_means = vec(mean(result, dims=2))
+    lines!(axis, ϵs, row_means, label=label, color=(col, 1), yscale=yscale)
+    nrow = size(result, 1)
+    # standard error of the mean, computed row by row
+    row_sems = [sem(result[i, :]) for i = 1:nrow]
+    # band edges: mean minus/plus one standard error,
+    # clipped from below at 0 and from above at 1
+    lower = Tf[max(row_means[i] - row_sems[i], 0.0) for i = 1:nrow]
+    upper = Tf[min(row_means[i] + row_sems[i], 1.0) for i = 1:nrow]
+    band!(axis, ϵs, upper, lower, color = (col, 0.4), yscale=yscale)
+    scatter!(axis, ϵs, row_means, color=(col, 1), yscale=yscale)
+end
+
+
diff --git a/src/Exp6-Flow-vs-Greedy/exp-synthetic.jl b/src/Exp6-Flow-vs-Greedy/exp-synthetic.jl
@@ -0,0 +1,109 @@
+using Distributed
+@everywhere begin  # load the shared code on every process
+    include("../header.jl")
+    include("gen-synthetic.jl")
+    include("bulkeval-synthetic.jl")
+end
+# Parameters of the synthetic benchmark.
+n = 1000  # vertices are numbered 1:n
+ncluster = 30  # cluster labels are 1:ncluster
+m2 = 50000  # passed to gen_cluster — presumably the number of intra-cluster hyperedges; confirm
+hep1 = 0.8  # passed to hypergraph_SBM; meaning not visible in this file
+hep2 = 0.8  # passed to hypergraph_SBM; meaning not visible in this file
+seedratio = 0.05  # fraction of each cluster sampled as the seed set
+Rratio = 1.5  # size argument of generate_R, as a multiple of the cluster size
+ntrials = 10  # number of sets R generated per cluster
+genType = "RW"  # mode passed to generate_R — presumably "random walk"; confirm
+
+# The choice of ϵ is based on non-exhaustive observations,
+# i.e. we looked at a small set of hypergraphs and decided
+# which ϵ is best for each method.
+ϵs = [1.0, 0.3]
+penaltyTypes = ["fracvol", "vol"]  # paired index-wise with ϵs
+
+for m1 = 5000:5000:70000  # m1 is passed to hypergraph_SBM — presumably the number of noise hyperedges; confirm
+
+    # generate the hypergraph and save it
+    cluster_vertices, cluster_edges, vlabel = gen_cluster(n, m2, ncluster)
+    H_edges = hypergraph_SBM(n, ncluster, vlabel, m1, cluster_edges, hep1, hep2) 
+    clusters = Vector{Vector{Int64}}()  # clusters[i] = vertices whose label is i
+    for i = 1:ncluster
+        C = findall(x -> vlabel[x] == i, 1:n)
+        push!(clusters, C)
+    end
+
+    H = elist2inc(H_edges, n)  # presumably the sparse incidence matrix built from the edge list — confirm
+    Ht = sparse(H')  # sparse copy of the transpose, passed alongside H below
+    stats(H, Ht)
+    dataset = "HSBM-$n-$ncluster-$m1-$m2-$hep1-$hep2"  # file-name stem; the plotting code rebuilds the same string
+    matwrite(homedir()*"/local-DHSG/data/HSBM/"*dataset*".mat",
+        Dict(
+            "H" => H,
+            "clusters" => clusters,
+        )
+    )
+
+    # generate the Rs and save
+    Rs_list = Vector{Vector{Int64}}[]  # Rs_list[i][j] = j-th set R generated for cluster i
+    for i = 1:ncluster
+        C = clusters[i]
+        Csz = length(C)
+        Rs = Vector{Int64}[]
+        for j = 1:ntrials
+            seed = sample(C, Int64(round(seedratio*Csz)), replace=false)  # seed vertices drawn from C without replacement
+            R = generate_R(H, Ht, seed, Int64(round(Rratio*Csz)), genType)
+            push!(Rs, R)
+        end
+        push!(Rs_list, Rs)
+    end
+
+    matwrite(homedir()*"/local-DHSG/data/HSBM/"*dataset*"-Rs-$seedratio-$Rratio-$genType.mat",
+        Dict(
+            "Rs_list" => Rs_list,
+        )
+    )
+    # evaluate every (ϵ, penalty type) pair on every cluster
+    for i = eachindex(ϵs) 
+        ϵ = ϵs[i]
+        penaltyType = penaltyTypes[i]
+        for j = 1:ncluster
+            res = bulkeval_ADHSG(H, Ht, ϵs[i], clusters[j], Rs_list[j], penaltyTypes[i])
+            matwrite(homedir()*"/local-DHSG/results/HSBM/"*dataset*"-$penaltyType-ϵ-$ϵ-cluster-$j.mat", res)  # one file per (penalty, ϵ, cluster)
+        end
+    end
+end
+
+# Plot the mean F1 score of each method against m1/m2, reading the result files written above.
+ϵs = [1.0, 0.3, 1.0, 0.3]  # one entry per plotted curve (replaces the two-element list used above)
+penaltyTypes = ["fracvol", "vol", "fracvol", "vol"]
+penaltyLabels = ["FracVol", "Vol", "FracVol(Greedy)", "Vol(Greedy)"]  # legend labels
+colors = [:red, :blue, :green, :orange]
+type_prefix = ["flow_", "flow_", "greedy_", "greedy_"]  # key prefix selecting flow or greedy results
+fig = Figure(figure_padding=1.5, resolution=(530,265))
+axis = Axis(fig[1,1], 
+  xtickalign=1, ytickalign=1, ygridvisible=false, xgridvisible=false,
+  xlabelpadding=10, ylabelpadding=10, ylabel="F1 Score",
+  topspinevisible=false, rightspinevisible=false) 
+
+ratio = Vector(0.1:0.1:1.4)  # x values; equal to m1/m2 for m1 = 5000:5000:70000 and m2 = 50000
+savepath=homedir()*"/local-DHSG/results/HSBM/"
+for i = 1:length(colors)
+    result = zeros(Float64, length(ratio), ncluster * ntrials)  # row j: F1 scores of all clusters and trials for the j-th m1
+    penaltyType = penaltyTypes[i]
+    penaltyLabel = penaltyLabels[i]
+    ϵ = ϵs[i]
+    for j = 1:14  # hard-coded: must equal length(ratio) and the number of m1 values generated above
+        m1 = j * 5000
+        dataset = "HSBM-$n-$ncluster-$m1-$m2-$hep1-$hep2"
+        for clusterid = 1:ncluster
+            res = matread(savepath*dataset*"-$penaltyType-ϵ-$ϵ-cluster-$clusterid.mat")
+            f1score = res[type_prefix[i]*"F1scores"]
+            result[j, (clusterid-1)*ntrials+1:clusterid * ntrials] .= f1score  # columns of this cluster's ntrials trials
+        end
+    end
+    plot_ADHSG!(axis, ratio, result, penaltyLabel, colors[i]; yscale=identity)
+end
+axis.xticks = 0.0:0.25:1.5
+axis.yticks = 0.0:0.2:0.8
+axislegend(axis, position=(0.02, 0.7), merge = true, unique = true)
+save(homedir()*"/local-DHSG/figs/HSBM/flow_vs_greedy_f1score-comparison.pdf", fig, pt_per_unit=1)