diff --git a/src/Exp5-BS-vs-DI-DSG/README.md b/src/Exp5-BS-vs-DI-DSG/README.md
new file mode 100644
index 0000000..250f658
--- /dev/null
+++ b/src/Exp5-BS-vs-DI-DSG/README.md
@@ -0,0 +1,5 @@
+# Experiment 5: Binary Search vs Density Improvement on ordinary graphs
+Run
+```
+julia --project di-vs-bs.jl
+```
\ No newline at end of file
diff --git a/src/Exp5-BS-vs-DI-DSG/di-vs-bs.jl b/src/Exp5-BS-vs-DI-DSG/di-vs-bs.jl
new file mode 100644
index 0000000..494b419
--- /dev/null
+++ b/src/Exp5-BS-vs-DI-DSG/di-vs-bs.jl
@@ -0,0 +1,22 @@
+include("../header.jl")
+
+flowtol = 1e-8
+
+for dataset in [
+    "ca-AstroPh",
+    "ca-HepPh",
+    "email-Enron",
+    "com-amazon",
+    "com-youtube",
+]
+    preprocess_graph(dataset)
+    data = load_graph(dataset)
+    A = data["A"]
+    @show dataset
+    @show size(A, 1), div(nnz(A), 2)
+
+    di_res = DSG_flow_density_improvement(A; flowtol=flowtol)
+    @show di_res["total_dt"], di_res["niter"], di_res["optval"]
+    bs_res = DSG_flow_binary_search(A; flowtol=flowtol)
+    @show bs_res["total_dt"], bs_res["niter"], bs_res["optval"]
+end
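A quick way to sanity-check the two solvers outside the full experiment is to run them on a tiny hand-built graph. The sketch below is not part of the patch: it assumes `header.jl` has been included, that both solvers accept any symmetric sparse adjacency matrix, and it only relies on the result keys (`"optval"`, `"niter"`, `"total_dt"`) that `di-vs-bs.jl` itself reads.

```julia
using SparseArrays

# Hypothetical sanity check: a 4-clique on vertices 1-4 plus a pendant path 4-5-6.
# The densest subgraph is the 4-clique, with density 6 edges / 4 vertices = 1.5.
edges = [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4), (4, 5), (5, 6)]
I = [e[1] for e in edges]
J = [e[2] for e in edges]
A = sparse([I; J], [J; I], ones(2 * length(edges)), 6, 6)  # symmetric adjacency

di_res = DSG_flow_density_improvement(A; flowtol=1e-8)
bs_res = DSG_flow_binary_search(A; flowtol=1e-8)
@show di_res["optval"], bs_res["optval"]  # both should be ≈ 1.5
```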
+""" +function bulkeval_ADHSG( + H::SparseMatrixCSC{Tf, Ti}, + Ht::SparseMatrixCSC{Tf, Ti}, + ϵ::Tf, + cluster::Vector{Ti}, + Rs::Vector{Vector{Ti}}, + penaltyType::String, +) where {Ti <: Integer, Tf} + ntrial = length(Rs) + jobs = RemoteChannel(() -> Channel{Tuple}(ntrials + length(workers()))) + results = RemoteChannel(() -> Channel{Tuple}(ntrials)) + tasks = Tuple{Ti, Tf, Vector{Ti}, String}[] + for j = 1:length(Rs) + push!(tasks, (j, ϵ, Rs[j], penaltyType)) + end + make_jobs(tasks, jobs) + for p in workers() + remote_do(do_jobs, p, jobs, results, H, Ht) + end + njob = length(tasks) + flow_objs = zeros(Tf, ntrial) + flow_sizes = zeros(Ti, ntrial) + flow_Rprecisions = zeros(Tf, ntrial) + flow_F1scores = zeros(Tf, ntrial) + flow_dts = zeros(Tf, ntrial) + greedy_objs = zeros(Tf, ntrial) + greedy_sizes = zeros(Ti, ntrial) + greedy_Rprecisions = zeros(Tf, ntrial) + greedy_F1scores = zeros(Tf, ntrial) + greedy_dts = zeros(Tf, ntrial) + while njob > 0 + i, flow_res, greedy_res = take!(results) + njob -= 1 + println("$njob left.") + flow_objs[i] = flow_res["optval"] + flow_sizes[i] = length(flow_res["optsol"]) + flow_Rprecisions[i] = precision(flow_res["optsol"], Rs[i]) + flow_F1scores[i] = F1score(cluster, flow_res["optsol"]) + flow_dts[i] = flow_res["total_dt"] + greedy_objs[i] = greedy_res["optval"] + greedy_sizes[i] = length(greedy_res["optsol"]) + greedy_Rprecisions[i] = precision(greedy_res["optsol"], Rs[i]) + greedy_F1scores[i] = F1score(cluster, greedy_res["optsol"]) + greedy_dts[i] = greedy_res["total_dt"] + end + return Dict( + "flow_objs" => flow_objs, + "flow_sizes" => flow_sizes, + "flow_Rprecisions" => flow_Rprecisions, + "flow_F1scores" => flow_F1scores, + "flow_dts" => flow_dts, + "greedy_objs" => greedy_objs, + "greedy_sizes" => greedy_sizes, + "greedy_Rprecisions" => greedy_Rprecisions, + "greedy_F1scores" => greedy_F1scores, + "greedy_dts" => greedy_dts, + ) +end + + +""" + plot_ADHSG!(axis, ϵs, result, label, col; yscale = identity) + +Plot the F1 score for each ϵ. Show the mean and standard error. +""" +function plot_ADHSG!( + axis, + ϵs::Vector{Tf}, + result, + label, + col; + yscale = identity, +) where{Tf} + result_mean = mean(result, dims=2)[:, 1] + lines!(axis, ϵs, result_mean, label=label, color=(col, 1), yscale=yscale) + n = size(result, 1) + lowcurve = zeros(Tf, n) + upcurve = zeros(Tf, n) + for i = 1:n + stderr = sem(result[i, :]) + low = max(result_mean[i] - stderr, 0.0) + up = min(result_mean[i] + stderr, 1.0) + lowcurve[i] = low + upcurve[i] = up + end + band!(axis, ϵs, upcurve, lowcurve, color = (col, 0.4), yscale=yscale) + scatter!(axis, ϵs, result_mean, color=(col, 1), yscale=yscale) +end + diff --git a/src/Exp6-Flow-vs-Greedy/exp-synthetic.jl b/src/Exp6-Flow-vs-Greedy/exp-synthetic.jl new file mode 100644 index 0000000..abe44f3 --- /dev/null +++ b/src/Exp6-Flow-vs-Greedy/exp-synthetic.jl @@ -0,0 +1,109 @@ +using Distributed +@everywhere begin + include("../header.jl") + include("gen-synthetic.jl") + include("bulkeval-synthetic.jl") +end + +n = 1000 +ncluster = 30 +m2 = 50000 +hep1 = 0.8 +hep2 = 0.8 +seedratio = 0.05 +Rratio = 1.5 +ntrials = 10 +genType = "RW" + +# pick of epsilon is based on some non-exhausting observations +# i.e. 
diff --git a/src/Exp6-Flow-vs-Greedy/exp-synthetic.jl b/src/Exp6-Flow-vs-Greedy/exp-synthetic.jl
new file mode 100644
index 0000000..abe44f3
--- /dev/null
+++ b/src/Exp6-Flow-vs-Greedy/exp-synthetic.jl
@@ -0,0 +1,109 @@
+using Distributed
+@everywhere begin
+    include("../header.jl")
+    include("gen-synthetic.jl")
+    include("bulkeval-synthetic.jl")
+end
+
+n = 1000
+ncluster = 30
+m2 = 50000
+hep1 = 0.8
+hep2 = 0.8
+seedratio = 0.05
+Rratio = 1.5
+ntrials = 10
+genType = "RW"
+
+# The choice of ϵ for each penalty is based on non-exhaustive observations:
+# we looked at a small set of hypergraphs and picked the ϵ that works best
+# for each method.
+ϵs = [1.0, 0.3]
+penaltyTypes = ["fracvol", "vol"]
+
+for m1 = 5000:5000:70000
+
+    # generate the hypergraph and save it
+    cluster_vertices, cluster_edges, vlabel = gen_cluster(n, m2, ncluster)
+    H_edges = hypergraph_SBM(n, ncluster, vlabel, m1, cluster_edges, hep1, hep2)
+    clusters = Vector{Vector{Int64}}()
+    for i = 1:ncluster
+        C = findall(x -> vlabel[x] == i, 1:n)
+        push!(clusters, C)
+    end
+
+    H = elist2inc(H_edges, n)
+    Ht = sparse(H')
+    stats(H, Ht)
+    dataset = "HSBM-$n-$ncluster-$m1-$m2-$hep1-$hep2"
+    matwrite(homedir()*"/local-DHSG/data/HSBM/"*dataset*".mat",
+        Dict(
+            "H" => H,
+            "clusters" => clusters,
+        )
+    )
+
+    # generate the Rs and save them
+    Rs_list = Vector{Vector{Int64}}[]
+    for i = 1:ncluster
+        C = clusters[i]
+        Csz = length(C)
+        Rs = Vector{Int64}[]
+        for j = 1:ntrials
+            seed = sample(C, Int64(round(seedratio*Csz)), replace=false)
+            R = generate_R(H, Ht, seed, Int64(round(Rratio*Csz)), genType)
+            push!(Rs, R)
+        end
+        push!(Rs_list, Rs)
+    end
+
+    matwrite(homedir()*"/local-DHSG/data/HSBM/"*dataset*"-Rs-$seedratio-$Rratio-$genType.mat",
+        Dict(
+            "Rs_list" => Rs_list,
+        )
+    )
+
+    for i = eachindex(ϵs)
+        ϵ = ϵs[i]
+        penaltyType = penaltyTypes[i]
+        for j = 1:ncluster
+            res = bulkeval_ADHSG(H, Ht, ϵs[i], clusters[j], Rs_list[j], penaltyTypes[i])
+            matwrite(homedir()*"/local-DHSG/results/HSBM/"*dataset*"-$penaltyType-ϵ-$ϵ-cluster-$j.mat", res)
+        end
+    end
+end
+
+# Plot
+ϵs = [1.0, 0.3, 1.0, 0.3]
+penaltyTypes = ["fracvol", "vol", "fracvol", "vol"]
+penaltyLabels = ["FracVol", "Vol", "FracVol(Greedy)", "Vol(Greedy)"]
+colors = [:red, :blue, :green, :orange]
+type_prefix = ["flow_", "flow_", "greedy_", "greedy_"]
+fig = Figure(figure_padding=1.5, resolution=(530,265))
+axis = Axis(fig[1,1],
+    xtickalign=1, ytickalign=1, ygridvisible=false, xgridvisible=false,
+    xlabelpadding=10, ylabelpadding=10, ylabel="F1 Score",
+    topspinevisible=false, rightspinevisible=false)
+
+# x-axis values: m1/m2, i.e. the number of inter-cluster hyperedges relative
+# to the total number of intra-cluster hyperedges
+ratio = Vector(0.1:0.1:1.4)
+savepath = homedir()*"/local-DHSG/results/HSBM/"
+for i = 1:length(colors)
+    result = zeros(Float64, length(ratio), ncluster * ntrials)
+    penaltyType = penaltyTypes[i]
+    penaltyLabel = penaltyLabels[i]
+    ϵ = ϵs[i]
+    for j = 1:14
+        m1 = j * 5000
+        dataset = "HSBM-$n-$ncluster-$m1-$m2-$hep1-$hep2"
+        for clusterid = 1:ncluster
+            res = matread(savepath*dataset*"-$penaltyType-ϵ-$ϵ-cluster-$clusterid.mat")
+            f1score = res[type_prefix[i]*"F1scores"]
+            result[j, (clusterid-1)*ntrials+1:clusterid*ntrials] .= f1score
+        end
+    end
+    plot_ADHSG!(axis, ratio, result, penaltyLabel, colors[i]; yscale=identity)
+end
+axis.xticks = 0.0:0.25:1.5
+axis.yticks = 0.0:0.2:0.8
+axislegend(axis, position=(0.02, 0.7), merge = true, unique = true)
+save(homedir()*"/local-DHSG/figs/HSBM/flow_vs_greedy_f1score-comparison.pdf", fig, pt_per_unit=1)
\ No newline at end of file
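The F1 scores read back for the plot above come from `precision` and `F1score`, which `bulkeval-synthetic.jl` calls but which live in the shared project code rather than in this patch. For reference, a minimal sketch of the standard set-overlap definitions presumably intended (the names `overlap_precision`/`overlap_recall`/`overlap_F1` are hypothetical, and the project's own helpers may differ in argument order or edge-case handling):

```julia
# Sketch of standard set-overlap metrics, not project code.
overlap_precision(S, T) = length(intersect(S, T)) / length(S)
overlap_recall(S, T)    = length(intersect(S, T)) / length(T)

function overlap_F1(truth, S)
    p, r = overlap_precision(S, truth), overlap_recall(S, truth)
    return p + r == 0 ? 0.0 : 2 * p * r / (p + r)
end
```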
+""" +function _draw_with_duplicate_check(e, rng, elements, duplicates) + while true + val = rand(rng, elements) # draw a random elements + if duplicates == true + return val + else + duplicate = false + for curval in e + if val == curval # found a duplicate + duplicate = true + break + end + end + if duplicate + continue # continue the outer loop + else + return val + end + end + end +end + + +""" + random_hyperedge!(e, elements, max_size, p; rng, duplicates) + +Generate a random hyperedge where the hyperedge keeps growing with probability p. +If hyperedge size is already max_size, then we stop growing. +Vertices are drawn from elements. +""" +function random_hyperedge!(e, elements, max_size, p; rng, duplicates) + push!(e, rand(rng, elements)) + push!(e, _draw_with_duplicate_check(e, rng, elements, duplicates)) + for i in 3:max_size + rv = rand(rng) + if rv <= p + # then the hyperedge keeps growing... + push!(e, _draw_with_duplicate_check(e, rng, elements, duplicates)) + else + # we stop the hyperedge... + break + end + end + return e +end +""" + random_hyperedge(elements, max_size, p; [rng=GLBOAL_RNG, duplicates=false]) + +Generate a random hyperedge where the hyperedge keeps growing with probability p. +(Seeting p=0 will generate a random edge.) The hyperedge will grow up to max_size. +Elements of the hyperedge are chosen by `rand(rng, elements)`. + +We do not allow duplicate elements in the hyperedge if `duplicates` is false. +**This may cause problems for sets of small elements and cause the algorithm to run forever.** +""" +function random_hyperedge(elements, max_size::Int, p::Real; rng = Random.GLOBAL_RNG, duplicates::Bool=false) + e = Vector{eltype(elements)}() + return random_hyperedge!(e, elements, max_size, p; rng, duplicates) +end + +""" + random_hypergraph(n, nedges, hep; [rng = Random.GLOBAL_RNG, duplicates=false, max_size]) + random_hypergraph(elements, nedges, hep; [rng = Random.GLOBAL_RNG, duplicates=false, max_size]) + +Generate a random hypergraph where elements are chosen from 1:n. +- the default value of max_size is based on twice the expected length of the hyperedge continuation probability. + So if this is 0.4 (i.e. short hyperedges), then max_size = 2*ceil(Int, 0.4/0.6 ) = 4. +- duplicates refers to duplicate values within a hyperedge. This function may produce + duplicate hyperedges and does not check for uniqueness. If you need that, call + `unique!` on the final list of hyperedges. If you need unique sorted hyperedges, then + use `unique!(sort!.(random_hypergraph()))` +""" +function random_hypergraph(elements, nedges::Int, hep::Real; rng = Random.GLOBAL_RNG, duplicates::Bool=false, + max_size = 2+2*ceil(Int, hep/(1-hep))) + + if duplicates == false && max_size >= length(elements) + @warn "This may run forever because max_size is longer than the total list of elements but we don't allow duplicates" + end + edges = Vector{Vector{Int}}() + for i=1:nedges + push!(edges, random_hyperedge(elements, max_size, hep)) + end + return edges +end + +random_hypergraph(n::Int, nedges::Int, hep::Real; kwargs...) = random_hypergraph(1:n, nedges, hep; kwargs... ) + + +""" + planted_partition(n, m, nedges1, nedges2, hep1, hep2; [rng=Random.GLOBAL_RNG, kwargs...]) + +Generate a planted partition model with n vertices with a planted partition on the first +m of them (region: 1:m). + +We generate nedges1 in the full region and nedges2 in the planted region. 
+The edges in the full region have hyperedge continuation probability hep1 and the
+edges in the planted region have probability hep2.
+
+See random_hypergraph for other kwargs.
+"""
+function planted_partition(n, m, nedges1, nedges2, hep1, hep2; kwargs...)
+    m < n || throw(ArgumentError("m must be strictly less than n"))
+    edges1 = random_hypergraph(1:n, nedges1, hep1; kwargs...)
+    edges2 = random_hypergraph(1:m, nedges2, hep2; kwargs...)
+    edges = append!(edges1, edges2)
+    return unique!(sort!.(edges))
+end
+
+
+"""
+    hypergraph_SBM(n, ncluster, vlabel, nedges1, cluster_edges, hep1, hep2; kwargs...)
+
+Generate a hypergraph using a simplified SBM model.
+The hypergraph has n vertices, ncluster clusters, and
+nedges1 + sum(cluster_edges) hyperedges (before deduplication):
+nedges1 edges are drawn across the whole vertex set and
+cluster_edges[i] edges are drawn within cluster i.
+
+# Arguments
+n: number of vertices
+ncluster: number of clusters
+vlabel: cluster label for each vertex
+nedges1: number of edges between clusters
+cluster_edges: number of edges within each cluster
+hep1: hyperedge extension probability for edges between clusters
+hep2: hyperedge extension probability for edges within clusters
+"""
+function hypergraph_SBM(n, ncluster, vlabel, nedges1, cluster_edges, hep1, hep2; kwargs...)
+    @assert ncluster <= n "number of clusters must not exceed number of vertices"
+
+    # generate edges between clusters
+    edges = random_hypergraph(1:n, nedges1, hep1; kwargs...)
+    for i = 1:ncluster
+        # generate edges within each cluster
+        # vlabel is the cluster label for each vertex
+        # cluster_edges marks the number of edges within each cluster
+        C = findall(x->vlabel[x]==i, 1:n)
+        edgesi = random_hypergraph(C, cluster_edges[i], hep2; kwargs...)
+        append!(edges, edgesi)
+    end
+    return unique!(sort!.(edges))
+end
+
+
+"""
+    gen_cluster(n, m, ncluster)
+
+Generate the cluster sizes and edge counts for the hypergraph SBM:
+split n vertices and m hyperedges into ncluster clusters.
+Currently the edge count is proportional to the vertex count, which is a bit unrealistic.
+"""
+function gen_cluster(n, m, ncluster)
+    cluster_vertices = zeros(Int64, ncluster)
+    cluster_edges = zeros(Int64, ncluster)
+    vlabel = rand(1:ncluster, n)
+    elabel = rand(1:ncluster, m)
+    for i = 1:ncluster
+        cluster_vertices[i] = length(findall(x->vlabel[x]==i, 1:n))
+        cluster_edges[i] = length(findall(x->elabel[x]==i, 1:m))
+    end
+    return cluster_vertices, cluster_edges, vlabel
+end
+
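A small-scale usage sketch of the generators above, with toy sizes rather than the experiment's parameters (it assumes this file and the project `header.jl` are loaded). With continuation probability `hep`, each hyperedge starts at two vertices and gains another with probability `hep`, so its uncapped expected size is `2 + hep/(1 - hep)`.

```julia
using Statistics

# Toy-sized run of the SBM-style generator used in exp-synthetic.jl.
n, ncluster, m1, m2 = 200, 5, 500, 1000
cluster_vertices, cluster_edges, vlabel = gen_cluster(n, m2, ncluster)
edges = hypergraph_SBM(n, ncluster, vlabel, m1, cluster_edges, 0.8, 0.8)

length(edges)            # number of unique hyperedges (duplicates are removed)
mean(length.(edges))     # somewhat below 2 + 0.8/0.2 = 6 because of the max_size cap
maximum(length.(edges))  # bounded by the default max_size = 2 + 2*ceil(0.8/0.2) = 10
```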
+""" +function make_jobs( + params::Vector{Tuple{Ti, Ti, Tf, Vector{Ti}, String}}, + jobs::RemoteChannel, +) where {Ti <: Integer, Tf} + for t in params + put!(jobs, t) + end + for i = 1:length(workers()) + put!(jobs, (-1, -1, -1, -1, "")) + end +end + + +""" + do_jobs(jobs, results, H, Ht) + +Worker's job function, solving the anchored densest subhypergraph problem. +""" +function do_jobs( + jobs::RemoteChannel, + results::RemoteChannel, + H::SparseMatrixCSC{Tf, Ti}, + Ht::SparseMatrixCSC{Tf, Ti}, +) where {Ti <: Integer, Tf} + while true + # the i-th ϵ, j-th R + i, j, ϵ, R, penaltyType = take!(jobs) + # terminate when all jobs are done + if i == -1 + break + end + type = "global" + if ϵ >= 1.0 + type = "local" + end + if penaltyType in ["fracvol", "vol"] + res = ADHSG_flow_density_improvement(H, Ht, R, ϵ, penaltyType;flowtol=1e-8, type=type) + put!(results, (i, j, res)) + elseif penaltyType in ["WCE", "UCE"] + res = ADSG_flow_density_improvement(H, Ht, R, ϵ;flowtol=1e-8, type=type, expansion=penaltyType) + put!(results, (i, j, res)) + else + error("unsupported penalty type.") + end + end +end + + +""" + bulkeval_ADHSG(H, Ht, Rs, ϵs, penaltyType; [dataset="HSBM", savepath=homedir()*"/local-DHSG/results", filename="placeholder"] + +Function for solving the Anchored Densest HyperSubgraph Problem +for a bunch of ϵs and Rs in parallel. +""" +function bulkeval_ADHSG( + H::SparseMatrixCSC{Tf, Ti}, + Ht::SparseMatrixCSC{Tf, Ti}, + Rs::Any, + ϵs::Vector{Tf}, + penaltyType::String; + dataset::String="HSBM", + savepath::String=homedir()*"/local-DHSG/results", + filename::String="placeholder", +) where {Ti <: Integer, Tf} + ntrials = length(Rs) + nϵ = length(ϵs) + + # set up jobs and results remote channel + jobs = RemoteChannel(() -> Channel{Tuple}(ntrials * nϵ + length(workers()))) + results = RemoteChannel(() -> Channel{Tuple}(ntrials * nϵ)) + tasks = Tuple{Ti, Ti, Tf, Vector{Ti}, String}[] + for i = 1:nϵ + for j = 1:ntrials + push!(tasks, (i, j, ϵs[i], Rs[j], penaltyType)) + end + end + make_jobs(tasks, jobs) + for p in workers() + remote_do(do_jobs, p, jobs, results, H, Ht) + end + njob = length(tasks) + Volpenalizeddensities = zeros(Tf, nϵ, ntrials) + FracVolpenalizeddensities = zeros(Tf, nϵ, ntrials) + while njob > 0 + i, j, res = take!(results) + njob -= 1 + println("$njob left.") + order = H_order(Ht) + vol_penalty = volume_penalty(H, Rs[j], size(H, 2), ϵs[i]) + fracvol_penalty = frac_volume_penalty(H, Ht, Rs[j], size(H, 2), ϵs[i]) + Volpenalizeddensities[i, j] = edensity(H, order, vol_penalty, res["optsol"]) + FracVolpenalizeddensities[i, j] = edensity(H, order, fracvol_penalty, res["optsol"]) + end + res = Dict( + "epss" => ϵs, + "Rs" => Rs, + "ntrial" => length(Rs), + "Volpenalizeddensities" => Volpenalizeddensities, + "FractionalVolpenalizeddensities" => FracVolpenalizeddensities, + ) + # save the results + matwrite(savepath*"/"*dataset*"/"*filename*".mat", res) + return res +end + + +""" + bulk_generate_R(H, Ht, seedsize, Rsize, ntrial, genType; [dataset="HSBM", filefolder=homedir()*"/local-DHSG/data/", prob=prob]) + +Generate Rs for a hypergraph H. We randomly sample seedsize vertices as seeds, +then expand them to a R using method genType. We generate ntrial Rs. 
+""" +function bulk_generate_R( + H::SparseMatrixCSC{Tf, Ti}, + Ht::SparseMatrixCSC{Tf, Ti}, + seedsize::Ti = 5, + Rsize::Ti = 100, + ntrial::Ti = 10, + genType::String = "RW"; + dataset::String = "HSBM", + filefolder = homedir()*"/local-DHSG/data/", + prob::Tf = prob, +) where {Ti <: Integer, Tf <: AbstractFloat} + Rs = Vector{Ti}[] + m, n = size(H) + for i = 1:ntrial + seedset = sample(1:n, seedsize, replace=false) + R = generate_R(H, Ht, seedset, Rsize, genType; prob=prob) + push!(Rs, R) + end + savepath = filefolder * dataset * "/Rs-$seedsize-$Rsize-ntrial-$ntrial-$genType.mat" + # save the results + matwrite(savepath, Dict("Rs"=>Rs)) + return Rs +end + + +""" + plot_runningtime(ϵs, results, labels, cols, savepath; [yscale=identity]) + +Plot the mean and stderr of the results. +""" +function plot_objective_ratio( + ϵs::Vector{Tf}, + results::Vector{Any}, + labels, + cols, + savepath; + yscale = identity, +) where{Tf} + size_inches = (4, 3) + size_pt = 90 .* size_inches + fig = Figure(resolution = size_pt, fontsize = 16, figure_padding=1) + axis = Axis(fig[1, 1], ylabel="ratio between objective values", yscale=yscale) + for i = 1:length(labels) + result = results[i] + label = labels[i] + col = cols[i] + result_mean = mean(result, dims=2)[:, 1] + lines!(axis, ϵs, result_mean, label=label, color=(col, 1)) + scatter!(axis, ϵs, result_mean, color=(col, 1)) + n = size(result, 1) + lowcurve = zeros(Tf, n) + upcurve = zeros(Tf, n) + for i = 1:n + stderr = sem(result[i, :]) + low = result_mean[i] - stderr + up = min(1.0, result_mean[i] + stderr) + lowcurve[i] = low + upcurve[i] = up + end + band!(axis, ϵs, upcurve, lowcurve, color = (col, 0.2)) + end + axislegend(axis, position=:rb, merge = true, unique = true) + save(savepath, fig) +end \ No newline at end of file diff --git a/src/Exp7-obj-comparison/obj-comparison.jl b/src/Exp7-obj-comparison/obj-comparison.jl new file mode 100644 index 0000000..f74279e --- /dev/null +++ b/src/Exp7-obj-comparison/obj-comparison.jl @@ -0,0 +1,64 @@ +using Distributed +@everywhere begin + include("../header.jl") + include("bulkeval.jl") +end + +for dataset in [ + "walmart-trips", + "trivago-clicks", + "threads-ask-ubuntu", + "threads-math-sx", + ] + # preprocess and load data + preprocess_graph(dataset, true) + data = load_graph(dataset, true) + H = data["H"] + Ht = sparse(H') + + # show some basic stats + @show dataset, size(H) + stats(H, Ht) + + # generate 100 Rs with |R| = 200 expanded from 10 seeds + ntrial = 100 + ϵs = Vector(0.0:0.1:1.2) + seedsize = 10 + Rsize = 200 + + # generate R using RW-geo with stopping probability 0.3 + prob = 0.3 + genType = "RW-geo" + Rdataname = "Rs-$seedsize-$Rsize-ntrial-$ntrial-$genType" + datapath = homedir()*"/local-DHSG/data/"*dataset*"/"*Rdataname*".mat" + + # generate Rs + #Rs = bulk_generate_R(H, Ht, seedsize, Rsize, ntrial, genType; dataset=dataset, prob=prob) + + # evaluate running time + penaltyTypes = ["fracvol", "vol", "UCE", "WCE"] + Volpenalizeddensities_results = [] + FractionalVolpenalizeddensities_results = [] + for penaltyType in penaltyTypes + #res = bulkeval_ADHSG(H, Ht, Rs, ϵs, penaltyType; dataset=dataset, filename="$penaltyType-Rs-$seedsize-$Rsize-ntrial-$ntrial-$genType") + res = matread(homedir()*"/local-DHSG/results/"*dataset*"/$penaltyType-Rs-$seedsize-$Rsize-ntrial-$ntrial-$genType.mat") + push!(Volpenalizeddensities_results, res["Volpenalizeddensities"]) + push!(FractionalVolpenalizeddensities_results, res["FractionalVolpenalizeddensities"]) + end + + for i = [2, 3, 4, 1] + @. 
diff --git a/src/Exp7-obj-comparison/obj-comparison.jl b/src/Exp7-obj-comparison/obj-comparison.jl
new file mode 100644
index 0000000..f74279e
--- /dev/null
+++ b/src/Exp7-obj-comparison/obj-comparison.jl
@@ -0,0 +1,64 @@
+using Distributed
+@everywhere begin
+    include("../header.jl")
+    include("bulkeval.jl")
+end
+
+for dataset in [
+    "walmart-trips",
+    "trivago-clicks",
+    "threads-ask-ubuntu",
+    "threads-math-sx",
+]
+    # preprocess and load data
+    preprocess_graph(dataset, true)
+    data = load_graph(dataset, true)
+    H = data["H"]
+    Ht = sparse(H')
+
+    # show some basic stats
+    @show dataset, size(H)
+    stats(H, Ht)
+
+    # generate 100 Rs with |R| = 200 expanded from 10 seeds
+    ntrial = 100
+    ϵs = Vector(0.0:0.1:1.2)
+    seedsize = 10
+    Rsize = 200
+
+    # generate R using RW-geo with stopping probability 0.3
+    prob = 0.3
+    genType = "RW-geo"
+    Rdataname = "Rs-$seedsize-$Rsize-ntrial-$ntrial-$genType"
+    datapath = homedir()*"/local-DHSG/data/"*dataset*"/"*Rdataname*".mat"
+
+    # generate Rs (uncomment to regenerate instead of reading the saved results below)
+    #Rs = bulk_generate_R(H, Ht, seedsize, Rsize, ntrial, genType; dataset=dataset, prob=prob)
+
+    # evaluate the penalized densities for each penalty/expansion type
+    penaltyTypes = ["fracvol", "vol", "UCE", "WCE"]
+    Volpenalizeddensities_results = []
+    FractionalVolpenalizeddensities_results = []
+    for penaltyType in penaltyTypes
+        #res = bulkeval_ADHSG(H, Ht, Rs, ϵs, penaltyType; dataset=dataset, filename="$penaltyType-Rs-$seedsize-$Rsize-ntrial-$ntrial-$genType")
+        res = matread(homedir()*"/local-DHSG/results/"*dataset*"/$penaltyType-Rs-$seedsize-$Rsize-ntrial-$ntrial-$genType.mat")
+        push!(Volpenalizeddensities_results, res["Volpenalizeddensities"])
+        push!(FractionalVolpenalizeddensities_results, res["FractionalVolpenalizeddensities"])
+    end
+
+    # normalize by the FracVol baseline (index 1); it is divided by itself last,
+    # so the other series are divided by the original baseline values
+    for i = [2, 3, 4, 1]
+        @. FractionalVolpenalizeddensities_results[i] /= FractionalVolpenalizeddensities_results[1]
+    end
+
+    # normalize by the Vol baseline (index 2), using the same ordering trick
+    for i = [1, 3, 4, 2]
+        @. Volpenalizeddensities_results[i] /= Volpenalizeddensities_results[2]
+    end
+
+    # plot
+    penaltyNames = ["FracVol", "Vol", "UCE", "WCE"]
+    cols = [:red, :blue, :green, :purple]
+    savepath = homedir()*"/local-DHSG/figs/"*dataset*"/"*dataset*"-fractionalvolpenalized-density-comparison.pdf"
+    plot_objective_ratio(ϵs, FractionalVolpenalizeddensities_results, penaltyNames, cols, savepath)
+    savepath = homedir()*"/local-DHSG/figs/"*dataset*"/"*dataset*"-volpenalized-density-comparison.pdf"
+    plot_objective_ratio(ϵs, Volpenalizeddensities_results, penaltyNames, cols, savepath)
+end
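A side note on the two normalization loops in obj-comparison.jl: the division is done in place, so the baseline series (index 1 for FracVol, index 2 for Vol) must be divided by itself last, otherwise the later divisions would use an already-normalized baseline. A tiny illustration of the same pattern:

```julia
x = [2.0, 4.0, 8.0]
for i in [2, 3, 1]   # baseline is x[1]; normalize it last
    x[i] /= x[1]
end
x                    # == [1.0, 2.0, 4.0]
```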