From ccbc6b31a101ad107974b8e29500e07d3da84c73 Mon Sep 17 00:00:00 2001
From: Charles Dufour <34485907+dufourc1@users.noreply.github.com>
Date: Thu, 21 Sep 2023 16:47:50 +0200
Subject: [PATCH] Bandwidth fix (#46)

* add test for float bandwidth

* add more tests for bandwith

* fix bug

* format

* fix docs

* refactor bandwidth conversion
---
 docs/make.jl                    | 35 ++++++++++++------------
 src/assignment.jl               | 18 ++++++-------
 src/config_rules/accept_rule.jl |  4 +--
 src/history.jl                  |  8 +++---
 src/optimize.jl                 | 48 +++++++++++----------------------
 src/proposal.jl                 | 12 ++++-----
 test/data_tests/utils.jl        | 10 +++----
 test/error_handling_tests.jl    | 20 +++++++-------
 test/oracle_bandwidth_test.jl   | 22 +++++++--------
 test/pipeline_test.jl           | 41 +++++++++++++++++-----------
 test/proposal_test.jl           |  2 +-
 test/simple_test_example.jl     | 14 +++++-----
 test/starting_labels_test.jl    |  6 ++---
 13 files changed, 119 insertions(+), 121 deletions(-)

diff --git a/docs/make.jl b/docs/make.jl
index 9c328d2..e4219ec 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -2,24 +2,25 @@ using NetworkHistogram
 using Documenter
 
 DocMeta.setdocmeta!(NetworkHistogram, :DocTestSetup, :(using NetworkHistogram);
-                    recursive = true)
+    recursive = true)
 
 makedocs(;
-         modules = [NetworkHistogram],
-         authors = "Jake Grainger, Charles Dufour",
-         #repo = "github.com/SDS-EPFL/NetworkHistogram.jl.git",
-         sitename = "NetworkHistogram.jl",
-         #format = Documenter.HTML(;
-         #                         prettyurls = get(ENV, "CI", "false") == "true",
-         #                         canonical = "https://SDS-EPFL.github.io/NetworkHistogram.jl",
-         #                         edit_link = "main",
-         #                         assets = String[]),
-         pages = [
-             "Home" => "index.md",
-             "API Reference" => "api.md",
-             "Optimization hyperparameters" => "rules.md",
-             "Development" => "internals.md",
-         ])
+    modules = [NetworkHistogram],
+    authors = "Jake Grainger, Charles Dufour",
+    #repo = "github.com/SDS-EPFL/NetworkHistogram.jl.git",
+    sitename = "NetworkHistogram.jl",
+    #format = Documenter.HTML(;
+    #                         prettyurls = get(ENV, "CI", "false") == "true",
+    #                         canonical = "https://SDS-EPFL.github.io/NetworkHistogram.jl",
+    #                         edit_link = "main",
+    #                         assets = String[]),
+    pages = [
+        "Home" => "index.md",
+        "API Reference" => "api.md",
+        "Optimization hyperparameters" => "rules.md",
+        "Development" => "internals.md",
+    ],
+    checkdocs = :none)
 
 deploydocs(;
-           repo = "github.com/SDS-EPFL/NetworkHistogram.jl.git")
+    repo = "github.com/SDS-EPFL/NetworkHistogram.jl.git")
diff --git a/src/assignment.jl b/src/assignment.jl
index 540db81..b191acc 100644
--- a/src/assignment.jl
+++ b/src/assignment.jl
@@ -31,14 +31,14 @@ mutable struct Assignment{T}
 
         estimated_theta = realized ./ counts
         likelihood = compute_log_likelihood(number_groups, estimated_theta, counts,
-                                            size(A, 1))
+            size(A, 1))
 
         new{T}(group_size,
-               node_labels,
-               counts,
-               realized,
-               estimated_theta,
-               likelihood)
+            node_labels,
+            counts,
+            realized,
+            estimated_theta,
+            likelihood)
     end
 end
 
@@ -84,9 +84,9 @@ where ``\\hat{\\theta}_{ab}`` is the estimated probability of an edge between co
 """
 function compute_log_likelihood(assignment::Assignment)
     compute_log_likelihood(length(assignment.group_size),
-                           assignment.estimated_theta,
-                           assignment.counts,
-                           sum(assignment.group_size))
+        assignment.estimated_theta,
+        assignment.counts,
+        sum(assignment.group_size))
 end
 
 function deepcopy!(a::Assignment, b::Assignment)
diff --git a/src/config_rules/accept_rule.jl b/src/config_rules/accept_rule.jl
index 1d7db37..4960d1c 100644
--- a/src/config_rules/accept_rule.jl
+++ b/src/config_rules/accept_rule.jl
@@ -15,8 +15,8 @@ Return the updated `current` assignment based on the `accept_rule`.
 accept_reject_update!
 
 function accept_reject_update!(history::GraphOptimizationHistory, iteration::Int,
-                               proposal::Assignment,
-                               current::Assignment, accept_rule::Strict)
+    proposal::Assignment,
+    current::Assignment, accept_rule::Strict)
     if proposal.likelihood > current.likelihood
         deepcopy!(current, proposal)
     end
diff --git a/src/history.jl b/src/history.jl
index 8cf0e59..381f175 100644
--- a/src/history.jl
+++ b/src/history.jl
@@ -15,10 +15,10 @@ end
 """
 function initialize_history(best, current, proposal, ::Val{true})
     history = MVHistory(Dict([
-                                 :proposal_likelihood => QHistory(Float64),
-                                 :current_likelihood => QHistory(Float64),
-                                 :best_likelihood => QHistory(Float64),
-                             ]))
+        :proposal_likelihood => QHistory(Float64),
+        :current_likelihood => QHistory(Float64),
+        :best_likelihood => QHistory(Float64),
+    ]))
     push!(history, :proposal_likelihood, 0, proposal.likelihood)
     push!(history, :current_likelihood, 0, current.likelihood)
     push!(history, :best_likelihood, 0, best.likelihood)
diff --git a/src/optimize.jl b/src/optimize.jl
index 6fc09ab..f432c0d 100644
--- a/src/optimize.jl
+++ b/src/optimize.jl
@@ -18,8 +18,8 @@ Computes the graph histogram approximation.
 # Arguments
 - `A`: adjacency matrix of a simple graph
 
-- `h`: bandwidth of the graph histogram (number of nodes in a group or percentage of nodes in a
-    group)
+- `h`: bandwidth of the graph histogram (number of nodes in a group or percentage (in [0,1]) of
+    nodes in a group)
 
 - `record_trace` (optional): whether to record the trace of the optimization process and return
     it as part of the output. Default is `true`.
@@ -55,18 +55,17 @@ julia> loglikelihood = out.likelihood
 ```
 """
 function graphhist(A; h = select_bandwidth(A), maxitr = 1000,
-                   swap_rule::NodeSwapRule = RandomNodeSwap(),
-                   starting_assignment_rule::StartingAssignment = RandomStart(),
-                   accept_rule::AcceptRule = Strict(),
-                   stop_rule::StopRule = PreviousBestValue(3), record_trace = true)
+    swap_rule::NodeSwapRule = RandomNodeSwap(),
+    starting_assignment_rule::StartingAssignment = RandomStart(),
+    accept_rule::AcceptRule = Strict(),
+    stop_rule::StopRule = PreviousBestValue(3), record_trace = true)
     checkadjacency(A)
-    h = sanitize_bandwidth(h, size(A, 1))
     @assert maxitr > 0
 
     return _graphhist(A, Val{record_trace}(), h = h, maxitr = maxitr, swap_rule = swap_rule,
-                      starting_assignment_rule = starting_assignment_rule,
-                      accept_rule = accept_rule,
-                      stop_rule = stop_rule)
+        starting_assignment_rule = starting_assignment_rule,
+        accept_rule = accept_rule,
+        stop_rule = stop_rule)
 end
 
 """
@@ -75,9 +74,9 @@ end
 Internal version of `graphhist` which is type stable.
 """
 function _graphhist(A, record_trace = Val{true}(); h, maxitr, swap_rule,
-                    starting_assignment_rule, accept_rule, stop_rule)
+    starting_assignment_rule, accept_rule, stop_rule)
     best, current, proposal, history = initialize(A, h, starting_assignment_rule,
-                                                  record_trace)
+        record_trace)
 
     for i in 1:maxitr
         proposal = create_proposal!(history, i, proposal, current, A, swap_rule)
@@ -104,8 +103,8 @@ function graphhist_format_output(best, history::NoTraceHistory)
 end
 
 function update_best!(history::GraphOptimizationHistory, iteration::Int,
-                      current::Assignment,
-                      best::Assignment)
+    current::Assignment,
+    best::Assignment)
     if current.likelihood > best.likelihood
         update_best!(history, iteration, current.likelihood)
         deepcopy!(best, current)
@@ -127,9 +126,9 @@ function initialize(A, h, starting_assignment_rule, record_trace)
     return best, current, proposal, history
 end
 
-function select_bandwidth(A, type = "degs", alpha = 1, c = 1)
+function select_bandwidth(A, type = "degs", alpha = 1, c = 1)::Int
     h = oracle_bandwidth(A, type, alpha, c)
-    return sanitize_bandwidth(h, size(A, 1))
+    return max(2, min(size(A)[1], round(Int, h)))
 end
 
 """
@@ -154,7 +153,7 @@ function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1
 
     n = size(A, 1)
     midPt = collect(max(1, round(Int, (n ÷ 2 - c * sqrt(n)))):round(Int,
-                                                                    (n ÷ 2 + c * sqrt(n))))
+        (n ÷ 2 + c * sqrt(n))))
     rhoHat_inv = inv(sum(A) / (n * (n - 1)))
 
     # Rank-1 graphon estimate via fhat(x,y) = mult*u(x)*u(y)*pinv(rhoHat);
@@ -180,18 +179,3 @@ function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1
     #estMSqrd = 2*mult^2*(lmfit_coef[2]*length(uMid)/2+lmfit_coef[1])^2*lmfit_coef[2]^2*rhoHat_inv^2*(n+1)^2
     return h[1]
 end
-
-function sanitize_bandwidth(h::Real, n::Int)::Int
-    h = max(2, min(n, round(Int, h)))
-    lastGroupSize = n % h
-
-    if lastGroupSize == 1
-        @warn "Correcting bandwidth to avoid singleton final group."
-    end
-    # step down h, to avoid singleton final group
-    while lastGroupSize == 1 && h > 2
-        h -= 1
-        lastGroupSize = n % h
-    end
-    return h
-end
diff --git a/src/proposal.jl b/src/proposal.jl
index 230a222..f4fb594 100644
--- a/src/proposal.jl
+++ b/src/proposal.jl
@@ -12,8 +12,8 @@ proposal is stored in the history.
     The `proposal` assignment is modified in place to avoid unnecessary memory allocation.
 """
 function create_proposal!(history::GraphOptimizationHistory, iteration::Int,
-                          proposal::Assignment,
-                          current::Assignment, A, swap_rule)
+    proposal::Assignment,
+    current::Assignment, A, swap_rule)
     swap = select_swap(current, A, swap_rule)
     make_proposal!(proposal, current, swap, A)
     update_proposal!(history, iteration, proposal.likelihood)
@@ -78,20 +78,20 @@ function update_observed!(proposal::Assignment, swap::Tuple{Int, Int}, A)
         if A[i, swap[1]] == 1
             proposal.realized[group_node_1, group_i] -= 1
             proposal.realized[group_i, group_node_1] = proposal.realized[group_node_1,
-                                                                         group_i]
+                group_i]
 
             proposal.realized[group_node_2, group_i] += 1
             proposal.realized[group_i, group_node_2] = proposal.realized[group_node_2,
-                                                                         group_i]
+                group_i]
         end
         if A[i, swap[2]] == 1
             proposal.realized[group_node_2, group_i] -= 1
             proposal.realized[group_i, group_node_2] = proposal.realized[group_node_2,
-                                                                         group_i]
+                group_i]
 
             proposal.realized[group_node_1, group_i] += 1
             proposal.realized[group_i, group_node_1] = proposal.realized[group_node_1,
-                                                                         group_i]
+                group_i]
         end
     end
 
diff --git a/test/data_tests/utils.jl b/test/data_tests/utils.jl
index feac516..d37db8a 100644
--- a/test/data_tests/utils.jl
+++ b/test/data_tests/utils.jl
@@ -1,13 +1,13 @@
 @testset "Data utils" begin
     A = [0 0 0 1
-         0 0 0 0
-         1 0 0 1
-         0 0 1 0]
+        0 0 0 0
+        1 0 0 1
+        0 0 1 0]
 
     @testset "drop isolated vertices" begin
         B = NetworkHistogram.drop_isolated_vertices(A)
         @test B == [0 1 1
-                    1 0 1
-                    0 1 0]
+            1 0 1
+            0 1 0]
     end
 end
diff --git a/test/error_handling_tests.jl b/test/error_handling_tests.jl
index 92f772b..76fae2b 100644
--- a/test/error_handling_tests.jl
+++ b/test/error_handling_tests.jl
@@ -2,11 +2,11 @@
     @testset "Adjacency matrix" begin
         As = [
             [0 1
-             0 0], [1 1
-                    1 0], [0 2
-                           2 0], [0 1
-                                  1 0
-                                  0 1],
+                0 0], [1 1
+                1 0], [0 2
+                2 0], [0 1
+                1 0
+                0 1],
         ]
         for A in As
             @test_throws AssertionError graphhist(A, h = 2)
@@ -17,9 +17,11 @@
     end
     @testset "maxitr" begin
         @test_throws AssertionError graphhist([0 1; 1 0], h = 2,
-                                              maxitr = -1)
+            maxitr = -1)
+    end
+    @testset "h" begin
+        for h in (3, -1, 1.1, -0.1)
+            @test_throws AssertionError graphhist([0 1; 1 0], h = h)
+        end
     end
-    @testset "h" begin for h in (3, -1, 1.1, -0.1)
-        @test_throws AssertionError graphhist([0 1; 1 0], h = h)
-    end end
 end
diff --git a/test/oracle_bandwidth_test.jl b/test/oracle_bandwidth_test.jl
index e19d1bc..27e1f26 100644
--- a/test/oracle_bandwidth_test.jl
+++ b/test/oracle_bandwidth_test.jl
@@ -1,18 +1,18 @@
 @testset "oracle bandwidth test" begin
     A = [0 0 1 0 1 0 1 1 0 1
-         0 0 1 1 1 1 1 1 0 0
-         1 1 0 1 0 0 0 0 1 0
-         0 1 1 0 1 0 1 0 0 0
-         1 1 0 1 0 0 1 0 0 1
-         0 1 0 0 0 0 0 1 0 0
-         1 1 0 1 1 0 0 1 0 1
-         1 1 0 0 0 1 1 0 0 1
-         0 0 1 0 0 0 0 0 0 1
-         1 0 0 0 1 0 1 1 1 0]
+        0 0 1 1 1 1 1 1 0 0
+        1 1 0 1 0 0 0 0 1 0
+        0 1 1 0 1 0 1 0 0 0
+        1 1 0 1 0 0 1 0 0 1
+        0 1 0 0 0 0 0 1 0 0
+        1 1 0 1 1 0 0 1 0 1
+        1 1 0 0 0 1 1 0 0 1
+        0 0 1 0 0 0 0 0 0 1
+        1 0 0 0 1 0 1 1 1 0]
     h = NetworkHistogram.oracle_bandwidth(A)
     rho = sum(A) / (size(A, 1) * (size(A, 1) - 1))
     h_true_nethist = 2.643731 # version 0.2.3 from nethist package
+    h_clean = 3
     @test h≈h_true_nethist atol=1e-4
-    h_clean = NetworkHistogram.sanitize_bandwidth(h, size(A, 1))
-    @test h_clean == 2
+    @test NetworkHistogram.select_bandwidth(A) == h_clean
 end
diff --git a/test/pipeline_test.jl b/test/pipeline_test.jl
index 98bc38d..6470297 100644
--- a/test/pipeline_test.jl
+++ b/test/pipeline_test.jl
@@ -1,16 +1,27 @@
 @testset "Pipeline" begin
     @testset "dummy run" begin
         A = [0 0 1 0 1 0 1 1 0 1
-             0 0 1 1 1 1 1 1 0 0
-             1 1 0 1 0 0 0 0 1 0
-             0 1 1 0 1 0 1 0 0 0
-             1 1 0 1 0 0 1 0 0 1
-             0 1 0 0 0 0 0 1 0 0
-             1 1 0 1 1 0 0 1 0 1
-             1 1 0 0 0 1 1 0 0 1
-             0 0 1 0 0 0 0 0 0 1
-             1 0 0 0 1 0 1 1 1 0]
-        estimated = graphhist(A; h = 0.5)
+            0 0 1 1 1 1 1 1 0 0
+            1 1 0 1 0 0 0 0 1 0
+            0 1 1 0 1 0 1 0 0 0
+            1 1 0 1 0 0 1 0 0 1
+            0 1 0 0 0 0 0 1 0 0
+            1 1 0 1 1 0 0 1 0 1
+            1 1 0 0 0 1 1 0 0 1
+            0 0 1 0 0 0 0 0 0 1
+            1 0 0 0 1 0 1 1 1 0]
+        @testset "run bandwidth float" begin
+            estimated = graphhist(A; h = 0.5)
+            @test all(estimated.graphhist.θ .>= 0.0)
+            @test all(estimated.graphhist.θ .<= 1.0)
+            @test size(estimated.graphhist.θ) == (2, 2)
+        end
+        @testset "run bandwidth int" begin
+            estimated = graphhist(A; h = 5)
+            @test all(estimated.graphhist.θ .>= 0.0)
+            @test all(estimated.graphhist.θ .<= 1.0)
+            @test size(estimated.graphhist.θ) == (2, 2)
+        end
     end
 
     @testset "associative stochastic block model" begin
@@ -19,13 +30,13 @@
         for (name, adjacency) in adjacencies
             @testset "$name" begin
                 estimated, history = graphhist(adjacency; h = 0.3,
-                                               stop_rule = PreviousBestValue(100),
-                                               starting_assignment_rule = OrderedStart())
+                    stop_rule = PreviousBestValue(100),
+                    starting_assignment_rule = OrderedStart())
                 @test all(estimated.θ .>= 0.0)
                 estimated, history = graphhist(adjacency; h = 0.3,
-                                               stop_rule = PreviousBestValue(100),
-                                               starting_assignment_rule = OrderedStart(),
-                                               record_trace = false)
+                    stop_rule = PreviousBestValue(100),
+                    starting_assignment_rule = OrderedStart(),
+                    record_trace = false)
                 @test all(estimated.θ .>= 0.0)
             end
         end
diff --git a/test/proposal_test.jl b/test/proposal_test.jl
index 31bffca..0328b8e 100644
--- a/test/proposal_test.jl
+++ b/test/proposal_test.jl
@@ -5,7 +5,7 @@
     proposal = deepcopy(assignment)
     NetworkHistogram.make_proposal!(proposal, assignment, swap, A)
     reference_proposal = NetworkHistogram.Assignment(A, [1, 2, 1, 1, 1, 2, 2, 2],
-                                                     group_size)
+        group_size)
 
     @testset "update labels" begin
         @test proposal.node_labels[swap[1]] == reference_proposal.node_labels[swap[1]] == 2
diff --git a/test/simple_test_example.jl b/test/simple_test_example.jl
index 1fbb49a..cc40bc8 100644
--- a/test/simple_test_example.jl
+++ b/test/simple_test_example.jl
@@ -6,13 +6,13 @@ Returns A, node_labels, group_size, assignment
 """
 function make_simple_example()
     A = [0 1 1 1 0 0 1 0
-         1 0 1 1 0 0 0 0
-         1 1 0 0 0 0 0 0
-         1 1 0 0 0 0 0 1
-         0 0 0 0 0 1 1 1
-         0 0 0 0 1 0 1 1
-         1 0 0 0 1 1 0 0
-         0 0 0 1 1 1 0 0]
+        1 0 1 1 0 0 0 0
+        1 1 0 0 0 0 0 0
+        1 1 0 0 0 0 0 1
+        0 0 0 0 0 1 1 1
+        0 0 0 0 1 0 1 1
+        1 0 0 0 1 1 0 0
+        0 0 0 1 1 1 0 0]
     node_labels = [1, 1, 1, 1, 2, 2, 2, 2]
     group_size = NetworkHistogram.GroupSize(8, 4)
     assignment = NetworkHistogram.Assignment(A, node_labels, group_size)
diff --git a/test/starting_labels_test.jl b/test/starting_labels_test.jl
index 6b9ab2e..e94c3e4 100644
--- a/test/starting_labels_test.jl
+++ b/test/starting_labels_test.jl
@@ -22,20 +22,20 @@ end
 
     @testset "random start" begin
         node_labels, group_size = NetworkHistogram.initialize_node_labels(A, h,
-                                                                          RandomStart())
+            RandomStart())
         test_basic_node_labels(node_labels, group_size)
     end
 
     @testset "ordered start" begin
         node_labels, group_size = NetworkHistogram.initialize_node_labels(A, h,
-                                                                          OrderedStart())
+            OrderedStart())
         test_basic_node_labels(node_labels, group_size)
         @test node_labels == sort(node_labels)
     end
 
     @testset "eigenvalue start" begin
         node_labels, group_size = NetworkHistogram.initialize_node_labels(A, h,
-                                                                          EigenStart())
+            EigenStart())
         test_basic_node_labels(node_labels, group_size)
     end
 end