From 9c2787de73da3096eb1db7ee3bcfd05d10b948c1 Mon Sep 17 00:00:00 2001
From: Benjamin Lieser <benjamin.lieser@mpinat.mpg.de>
Date: Tue, 8 Oct 2024 10:57:47 +0200
Subject: [PATCH] Cdf testing with Kolmogorov Smirnov (#1494)

---
 rand_distr/CHANGELOG.md |   1 +
 rand_distr/tests/cdf.rs | 197 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+)
 create mode 100644 rand_distr/tests/cdf.rs
diff --git a/rand_distr/CHANGELOG.md b/rand_distr/CHANGELOG.md
index 93756eb705..a19641752f 100644
--- a/rand_distr/CHANGELOG.md
+++ b/rand_distr/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - The `serde1` feature has been renamed `serde` (#1477)
 - Fix panic in Binomial (#1484)
 - Move some of the computations in Binomial from `sample` to `new` (#1484)
+- Add Kolmogorov-Smirnov test for sampling of `Normal` and `Binomial` (#1494)
 
 ### Added
 - Add plots for `rand_distr` distributions to documentation (#1434)
diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs
new file mode 100644
index 0000000000..71b808d241
--- /dev/null
+++ b/rand_distr/tests/cdf.rs
@@ -0,0 +1,197 @@
+// Copyright 2021 Developers of the Rand project.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use core::f64;
+
+use num_traits::AsPrimitive;
+use rand::SeedableRng;
+use rand_distr::{Distribution, Normal};
+use special::Beta;
+use special::Primitive;
+
+// [1] Nonparametric Goodness-of-Fit Tests for Discrete Null Distributions
+//     by Taylor B. Arnold and John W. Emerson
+//     http://www.stat.yale.edu/~jay/EmersonMaterials/DiscreteGOF.pdf
+
+/// Empirical Cumulative Distribution Function (ECDF) of a finite set of samples
+struct Ecdf {
+    sorted_samples: Vec<f64>, // samples in ascending order (sorted by `Ecdf::new`)
+}
+
+impl Ecdf {
+    fn new(mut samples: Vec<f64>) -> Self {
+        samples.sort_by(|a, b| a.partial_cmp(b).unwrap()); // ascending; comparing a NaN panics
+        Self {
+            sorted_samples: samples,
+        }
+    }
+
+    /// Returns the step points of the ECDF as `(x, F(x))` pairs in ascending order of `x`.
+    /// The ECDF is a step function that increases by 1/n at each sample point.
+    /// The function is continuous from the right, so we give the bigger value at the step points.
+    /// For non-empty, finite samples the first point is (-inf, 0.0), the last is (max(samples), 1.0).
+    fn step_points(&self) -> Vec<(f64, f64)> {
+        let mut points = Vec::with_capacity(self.sorted_samples.len() + 1);
+        let mut last = f64::NEG_INFINITY;
+        let mut count = 0; // number of samples visited so far, all of them <= `last`
+        let n = self.sorted_samples.len() as f64;
+        for &x in &self.sorted_samples {
+            if x != last {
+                points.push((last, count as f64 / n)); // close the step at `last` before moving on
+                last = x;
+            }
+            count += 1;
+        }
+        points.push((last, count as f64 / n)); // final step: every sample has been counted
+        points
+    }
+}
+
+fn kolmogorov_smirnov_statistic_continuous(ecdf: Ecdf, cdf: impl Fn(f64) -> f64) -> f64 {
+    // We implement equation (3) from [1].
+
+    let mut max_diff: f64 = 0.;
+
+    let step_points = ecdf.step_points(); // x_i in the paper
+    for i in 1..step_points.len() {
+        let (x_i, f_i) = step_points[i];
+        let (_, f_i_1) = step_points[i - 1]; // ECDF value on the step just before x_i
+        let cdf_i = cdf(x_i);
+        let max_1 = (cdf_i - f_i).abs(); // gap to the ECDF value at x_i
+        let max_2 = (cdf_i - f_i_1).abs(); // gap to the ECDF value just below x_i
+
+        max_diff = max_diff.max(max_1).max(max_2);
+    }
+    max_diff // largest absolute deviation found over all step points
+}
+
+fn kolmogorov_smirnov_statistic_discrete(ecdf: Ecdf, cdf: impl Fn(i64) -> f64) -> f64 {
+    // We implement equation (4) from [1].
+
+    let mut max_diff: f64 = 0.;
+
+    let step_points = ecdf.step_points(); // x_i in the paper
+    for i in 1..step_points.len() {
+        let (x_i, f_i) = step_points[i];
+        let (_, f_i_1) = step_points[i - 1]; // ECDF value on the step just before x_i
+        let max_1 = (cdf(x_i as i64) - f_i).abs(); // assumes x_i is integer-valued and fits in i64
+        let max_2 = (cdf(x_i as i64 - 1) - f_i_1).abs(); // -1 is the same as -epsilon, because we have integer support
+
+        max_diff = max_diff.max(max_1).max(max_2);
+    }
+    max_diff // largest absolute deviation found over all step points
+}
+
+const SAMPLE_SIZE: u64 = 1_000_000; // number of samples drawn per tested distribution
+
+fn critical_value() -> f64 {
+    // If the sampler is correct, a false positive has probability below 0.001 (alpha = 0.001).
+    // Passing this does not prove that the sampler is correct but is a good indication.
+    1.95 / (SAMPLE_SIZE as f64).sqrt() // 1.95 ~ sqrt(-ln(alpha / 2) / 2), asymptotic KS coefficient
+}
+
+fn sample_ecdf<T>(seed: u64, dist: impl Distribution<T>) -> Ecdf
+where
+    T: AsPrimitive<f64>,
+{
+    let mut rng = rand::rngs::SmallRng::seed_from_u64(seed); // RNG state derives from `seed` only
+    let samples = (0..SAMPLE_SIZE)
+        .map(|_| dist.sample(&mut rng).as_()) // draw one sample and convert it to f64
+        .collect();
+    Ecdf::new(samples)
+}
+
+/// Tests a distribution against an analytical CDF; panics if the KS statistic is too large.
+/// The CDF has to be continuous.
+pub fn test_continuous(seed: u64, dist: impl Distribution<f64>, cdf: impl Fn(f64) -> f64) {
+    let ecdf = sample_ecdf(seed, dist);
+    let ks_statistic = kolmogorov_smirnov_statistic_continuous(ecdf, cdf);
+
+    let critical_value = critical_value();
+
+    println!("KS statistic: {}", ks_statistic);
+    println!("Critical value: {}", critical_value);
+    assert!(ks_statistic < critical_value); // fails the calling test when the fit is rejected
+}
+
+/// Tests a distribution over integers against an analytical CDF; panics if the KS statistic is too large.
+/// Every jump point of the analytical CDF must be an integer.
+pub fn test_discrete<I: AsPrimitive<f64>>(
+    seed: u64, // seeds the RNG used for sampling
+    dist: impl Distribution<I>,
+    cdf: impl Fn(i64) -> f64,
+) {
+    let ecdf = sample_ecdf(seed, dist);
+    let ks_statistic = kolmogorov_smirnov_statistic_discrete(ecdf, cdf);
+
+    // This critical value is larger than necessary for discrete distributions, but with such a large sample size this should not matter much.
+    let critical_value = critical_value();
+
+    println!("KS statistic: {}", ks_statistic);
+    println!("Critical value: {}", critical_value);
+    assert!(ks_statistic < critical_value); // fails the calling test when the fit is rejected
+}
+
+fn normal_cdf(x: f64, mean: f64, std_dev: f64) -> f64 {
+    0.5 * ((mean - x) / (std_dev * f64::consts::SQRT_2)).erfc() // normal CDF at x, written via erfc
+}
+
+#[test]
+fn normal() {
+    let parameters = [
+        (0.0, 1.0), // (mean, std_dev); the array index doubles as the RNG seed
+        (0.0, 0.1),
+        (1.0, 10.0),
+        (1.0, 100.0),
+        (-1.0, 0.00001),
+        (-1.0, 0.0000001),
+    ];
+
+    for (seed, (mean, std_dev)) in parameters.into_iter().enumerate() {
+        test_continuous(seed as u64, Normal::new(mean, std_dev).unwrap(), |x| {
+            normal_cdf(x, mean, std_dev)
+        });
+    }
+}
+
+fn binomial_cdf(k: i64, p: f64, n: u64) -> f64 {
+    if k < 0 {
+        return 0.0; // no probability mass below zero
+    }
+    let k = k as u64; // lossless: k >= 0 was checked above
+    if k >= n {
+        return 1.0; // all probability mass lies in 0..=n
+    }
+
+    let a = (n - k) as f64;
+    let b = k as f64 + 1.0;
+
+    let q = 1.0 - p;
+
+    let ln_beta_ab = a.ln_beta(b);
+
+    q.inc_beta(a, b, ln_beta_ab) // regularized incomplete beta I_q(n - k, k + 1) = P(X <= k)
+}
+
+#[test]
+fn binomial() {
+    let parameters = [
+        (0.5, 10), // (p, n); the array index doubles as the RNG seed
+        (0.5, 100),
+        (0.1, 10),
+        (0.0000001, 1000000),
+        (0.0000001, 10),
+        (0.9999, 2),
+    ];
+
+    for (seed, (p, n)) in parameters.into_iter().enumerate() {
+        test_discrete(seed as u64, rand_distr::Binomial::new(n, p).unwrap(), |k| {
+            binomial_cdf(k, p, n)
+        });
+    }
+}