histogram: fix percentiles for empty histograms (#116)

When the histogram is empty, the percentiles function currently returns the first bucket for the standard histogram and panics due to an underflow in the sparse histogram. Change the signature to return an Optional value, which is set to None in the empty histogram case for both standard and sparse histograms. Also, add a batched percentile lookup for the sparse histogram to bring the interface in line with the standard histogram.
pelikan-io · Jun 12, 2024 · 696f958 · 696f958
1 parent 8e41e0d
commit 696f958
Show file tree

Hide file tree

Showing 4 changed files with 132 additions and 54 deletions.
diff --git a/histogram/Cargo.toml b/histogram/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "histogram"
-version = "0.10.2"
+version = "0.11.0"
 edition = "2021"
 authors = ["Brian Martin <brian@pelikan.io>"]
 license = "MIT OR Apache-2.0"

diff --git a/histogram/src/atomic.rs b/histogram/src/atomic.rs
@@ -97,50 +97,69 @@ mod test {
         let percentiles = histogram.drain();
         assert_eq!(
             percentiles.percentile(50.0),
-            Ok(Bucket {
+            Ok(Some(Bucket {
                 count: 1,
                 range: 50..=50,
-            })
+            }))
         );
         histogram.increment(1000).unwrap();
         // after another load the map is empty
         let percentiles = histogram.drain();
         assert_eq!(
             percentiles.percentile(50.0),
-            Ok(Bucket {
+            Ok(Some(Bucket {
                 count: 1,
                 range: 1000..=1003,
-            })
+            }))
         );
     }
 
     #[test]
     // Tests percentiles
     fn percentiles() {
         let histogram = AtomicHistogram::new(7, 64).unwrap();
+        let percentiles = [25.0, 50.0, 75.0, 90.0, 99.0];
+
+        // check empty
+        assert_eq!(histogram.load().percentiles(&percentiles), Ok(None));
+
+        for percentile in percentiles {
+            assert_eq!(histogram.load().percentile(percentile), Ok(None));
+        }
+
+        // populate and check percentiles
         for i in 0..=100 {
             let _ = histogram.increment(i);
             assert_eq!(
                 histogram.load().percentile(0.0),
-                Ok(Bucket {
+                Ok(Some(Bucket {
                     count: 1,
                     range: 0..=0,
-                })
+                }))
             );
             assert_eq!(
                 histogram.load().percentile(100.0),
-                Ok(Bucket {
+                Ok(Some(Bucket {
                     count: 1,
                     range: i..=i,
-                })
+                }))
+            );
+        }
+
+        for percentile in percentiles {
+            assert_eq!(
+                histogram
+                    .load()
+                    .percentile(percentile)
+                    .map(|b| b.unwrap().end()),
+                Ok(percentile as u64)
             );
         }
-        assert_eq!(histogram.load().percentile(25.0).map(|b| b.end()), Ok(25));
-        assert_eq!(histogram.load().percentile(50.0).map(|b| b.end()), Ok(50));
-        assert_eq!(histogram.load().percentile(75.0).map(|b| b.end()), Ok(75));
-        assert_eq!(histogram.load().percentile(90.0).map(|b| b.end()), Ok(90));
-        assert_eq!(histogram.load().percentile(99.0).map(|b| b.end()), Ok(99));
-        assert_eq!(histogram.load().percentile(99.9).map(|b| b.end()), Ok(100));
+
+        assert_eq!(
+            histogram.load().percentile(99.9).map(|b| b.unwrap().end()),
+            Ok(100)
+        );
 
         assert_eq!(
             histogram.load().percentile(-1.0),
@@ -155,6 +174,7 @@ mod test {
             .load()
             .percentiles(&[50.0, 90.0, 99.0, 99.9])
             .unwrap()
+            .unwrap()
             .iter()
             .map(|(p, b)| (*p, b.end()))
             .collect();
@@ -167,10 +187,10 @@ mod test {
         let _ = histogram.increment(1024);
         assert_eq!(
             histogram.load().percentile(99.9),
-            Ok(Bucket {
+            Ok(Some(Bucket {
                 count: 1,
                 range: 1024..=1031,
-            })
+            }))
         );
     }
 }
diff --git a/histogram/src/sparse.rs b/histogram/src/sparse.rs
@@ -142,35 +142,67 @@ impl SparseHistogram {
         Ok(histogram)
     }
 
-    /// Return a single percentile from this histogram.
+    /// Return a collection of percentiles from this histogram.
     ///
-    /// The percentile should be in the inclusive range `0.0..=100.0`. For
+    /// Each percentile should be in the inclusive range `0.0..=100.0`. For
     /// example, the 50th percentile (median) can be found using `50.0`.
-    pub fn percentile(&self, percentile: f64) -> Result<Bucket, Error> {
+    ///
+    /// The results will be sorted by the percentile.
+    pub fn percentiles(&self, percentiles: &[f64]) -> Result<Option<Vec<(f64, Bucket)>>, Error> {
+        // validate all the percentiles
+        if percentiles.is_empty() {
+            return Err(Error::InvalidPercentile);
+        }
+
+        for percentile in percentiles {
+            if !(0.0..=100.0).contains(percentile) {
+                return Err(Error::InvalidPercentile);
+            }
+        }
+
         let total: u128 = self.count.iter().map(|v| *v as u128).sum();
 
-        if !(0.0..=100.0).contains(&percentile) {
-            return Err(Error::InvalidPercentile);
+        // empty histogram, no percentiles available
+        if total == 0 {
+            return Ok(None);
         }
 
-        let search = ((total as f64) * percentile / 100.0).ceil() as usize;
+        // sort the requested percentiles so we can find them in a single pass
+        let mut percentiles = percentiles.to_vec();
+        percentiles.sort_by(|a, b| a.partial_cmp(b).unwrap());
+
+        let searches: Vec<usize> = percentiles
+            .iter()
+            .map(|p| ((total as f64) * *p / 100.0).ceil() as usize)
+            .collect();
+        let mut search_idx = 0;
+        let mut result: Vec<(f64, Bucket)> = Vec::with_capacity(percentiles.len());
+
         let mut seen: usize = 0;
         for (idx, count) in self.index.iter().zip(self.count.iter()) {
             seen += *count as usize;
-            if seen >= search {
-                return Ok(Bucket {
-                    count: *count,
-                    range: self.config.index_to_range(*idx),
-                });
+            while search_idx < searches.len() && seen >= searches[search_idx] {
+                result.push((
+                    percentiles[search_idx],
+                    Bucket {
+                        count: *count,
+                        range: self.config.index_to_range(*idx),
+                    },
+                ));
+                search_idx += 1;
             }
         }
 
-        // should never be reached; return highest bucket if not found
-        let last_idx = self.index.len() - 1;
-        Ok(Bucket {
-            count: self.count[last_idx],
-            range: self.config.index_to_range(self.index[last_idx]),
-        })
+        Ok(Some(result))
+    }
+
+    /// Return a single percentile from this histogram.
+    ///
+    /// The percentile should be in the inclusive range `0.0..=100.0`. For
+    /// example, the 50th percentile (median) can be found using `50.0`.
+    pub fn percentile(&self, percentile: f64) -> Result<Option<Bucket>, Error> {
+        self.percentiles(&[percentile])
+            .map(|v| v.map(|x| x.first().unwrap().1.clone()))
     }
 
     /// Returns a new histogram with a reduced grouping power. The reduced
@@ -384,20 +416,30 @@ mod tests {
     }
 
     #[test]
-    fn percentile() {
+    fn percentiles() {
         let mut hstandard = Histogram::new(4, 10).unwrap();
+        let hempty = SparseHistogram::from(&hstandard);
+
         for v in 1..1024 {
             let _ = hstandard.increment(v);
         }
 
         let hsparse = SparseHistogram::from(&hstandard);
-
-        for percentile in [1.0, 10.0, 25.0, 50.0, 75.0, 90.0, 99.0, 99.9] {
+        let percentiles = [1.0, 10.0, 25.0, 50.0, 75.0, 90.0, 99.0, 99.9];
+        for percentile in percentiles {
+            let bempty = hempty.percentile(percentile).unwrap();
             let bstandard = hstandard.percentile(percentile).unwrap();
             let bsparse = hsparse.percentile(percentile).unwrap();
 
+            assert_eq!(bempty, None);
             assert_eq!(bsparse, bstandard);
         }
+
+        assert_eq!(hempty.percentiles(&percentiles), Ok(None));
+        assert_eq!(
+            hstandard.percentiles(&percentiles).unwrap(),
+            hsparse.percentiles(&percentiles).unwrap()
+        );
     }
 
     fn compare_histograms(hstandard: &Histogram, hsparse: &SparseHistogram) {

diff --git a/histogram/src/standard.rs b/histogram/src/standard.rs
@@ -76,7 +76,7 @@ impl Histogram {
     /// example, the 50th percentile (median) can be found using `50.0`.
     ///
     /// The results will be sorted by the percentile.
-    pub fn percentiles(&self, percentiles: &[f64]) -> Result<Vec<(f64, Bucket)>, Error> {
+    pub fn percentiles(&self, percentiles: &[f64]) -> Result<Option<Vec<(f64, Bucket)>>, Error> {
         // get the total count
         let total_count: u128 = self.buckets.iter().map(|v| *v as u128).sum();
 
@@ -91,6 +91,11 @@ impl Histogram {
             }
         }
 
+        // empty histogram, no percentiles available
+        if total_count == 0 {
+            return Ok(None);
+        }
+
         let mut bucket_idx = 0;
         let mut partial_sum = self.buckets[bucket_idx] as u128;
 
@@ -125,16 +130,16 @@ impl Histogram {
             })
             .collect();
 
-        Ok(result)
+        Ok(Some(result))
     }
 
     /// Return a single percentile from this histogram.
     ///
     /// The percentile should be in the inclusive range `0.0..=100.0`. For
     /// example, the 50th percentile (median) can be found using `50.0`.
-    pub fn percentile(&self, percentile: f64) -> Result<Bucket, Error> {
+    pub fn percentile(&self, percentile: f64) -> Result<Option<Bucket>, Error> {
         self.percentiles(&[percentile])
-            .map(|v| v.first().unwrap().1.clone())
+            .map(|v| v.map(|x| x.first().unwrap().1.clone()))
     }
 
     /// Returns a new histogram with a reduced grouping power. The reduced
@@ -310,36 +315,47 @@ mod tests {
     // Tests percentiles
     fn percentiles() {
         let mut histogram = Histogram::new(7, 64).unwrap();
+
+        assert_eq!(histogram.percentile(50.0).unwrap(), None);
+        assert_eq!(
+            histogram.percentiles(&[50.0, 90.0, 99.0, 99.9]).unwrap(),
+            None
+        );
+
         for i in 0..=100 {
             let _ = histogram.increment(i);
             assert_eq!(
                 histogram.percentile(0.0),
-                Ok(Bucket {
+                Ok(Some(Bucket {
                     count: 1,
                     range: 0..=0,
-                })
+                }))
             );
             assert_eq!(
                 histogram.percentile(100.0),
-                Ok(Bucket {
+                Ok(Some(Bucket {
                     count: 1,
                     range: i..=i,
-                })
+                }))
             );
         }
-        assert_eq!(histogram.percentile(25.0).map(|b| b.end()), Ok(25));
-        assert_eq!(histogram.percentile(50.0).map(|b| b.end()), Ok(50));
-        assert_eq!(histogram.percentile(75.0).map(|b| b.end()), Ok(75));
-        assert_eq!(histogram.percentile(90.0).map(|b| b.end()), Ok(90));
-        assert_eq!(histogram.percentile(99.0).map(|b| b.end()), Ok(99));
-        assert_eq!(histogram.percentile(99.9).map(|b| b.end()), Ok(100));
+        assert_eq!(histogram.percentile(25.0).map(|b| b.unwrap().end()), Ok(25));
+        assert_eq!(histogram.percentile(50.0).map(|b| b.unwrap().end()), Ok(50));
+        assert_eq!(histogram.percentile(75.0).map(|b| b.unwrap().end()), Ok(75));
+        assert_eq!(histogram.percentile(90.0).map(|b| b.unwrap().end()), Ok(90));
+        assert_eq!(histogram.percentile(99.0).map(|b| b.unwrap().end()), Ok(99));
+        assert_eq!(
+            histogram.percentile(99.9).map(|b| b.unwrap().end()),
+            Ok(100)
+        );
 
         assert_eq!(histogram.percentile(-1.0), Err(Error::InvalidPercentile));
         assert_eq!(histogram.percentile(101.0), Err(Error::InvalidPercentile));
 
         let percentiles: Vec<(f64, u64)> = histogram
             .percentiles(&[50.0, 90.0, 99.0, 99.9])
             .unwrap()
+            .unwrap()
             .iter()
             .map(|(p, b)| (*p, b.end()))
             .collect();
@@ -352,10 +368,10 @@ mod tests {
         let _ = histogram.increment(1024);
         assert_eq!(
             histogram.percentile(99.9),
-            Ok(Bucket {
+            Ok(Some(Bucket {
                 count: 1,
                 range: 1024..=1031,
-            })
+            }))
         );
     }
 
@@ -395,7 +411,7 @@ mod tests {
                 let v = vals[((*p / 100.0 * (vals.len() as f64)) as usize) - 1];
 
                 // Value and relative error from full histogram
-                let vhist = histogram.percentile(*p).unwrap().end();
+                let vhist = histogram.percentile(*p).unwrap().unwrap().end();
                 let e = (v.abs_diff(vhist) as f64) * 100.0 / (v as f64);
                 assert!(e < error);
             }