Merge branch 'master' into release

staircase-dev · Jan 29, 2022 · d8b9b44 · d8b9b44
2 parents 43f44e4 + 4e62fd3
commit d8b9b44
Show file tree

Hide file tree

Showing 8 changed files with 868 additions and 579 deletions.
diff --git a/docs/release_notes/index.rst b/docs/release_notes/index.rst
@@ -5,6 +5,14 @@ Release notes
 ========================
 
 
+**v0.8.0 2022-01-29**
+
+- Added `bins` parameter to :func:`piso.coverage` and :meth:`ArrayAccessor.coverage() <piso.accessor.ArrayAccessor.coverage>`
+- Added `how` parameter to :func:`piso.coverage` and :meth:`ArrayAccessor.coverage() <piso.accessor.ArrayAccessor.coverage>`
+- Added `result` parameter to :func:`piso.contains` and :meth:`ArrayAccessor.contains() <piso.accessor.ArrayAccessor.contains>`
+- Added `how` parameter to :func:`piso.contains` and :meth:`ArrayAccessor.contains() <piso.accessor.ArrayAccessor.contains>`
+
+
 **v0.7.0 2021-11-20**
 
 Added the following methods
@@ -28,6 +36,7 @@ The following methods were extended to accommodate intervals with *closed = "bot
 - :func:`piso.lookup`
 - :func:`piso.isdisjoint` (and :meth:`ArrayAccessor.isdisjoint() <piso.accessor.ArrayAccessor.isdisjoint>`)
 
+
 **v0.5.0 2021-11-02**
 
 Added the following methods

diff --git a/piso/accessor.py b/piso/accessor.py
@@ -142,10 +142,12 @@ def issubset(self, *interval_arrays, squeeze=False):
         )
 
     @Appender(docstrings.coverage_docstring, join="\n", indents=1)
-    def coverage(self, domain=None):
+    def coverage(self, domain=None, bins=False, how="fraction"):
         return intervalarray.coverage(
             self._interval_array,
             domain,
+            bins,
+            how,
         )
 
     @Appender(docstrings.complement_docstring, join="\n", indents=1)
@@ -156,11 +158,13 @@ def complement(self, domain=None):
         )
 
     @Appender(docstrings.contains_docstring, join="\n", indents=1)
-    def contains(self, x, include_index=True):
+    def contains(self, x, include_index=True, result="cartesian", how="any"):
         return intervalarray.contains(
             self._interval_array,
             x,
             include_index,
+            result,
+            how,
         )
 
     @Appender(docstrings.split_docstring, join="\n", indents=1)

diff --git a/piso/docstrings/accessor.py b/piso/docstrings/accessor.py
@@ -591,11 +591,13 @@ def join_params(list_of_param_strings):
 
 
 coverage_docstring = """
-Calculates the fraction of a domain covered by a collection of intervals.
+Calculates the size of a domain (or possibly multiple domains) covered by a collection of intervals.
 
 The intervals are contained in the array object the accessor belongs to.
 The (possibly overlapping) intervals may not, or partially, or wholly cover the domain.
 
+Calculation over multiple domains is only possible when *bins* = True.
+
 Parameters
 ----------
 domain : :py:class:`tuple`, :class:`pandas.Interval`, :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`, optional
@@ -604,11 +606,19 @@ def join_params(list_of_param_strings):
     If *domain* is a tuple then it should specify lower and upper bounds, and be equivalent to a
     :class:`pandas.Interval`.  If *domain* is a :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`
     then the intervals it contains define a possibly disconnected domain.
+    If *bins* = True then *domain* must be :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray` with disjoint intervals.
+bins : boolean, default False
+    If False, then *domain* is interpreted as a single domain and one value is returned.
+    If True, then *domain* is interpreted as a set of disjoint bins, and coverage is calculated for each bin.
+how : {"fraction", "sum"}, default "fraction"
+    If *how* = "fraction" then the result is a fraction of the size of the domain.
+    If *how* = "sum" then the result is the length of the domain covered.
+
+    .. versionadded:: 0.8.0
 
 Returns
 ----------
-float
-    a number between 0 and 1, representing the fraction of the domain covered.
+float or :class:`pandas.Series`
 
 Examples
 -----------
@@ -630,11 +640,24 @@ def join_params(list_of_param_strings):
 >>> arr1.piso.coverage(pd.Interval(-10, 10))
 0.3
 
+>>> arr1.piso.coverage(pd.Interval(-10, 10), how="sum")
+6
+
 >>> domain = pd.arrays.IntervalArray.from_tuples(
-...     [(4,6), (7, 9)],
+...     [(4,6), (7, 10)],
 ... )
 >>> arr1.piso.coverage(domain)
-0.5
+0.4
+
+>>> arr1.piso.coverage(domain, bins=True)
+(4, 6]     0.500000
+(7, 10]    0.333333
+dtype: float64
+
+>>> arr1.piso.coverage(domain, bins=True, how="sum")
+(4, 6]     1.0
+(7, 10]    1.0
+dtype: float64
 """
 
 complement_docstring = """
@@ -736,25 +759,39 @@ def join_params(list_of_param_strings):
 
 
 contains_docstring = """
-Check pair-wise if a set of intervals, belonging to the object the accessor belongs to, contains a set of values.
+Evaluates the intersection of a set of intervals with a set of points.
 
-Returns a 2-dimensional boolean mask *M* of shape *(m,n)* where *m* is the number of intervals, and
-*n* is the number of points.  The element in the i-th row and j-th column is True if
+The format of the result is dependent on the *result* parameter.  If *result = "cartesian"* then the
+function returns a 2-dimensional boolean mask *M* of shape *(m,n)* where *m* is the number of
+intervals, and *n* is the number of points.  The element in the i-th row and j-th column is True if
 the i-th interval contains the j-th point.
 
+If *result = "points"* then the result is a 1-dimensional boolean mask of length *n*.
+If *result = "intervals"* then the result is a 1-dimensional boolean mask of length *m*.
+
 Parameters
 ----------
+interval_array : :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`
+    Contains the intervals.  May be left-closed, right-closed, both, or neither.
 x : scalar, or array-like of scalars
-    Values in *x* should belong to the same domain as the intervals in *interval_array*.
-    May be left-closed, right-closed, both, or neither.
+    Values in *x* should belong to the same domain as the intervals contained by the object the
+    accessor belongs to.
 include_index : boolean, default True
     Indicates whether to return a :class:`numpy.ndarray` or :class:`pandas.DataFrame` indexed
     by *interval_array* and column names equal to *x*
+result : {"cartesian", "points", "intervals"}, default "cartesian"
+    If *result* = "cartesian" then the result will be two dimensional, otherwise it will be
+    one dimensional.
+how : {"any", "all"}, default "any"
+    Only relevant if *result* is not "cartesian".  This parameter indicates either:
+    - a True value means any or all points are contained within an interval, or
+    - a True value means any or all intervals contain a point.
+    Which of these interpretations is dependent on the *result* parameter.
 
 Returns
 ----------
-:class:`numpy.ndarray` or :class:`pandas.DataFrame`
-    Two dimensional and boolean valued.  Return type dependent on *include_index*.
+:class:`numpy.ndarray`, :class:`pandas.DataFrame` or :class:`pandas.Series`
+    One, or two, dimensional and boolean valued.  Return type dependent on *include_index* and *result*.
 
 Examples
 -----------
@@ -781,6 +818,25 @@ def join_params(list_of_param_strings):
 array([[False,  True,  True,  True],
        [False, False,  True,  True]])
 
+>>> arr.piso.contains([0, 1, 3, 4], result="points")
+0    False
+1     True
+3     True
+4     True
+dtype: bool
+
+>>> arr.piso.contains([0, 1, 3, 4], result="points", how="all")
+0    False
+1    False
+3     True
+4     True
+dtype: bool
+
+>>> arr.piso.contains([0, 1, 3, 4], result="intervals")
+(0, 4]    True
+(2, 5]    True
+dtype: bool
+
 >>> pd.IntervalIndex.from_tuples([(0,2)]).piso.contains(1, include_index=False)
 array([[ True]])
 """

diff --git a/piso/docstrings/intervalarray.py b/piso/docstrings/intervalarray.py
@@ -596,7 +596,9 @@ def join_params(list_of_param_strings):
 
 
 coverage_docstring = """
-Calculates the fraction of a domain covered by a collection of intervals.
+Calculates the fraction, or total length, of a domain (or possibly multiple domains) covered by a collection of intervals.
+
+Calculation over multiple domains is only possible when *bins* = True.
 
 Parameters
 ----------
@@ -609,11 +611,19 @@ def join_params(list_of_param_strings):
     If *domain* is a tuple then it should specify lower and upper bounds, and be equivalent to a
     :class:`pandas.Interval`.  If *domain* is a :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`
     then the intervals it contains define a possibly disconnected domain.
+    If *bins* = True then *domain* must be :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray` with disjoint intervals.
+bins : boolean, default False
+    If False, then *domain* is interpreted as a single domain and one value is returned.
+    If True, then *domain* is interpreted as a set of disjoint bins, and coverage is calculated for each bin.
+how : {"fraction", "sum"}, default "fraction"
+    If *how* = "fraction" then the result is a fraction of the size of the domain.
+    If *how* = "sum" then the result is the length of the domain covered.
+
+    .. versionadded:: 0.8.0
 
 Returns
 ----------
-float
-    a number between 0 and 1, representing the fraction of the domain covered.
+float or :class:`pandas.Series`
 
 Examples
 -----------
@@ -634,11 +644,24 @@ def join_params(list_of_param_strings):
 >>> piso.coverage(arr1, pd.Interval(-10, 10))
 0.3
 
+>>> piso.coverage(arr1, pd.Interval(-10, 10), how="sum")
+6
+
 >>> domain = pd.arrays.IntervalArray.from_tuples(
-...     [(4,6), (7, 9)],
+...     [(4,6), (7, 10)],
 ... )
 >>> piso.coverage(arr1, domain)
-0.5
+0.4
+
+>>> piso.coverage(arr1, domain, bins=True)
+(4, 6]     0.500000
+(7, 10]    0.333333
+dtype: float64
+
+>>> piso.coverage(arr1, domain, bins=True, how="sum")
+(4, 6]     1.0
+(7, 10]    1.0
+dtype: float64
 """
 
 
@@ -743,12 +766,16 @@ def join_params(list_of_param_strings):
 
 
 contains_docstring = """
-Check pair-wise if a set of intervals contains a set of values
+Evaluates the intersection of a set of intervals with a set of points.
 
-Returns a 2-dimensional boolean mask *M* of shape *(m,n)* where *m* is the number of intervals, and
-*n* is the number of points.  The element in the i-th row and j-th column is True if
+The format of the result is dependent on the *result* parameter.  If *result = "cartesian"* then the
+function returns a 2-dimensional boolean mask *M* of shape *(m,n)* where *m* is the number of
+intervals, and *n* is the number of points.  The element in the i-th row and j-th column is True if
 the i-th interval contains the j-th point.
 
+If *result = "points"* then the result is a 1-dimensional boolean mask of length *n*.
+If *result = "intervals"* then the result is a 1-dimensional boolean mask of length *m*.
+
 Parameters
 ----------
 interval_array : :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`
@@ -758,11 +785,20 @@ def join_params(list_of_param_strings):
 include_index : boolean, default True
     Indicates whether to return a :class:`numpy.ndarray` or :class:`pandas.DataFrame` indexed
     by *interval_array* and column names equal to *x*
+result : {"cartesian", "points", "intervals"}, default "cartesian"
+    If *result* = "cartesian" then the result will be two dimensional, otherwise it will be
+    one dimensional.
+how : {"any", "all"}, default "any"
+    Only relevant if *result* is not "cartesian".  This parameter indicates either:
+    - a True value means any or all points are contained within an interval, or
+    - a True value means any or all intervals contain a point.
+    Which of these interpretations is dependent on the *result* parameter.
+
 
 Returns
 ----------
-:class:`numpy.ndarray` or :class:`pandas.DataFrame`
-    Two dimensional and boolean valued.  Return type dependent on *include_index*.
+:class:`numpy.ndarray`, :class:`pandas.DataFrame` or :class:`pandas.Series`
+    One, or two, dimensional and boolean valued.  Return type dependent on *include_index* and *result*.
 
 Examples
 -----------
@@ -788,6 +824,25 @@ def join_params(list_of_param_strings):
 array([[False,  True,  True,  True],
        [False, False,  True,  True]])
 
+>>> piso.contains(arr, [0, 1, 3, 4], result="points")
+0    False
+1     True
+3     True
+4     True
+dtype: bool
+
+>>> piso.contains(arr, [0, 1, 3, 4], result="points", how="all")
+0    False
+1    False
+3     True
+4     True
+dtype: bool
+
+>>> piso.contains(arr, [0, 1, 3, 4], result="intervals")
+(0, 4]    True
+(2, 5]    True
+dtype: bool
+
 >>> piso.contains(pd.IntervalIndex.from_tuples([(0,2)]), 1, include_index=False)
 array([[ True]])
 """

diff --git a/piso/intervalarray.py b/piso/intervalarray.py
@@ -178,15 +178,30 @@ def _get_domain_tuple(interval_array, domain):
 
 
 @Appender(docstrings.coverage_docstring, join="\n", indents=1)
-def coverage(interval_array, domain=None):
+def coverage(interval_array, domain=None, bins=False, how="fraction"):
+    assert how in ("fraction", "sum")
+
+    def _validate_domain():
+        if not isinstance(domain, (pd.IntervalIndex, pd.arrays.IntervalArray)):
+            raise ValueError(
+                "If bins argument is true then domain parameter must be a pandas IntervalIndex or IntervalArray."
+            )
+        if not isdisjoint(domain):
+            raise ValueError(
+                "If bins argument is true then domain parameter must represent disjoint intervals."
+            )
+
     stepfunction = _interval_x_to_stairs(interval_array).make_boolean()
-    if isinstance(domain, (pd.IntervalIndex, pd.arrays.IntervalArray)):
+    if bins:
+        _validate_domain()
+        adjusted_domain = stepfunction.slice(pd.IntervalIndex(domain))
+    elif isinstance(domain, (pd.IntervalIndex, pd.arrays.IntervalArray)):
         domain = _interval_x_to_stairs(domain)
-        result = stepfunction.where(domain).mean()
+        adjusted_domain = stepfunction.where(domain)
     else:
         domain = _get_domain_tuple(interval_array, domain)
-        result = stepfunction.clip(*domain).mean()
-    return result
+        adjusted_domain = stepfunction.clip(*domain)
+    return adjusted_domain.mean() if how == "fraction" else adjusted_domain.integral()
 
 
 @Appender(docstrings.complement_docstring, join="\n", indents=1)
@@ -203,7 +218,9 @@ def complement(interval_array, domain=None):
 
 
 @Appender(docstrings.contains_docstring, join="\n", indents=1)
-def contains(interval_array, x, include_index=True):
+def contains(interval_array, x, include_index=True, result="cartesian", how="any"):
+    assert result in ("cartesian", "intervals", "points")
+    assert how in ("any", "all")
     starts = interval_array.left.values
     ends = interval_array.right.values
     x = pd.Series(x).values
@@ -213,10 +230,18 @@ def contains(interval_array, x, include_index=True):
     left_compare = (
         np.greater_equal if interval_array.closed in ("left", "both") else np.greater
     )
-    result = (right_compare.outer(x, ends) & left_compare.outer(x, starts)).transpose()
+    calc = (right_compare.outer(x, ends) & left_compare.outer(x, starts)).transpose()
+    if result != "cartesian":
+        logical_method = np.logical_or if how == "any" else np.logical_and
+        axis = 0 if result == "points" else 1
+        calc = logical_method.reduce(calc, axis=axis)
     if include_index:
-        return pd.DataFrame(result, index=interval_array, columns=x)
-    return result
+        if result == "cartesian":
+            calc = pd.DataFrame(calc, index=interval_array, columns=x)
+        else:
+            index = x if result == "points" else interval_array
+            calc = pd.Series(calc, index=index)
+    return calc
 
 
 @Appender(docstrings.split_docstring, join="\n", indents=1)