From bf0416ab3a7bd4bc4594915e3e6f27bb06fe05b4 Mon Sep 17 00:00:00 2001
From: smit kadvani <smit.kadvani@gmail.com>
Date: Fri, 19 Jan 2024 13:55:13 -0800
Subject: [PATCH] core.checks functions now support custom cols, df_view_col,
 cols_view (#182)

* cols, df_view_col, and cols_view arguments are now passed to downstream functions in core.checks, enabling user-defined column arguments for: `is_covering`, `is_tiling`, `is_sorted`, `is_contained`
* positive & negative test cases covering these arguments added for: `is_covering`, `is_tiling`, `is_sorted`, `is_contained`
closes #126
---
 bioframe/core/checks.py   | 147 +++++++++++++++++++++-----------
 tests/test_core_checks.py | 175 +++++++++++++++++++++++++++++++++-----
 2 files changed, 249 insertions(+), 73 deletions(-)

diff --git a/bioframe/core/checks.py b/bioframe/core/checks.py
index 7b0aced..8256eca 100644
--- a/bioframe/core/checks.py
+++ b/bioframe/core/checks.py
@@ -29,7 +29,8 @@ def is_bedframe(
 
     - chrom, start, end columns
     - columns have valid dtypes
-    - for each interval, if any of chrom, start, end are null, then all are null
+    - for each interval, if any of chrom, start, end are null, then all are
+        null
     - all starts < ends.
 
     Parameters
@@ -61,7 +62,8 @@ def is_bedframe(
             raise TypeError("Invalid bedFrame: Invalid column names")
         return False
 
-    if not _verify_column_dtypes(df, cols=[ck1, sk1, ek1], return_as_bool=True):
+    if not _verify_column_dtypes(df, cols=[ck1, sk1, ek1],
+                                 return_as_bool=True):
         if raise_errors:
             raise TypeError("Invalid bedFrame: Invalid column dtypes")
         return False
@@ -77,18 +79,18 @@ def is_bedframe(
 
     if ((df[ek1] - df[sk1]) < 0).any():
         if raise_errors:
-            raise ValueError(
-                f"Invalid bedframe: starts exceed ends for "
-                f"{sum((df[ek1] - df[sk1]) < 0)} intervals"
-            )
+            raise ValueError(f"Invalid bedframe: starts exceed ends for "
+                             f"{sum((df[ek1] - df[sk1]) < 0)} intervals")
         return False
 
     return True
 
 
-def is_cataloged(
-    df, view_df, raise_errors=False, df_view_col="view_region", view_name_col="name"
-):
+def is_cataloged(df,
+                 view_df,
+                 raise_errors=False,
+                 df_view_col="view_region",
+                 view_name_col="name"):
     """
     Tests if all region names in `df[df_view_col]` are present in
     `view_df[view_name_col]`.
@@ -125,20 +127,19 @@ def is_cataloged(
 
     if not _verify_columns(view_df, [view_name_col], return_as_bool=True):
         if raise_errors:
-            raise ValueError(f"Could not find `{view_name_col}` column in view_df")
+            raise ValueError(
+                f"Could not find `{view_name_col}` column in view_df"
+            )
         return False
 
     if not set(df[df_view_col].copy().dropna().values).issubset(
-        set(view_df[view_name_col].values)
-    ):
+            set(view_df[view_name_col].values)):
         if raise_errors:
             missing_regions = set(df[df_view_col].values).difference(
-                set(view_df[view_name_col].values)
-            )
+                set(view_df[view_name_col].values))
             raise ValueError(
                 f"The following regions in df[df_view_col] not in "
-                f"view_df[view_name_col]: \n{missing_regions}"
-            )
+                f"view_df[view_name_col]: \n{missing_regions}")
         return False
 
     return True
@@ -171,7 +172,8 @@ def is_overlapping(df, cols=None):
     df_merged = merge(df, cols=cols)
 
     total_interval_len = np.sum((df[ek1] - df[sk1]).values)
-    total_interval_len_merged = np.sum((df_merged[ek1] - df_merged[sk1]).values)
+    total_interval_len_merged = np.sum(
+        (df_merged[ek1] - df_merged[sk1]).values)
 
     if total_interval_len > total_interval_len_merged:
         return True
@@ -179,7 +181,10 @@ def is_overlapping(df, cols=None):
         return False
 
 
-def is_viewframe(region_df, raise_errors=False, view_name_col="name", cols=None):
+def is_viewframe(region_df,
+                 raise_errors=False,
+                 view_name_col="name",
+                 cols=None):
     """
     Checks that `region_df` is a valid viewFrame.
 
@@ -218,9 +223,8 @@ def is_viewframe(region_df, raise_errors=False, view_name_col="name", cols=None)
 
     ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
 
-    if not _verify_columns(
-        region_df, [ck1, sk1, ek1, view_name_col], return_as_bool=True
-    ):
+    if not _verify_columns(region_df, [ck1, sk1, ek1, view_name_col],
+                           return_as_bool=True):
         if raise_errors:
             raise TypeError("Invalid view: invalid column names")
         return False
@@ -235,11 +239,11 @@ def is_viewframe(region_df, raise_errors=False, view_name_col="name", cols=None)
             raise ValueError("Invalid view: cannot contain NAs")
         return False
 
-    if len(set(region_df[view_name_col])) < len(region_df[view_name_col].values):
+    if len(set(region_df[view_name_col])) < \
+       len(region_df[view_name_col].values):
         if raise_errors:
-            raise ValueError(
-                "Invalid view: entries in region_df[view_name_col] must be unique"
-            )
+            raise ValueError("Invalid view: entries in "
+                             "region_df[view_name_col] must be unique")
         return False
 
     if is_overlapping(region_df, cols=cols):
@@ -257,6 +261,7 @@ def is_contained(
     df_view_col=None,
     view_name_col="name",
     cols=None,
+    cols_view=None,
 ):
     """
     Tests if all genomic intervals in a bioframe `df` are cataloged and do not
@@ -277,8 +282,13 @@ def is_contained(
         Column from df used to associate interviews with view regions.
         Default `view_region`.
 
+    view_name_col:
+        Column from view_df with view region names. Default `name`.
+
     cols: (str, str, str)
         Column names for chrom, start, end in df.
+    cols_view: (str, str, str)
+        Column names for chrom, start, end in view_df.
 
     Returns
     -------
@@ -286,16 +296,22 @@ def is_contained(
 
     """
     from ..ops import trim
-
     ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
-
+    ck2, sk2, ek2 = _get_default_colnames() if cols_view is None else cols_view
     if df_view_col is None:
         try:
-            df_view_assigned = ops.overlap(df, view_df)
-            assert (df_view_assigned["end_"].isna()).sum() == 0
-            assert (df_view_assigned["start_"].isna()).sum() == 0
-            assert (df_view_assigned["end"] <= df_view_assigned["end_"]).all()
-            assert (df_view_assigned["start"] >= df_view_assigned["start_"]).all()
+            df_view_assigned = ops.overlap(df,
+                                           view_df,
+                                           cols1=cols,
+                                           cols2=cols_view)
+            # view_df end column carries a "_" suffix here (default: end_)
+            assert (df_view_assigned[ek2 + "_"].isna()).sum() == 0
+            # view_df start column likewise (default: start_)
+            assert (df_view_assigned[sk2 + "_"].isna()).sum() == 0
+            assert (df_view_assigned[ek1] <= df_view_assigned[ek2 + "_"]).all()
+            # ek1/sk1 are the df end/start columns (defaults: end, start);
+            # each df interval must lie within its matched view interval
+            assert (df_view_assigned[sk1] >= df_view_assigned[sk2 + "_"]).all()
         except AssertionError:
             if raise_errors:
                 raise AssertionError("df not contained in view_df")
@@ -304,15 +320,18 @@ def is_contained(
         return True
 
     if not is_cataloged(
-        df, view_df, df_view_col=df_view_col, view_name_col=view_name_col
-    ):
+            df, view_df, df_view_col=df_view_col, view_name_col=view_name_col):
         if raise_errors:
             raise ValueError("df not cataloged in view_df")
         return False
 
-    df_trim = trim(
-        df, view_df=view_df, df_view_col=df_view_col, view_name_col=view_name_col
-    )
+    df_trim = trim(df,
+                   view_df=view_df,
+                   df_view_col=df_view_col,
+                   view_name_col=view_name_col,
+                   cols=cols,
+                   cols_view=cols_view)
+
     is_start_trimmed = np.any(df[sk1].values != df_trim[sk1].values)
     is_end_trimmed = np.any(df[ek1].values != df_trim[ek1].values)
 
@@ -324,7 +343,7 @@ def is_contained(
         return True
 
 
-def is_covering(df, view_df, view_name_col="name", cols=None):
+def is_covering(df, view_df, view_name_col="name", cols=None, cols_view=None):
     """
     Tests if a view `view_df` is covered by the set of genomic intervals in
     the bedframe `df`.
@@ -348,6 +367,12 @@ def is_covering(df, view_df, view_name_col="name", cols=None):
         genomic intervals, provided separately for each set. The default
         values are 'chrom', 'start', 'end'.
 
+    cols_view: (str, str, str) or None
+        The names of the columns containing the chromosome, start and
+        end of the genomic intervals in view_df. This is independent of
+        `cols`, which names the corresponding columns in df.
+        The default values are 'chrom', 'start', 'end'.
+
     Returns
     -------
     is_covering:bool
@@ -356,10 +381,11 @@ def is_covering(df, view_df, view_name_col="name", cols=None):
     from ..ops import complement
 
     if complement(
-        df,
-        view_df=view_df,
-        view_name_col=view_name_col,
-        cols=cols,
+            df,
+            view_df=view_df,
+            view_name_col=view_name_col,
+            cols=cols,
+            cols_view=cols_view,
     ).empty:
         return True
     else:
@@ -373,6 +399,7 @@ def is_tiling(
     df_view_col="view_region",
     view_name_col="name",
     cols=None,
+    cols_view=None,
 ):
     """
     Tests if a view `view_df` is tiled by the set of genomic intervals in the
@@ -405,6 +432,11 @@ def is_tiling(
         The names of columns containing the chromosome, start and end of the
         genomic intervals, provided separately for each set. The default
         values are 'chrom', 'start', 'end'.
+    cols_view: (str, str, str) or None
+        The names of the columns containing the chromosome, start and
+        end of the genomic intervals in view_df. This is independent of
+        `cols`, which names the corresponding columns in df.
+        The default values are 'chrom', 'start', 'end'.
 
     Returns
     -------
@@ -412,21 +444,28 @@ def is_tiling(
 
     """
 
-    view_df = construction.make_viewframe(
-        view_df, view_name_col=view_name_col, cols=cols
-    )
+    view_df = construction.make_viewframe(view_df,
+                                          view_name_col=view_name_col,
+                                          cols=cols_view)
 
-    if is_overlapping(df):
+    if is_overlapping(df, cols=cols):
         if raise_errors:
             raise ValueError("overlaps")
         return False
-    if not is_covering(df, view_df, view_name_col=view_name_col, cols=None):
+    if not is_covering(df,
+                       view_df,
+                       view_name_col=view_name_col,
+                       cols=cols,
+                       cols_view=cols_view):
         if raise_errors:
             raise ValueError("not covered")
         return False
-    if not is_contained(
-        df, view_df, df_view_col=df_view_col, view_name_col=view_name_col, cols=None
-    ):
+    if not is_contained(df,
+                        view_df,
+                        df_view_col=df_view_col,
+                        view_name_col=view_name_col,
+                        cols=cols,
+                        cols_view=cols_view):
         if raise_errors:
             raise ValueError("not contained")
         return False
@@ -440,6 +479,7 @@ def is_sorted(
     df_view_col=None,
     view_name_col="name",
     cols=None,
+    cols_view=None,
 ):
     """
     Tests if a bedframe is changed by sorting.
@@ -472,6 +512,12 @@ def is_sorted(
         genomic intervals, provided separately for each set. The default
         values are 'chrom', 'start', 'end'.
 
+    cols_view: (str, str, str) or None
+        The names of the columns containing the chromosome, start and
+        end of the genomic intervals in view_df. This is independent of
+        `cols`, which names the corresponding columns in df.
+        The default values are 'chrom', 'start', 'end'.
+
     Returns
     -------
     is_sorted : bool
@@ -486,6 +532,7 @@ def is_sorted(
         df_view_col=df_view_col,
         view_name_col=view_name_col,
         cols=cols,
+        cols_view=cols_view,
     )
 
     if df.equals(df_sorted):
diff --git a/tests/test_core_checks.py b/tests/test_core_checks.py
index 03ce543..7f7ace4 100644
--- a/tests/test_core_checks.py
+++ b/tests/test_core_checks.py
@@ -18,7 +18,7 @@
 
 
 def test_is_cataloged():
-    ### chr2q is not in view
+    #   chr2q is not in view
     view_df = pd.DataFrame(
         [
             ["chr1", 0, 12, "chr1p"],
@@ -38,7 +38,7 @@ def test_is_cataloged():
     )
     assert not is_cataloged(df, view_df)
 
-    ### chr1q is in view, df_view_col and view_name_col have funny labels.
+    #    chr1q is in view, df_view_col and view_name_col have funny labels.
     view_df = pd.DataFrame(
         [
             ["chr1", 0, 12, "chr1p"],
@@ -56,7 +56,10 @@ def test_is_cataloged():
         columns=["chrom", "start", "end", "funny_view_region"],
     )
     assert is_cataloged(
-        df, view_df, df_view_col="funny_view_region", view_name_col="funny_name"
+        df,
+        view_df,
+        df_view_col="funny_view_region",
+        view_name_col="funny_name"
     )
 
 
@@ -71,7 +74,7 @@ def test_is_contained():
         columns=["chrom", "start", "end", "name"],
     )
 
-    ### not contained because chr2q is not cataloged
+    #   not contained because chr2q is not cataloged
     df = pd.DataFrame(
         [
             ["chr1", 0, 12, "chr1p"],
@@ -82,7 +85,7 @@ def test_is_contained():
     )
     assert not is_contained(df, view_df, df_view_col="view_region")
 
-    ### not contained because second interval falls outside the view regions
+    #   not contained because second interval falls outside the view regions
     df = pd.DataFrame(
         [
             ["chr1", 14, 15, "chr1p"],
@@ -104,10 +107,49 @@ def test_is_contained():
     # is contained, because assignments are inferred
     assert is_contained(df, view_df)
 
+    # explicit assignments with non-standard column names in both
+    # dataframes, passed via cols and cols_view
+    view_df = pd.DataFrame(
+        [
+            ["chr1", 0, 12, "chr1p"],
+            ["chr1", 13, 26, "chr1q"],
+            ["chrX", 1, 8, "chrX_0"],
+        ],
+        columns=["CHROM", "START", "END", "NAME"],
+    )
+    df = pd.DataFrame(
+        [
+            ["chr1", 12, 12, "chr1p"],
+            ["chr1", 13, 14, "chr1q"],
+            ["chrX", 1, 8, "chrX_0"],
+        ],
+        columns=["chrom1", "start1", "end1", "VIEW_REGION"],
+    )
+    assert is_contained(
+        df,
+        view_df,
+        cols=["chrom1", "start1", "end1"],
+        cols_view=["CHROM", "START", "END"],
+        df_view_col="VIEW_REGION",
+        view_name_col="NAME"
+        )
+
+    with pytest.raises(TypeError):
+        # cols and cols_view are not passed as arguments
+        is_contained(df, view_df, raise_errors=True)
+
     # is not contained, because assignments are not inferred
+    view_df = pd.DataFrame(
+        [
+            ["chr1", 0, 12, "chr1p"],
+            ["chr1", 13, 26, "chr1q"],
+            ["chrX", 1, 8, "chrX_0"],
+        ],
+        columns=["chrom", "start", "end", "name"],
+    )
     assert not is_contained(df, view_df, df_view_col="view_region")
 
-    ### second interval falls completely out of the view
+    #   second interval falls completely out of the view
     df = pd.DataFrame(
         [
             ["chr1", 12, 12, "chr1p"],
@@ -125,7 +167,7 @@ def test_is_contained():
 
 
 def test_is_overlapping():
-    ### interval on chr1 overlaps
+    #   interval on chr1 overlaps
     d = """chrom  start  end
          0  chr1      3    6
          1  chr1     5   10
@@ -133,7 +175,7 @@ def test_is_overlapping():
     df = pd.read_csv(StringIO(d), sep=r"\s+")
     assert is_overlapping(df)
 
-    ### adjacent intervals do not overlap
+    #   adjacent intervals do not overlap
     d = """chrom  start  end
          0  chr1    3     6
          1  chr1    6    10
@@ -143,8 +185,8 @@ def test_is_overlapping():
 
 
 def test_is_covering():
-    ### test is_covering where an interval from df completely overlaps
-    ### two different regions from view
+    #   test is_covering where an interval from df completely overlaps
+    #   two different regions from view
     df1 = pd.DataFrame(
         [
             ["chr1", -5, 25],
@@ -154,8 +196,8 @@ def test_is_covering():
     chromsizes = [("chr1", 0, 9, "chr1p"), ("chr1", 11, 20, "chr1q")]
     assert is_covering(df1, chromsizes)
 
-    ### test is_covering where two intervals from df overlap
-    ### two different regions from view
+    #   test is_covering where two intervals from df overlap
+    #   two different regions from view
     df1 = pd.DataFrame(
         [
             ["chr1", -5, 10],
@@ -167,8 +209,36 @@ def test_is_covering():
     chromsizes = [("chr1", 0, 9, "chr1p"), ("chr1", 11, 20, "chr1q")]
     assert is_covering(df1, chromsizes)
 
-    ### test is_covering where two intervals from df overlap
-    ### two different regions from view
+    #   test is_covering with non-standard column names
+    df1 = pd.DataFrame(
+        [
+            ["chr1", -5, 10],
+            ["chr1", 11, 12],
+            ["chr1", 12, 20],
+        ],
+        columns=["chrom1", "start1", "end1"],
+    )
+    chromsizes = pd.DataFrame(
+        [
+            ["chr1", 0, 9, "chr1p"],
+            ["chr1", 11, 20, "chr1q"]
+        ],
+        columns=["CHROM", "START", "END", "NAME"],
+        )
+    assert is_covering(
+                        df1,
+                        chromsizes,
+                        cols=["chrom1", "start1", "end1"],
+                        cols_view=["CHROM", "START", "END"],
+                        view_name_col="NAME"
+                    )
+
+    with pytest.raises(ValueError):
+        # cols and cols_view are not passed as arguments
+        is_covering(df1, chromsizes)
+
+    #   test is_covering where two intervals from df overlap
+    #   two different regions from view
     df1 = pd.DataFrame(
         [
             ["chr1", -5, 10, "chr1q"],
@@ -182,7 +252,7 @@ def test_is_covering():
 
 
 def test_is_tiling():
-    ### view region chr1p is tiled by one interval, chr1q is tiled by two
+    #   view region chr1p is tiled by one interval, chr1q is tiled by two
     df1 = pd.DataFrame(
         [
             ["chr1", 0, 9, "chr1p"],
@@ -194,7 +264,35 @@ def test_is_tiling():
     chromsizes = [("chr1", 0, 9, "chr1p"), ("chr1", 11, 20, "chr1q")]
     assert is_tiling(df1, chromsizes)
 
-    ### not tiling, since (chr1,0,9) is associated with chr1q
+    #   testing for non-standard column names
+    df1 = pd.DataFrame(
+        [
+            ["chr1", 0, 9, "chr1p"],
+            ["chr1", 11, 12, "chr1q"],
+            ["chr1", 12, 20, "chr1q"],
+        ],
+        columns=["chrom1", "start1", "end1", "view_region"],
+    )
+    chromsizes = pd.DataFrame(
+        [
+            ["chr1", 0, 9, "chr1p"],
+            ["chr1", 11, 20, "chr1q"]
+        ],
+        columns=["CHROM", "START", "END", "NAME"],
+        )
+    assert is_tiling(
+                        df1,
+                        chromsizes,
+                        cols=["chrom1", "start1", "end1"],
+                        cols_view=["CHROM", "START", "END"],
+                        view_name_col="NAME"
+                    )
+
+    with pytest.raises(KeyError):
+        # cols and cols_view are not passed as arguments
+        is_tiling(df1, chromsizes)
+
+    #  not tiling, since (chr1,0,9) is associated with chr1q
     df1 = pd.DataFrame(
         [
             ["chr1", 0, 9, "chr1q"],
@@ -206,7 +304,7 @@ def test_is_tiling():
     chromsizes = [("chr1", 0, 9, "chr1p"), ("chr1", 11, 20, "chr1q")]
     assert not is_tiling(df1, chromsizes)
 
-    ### not tiling, contains overlaps
+    #   not tiling, contains overlaps
     df1 = pd.DataFrame(
         [
             ["chr1", 0, 9, "chr1p"],
@@ -218,7 +316,7 @@ def test_is_tiling():
     chromsizes = [("chr1", 0, 9, "chr1p"), ("chr1", 11, 20, "chr1q")]
     assert not is_tiling(df1, chromsizes)
 
-    ### not tiling, since it doesn't cover
+    #   not tiling, since it doesn't cover
     df1 = pd.DataFrame(
         [
             ["chr1", 11, 12, "chr1q"],
@@ -231,7 +329,7 @@ def test_is_tiling():
 
 
 def test_is_bedframe():
-    ##missing a column
+    #   missing a column
     df1 = pd.DataFrame(
         [
             ["chr1", 11],
@@ -241,7 +339,7 @@ def test_is_bedframe():
     )
     assert not is_bedframe(df1)
 
-    ### end column has invalid dtype
+    #   end column has invalid dtype
     df1 = pd.DataFrame(
         [
             ["chr1", 10, "20"],
@@ -251,7 +349,7 @@ def test_is_bedframe():
     )
     assert not is_bedframe(df1)
 
-    ### second interval start > ends.
+    #   second interval start > ends.
     df1 = pd.DataFrame(
         [
             ["chr1", 10, 20],
@@ -261,7 +359,7 @@ def test_is_bedframe():
     )
     assert not is_bedframe(df1)
 
-    ### third interval has a null in one column
+    #   third interval has a null in one column
     df1 = pd.DataFrame(
         [
             ["chr1", 10, 20, "first"],
@@ -278,7 +376,7 @@ def test_is_bedframe():
     with pytest.raises(ValueError):
         is_bedframe(df1, raise_errors=True)
 
-    ### first interval is completely NA
+    # first interval is completely NA
     df1 = pd.DataFrame(
         [
             [pd.NA, pd.NA, pd.NA, "first"],
@@ -363,6 +461,29 @@ def test_is_sorted():
         view_df, view_df=view_df, view_name_col="fruit", df_view_col="fruit"
     )
 
+    # testing for non-standard column names
+    view_df = pd.DataFrame(
+        [
+            ["chrX", 1, 8, "oranges"],
+            ["chrX", 8, 20, "grapefruit"],
+            ["chr1", 0, 10, "apples"],
+        ],
+        columns=["CHROM", "START", "END", "FRUIT"],
+    )
+
+    assert is_sorted(
+        view_df,
+        view_df=view_df,
+        view_name_col="FRUIT",
+        df_view_col="FRUIT",
+        cols=["CHROM", "START", "END"],
+        cols_view=["CHROM", "START", "END"]
+    )
+
+    with pytest.raises(ValueError):
+        # cols and cols_view are not passed as arguments
+        is_sorted(view_df, view_df=view_df)
+
     df = pd.DataFrame(
         [
             ["chr1", 0, 10, "+"],
@@ -375,6 +496,14 @@ def test_is_sorted():
 
     assert not is_sorted(df)
 
+    view_df = pd.DataFrame(
+        [
+            ["chrX", 1, 8, "oranges"],
+            ["chrX", 8, 20, "grapefruit"],
+            ["chr1", 0, 10, "apples"],
+        ],
+        columns=["chrom", "start", "end", "fruit"],
+    )
     bfs = sort_bedframe(df, view_df=view_df, view_name_col="fruit")
 
     assert is_sorted(bfs, view_df=view_df, view_name_col="fruit")