From bf0416ab3a7bd4bc4594915e3e6f27bb06fe05b4 Mon Sep 17 00:00:00 2001 From: smit kadvani Date: Fri, 19 Jan 2024 13:55:13 -0800 Subject: [PATCH] core.checks functions now support custom cols, df_view_col, cols_view (#182) * cols, df_view_col, and cols_view arguments now passed to downstream functions in core.checks, enabling user-defined column arguments for: `is_covering`, `is_tiling`, `is_sorted`, `is_contained` * positive & negative test cases covering these arguments added for: `is_covering`, `is_tiling`, `is_sorted`, `is_contained` closes #126 --- bioframe/core/checks.py | 147 +++++++++++++++++++++----------- tests/test_core_checks.py | 175 +++++++++++++++++++++++++++++++++----- 2 files changed, 249 insertions(+), 73 deletions(-) diff --git a/bioframe/core/checks.py b/bioframe/core/checks.py index 7b0aced..8256eca 100644 --- a/bioframe/core/checks.py +++ b/bioframe/core/checks.py @@ -29,7 +29,8 @@ def is_bedframe( - chrom, start, end columns - columns have valid dtypes - - for each interval, if any of chrom, start, end are null, then all are null + - for each interval, if any of chrom, start, end are null, then all are + null - all starts < ends. 
Parameters @@ -61,7 +62,8 @@ def is_bedframe( raise TypeError("Invalid bedFrame: Invalid column names") return False - if not _verify_column_dtypes(df, cols=[ck1, sk1, ek1], return_as_bool=True): + if not _verify_column_dtypes(df, cols=[ck1, sk1, ek1], + return_as_bool=True): if raise_errors: raise TypeError("Invalid bedFrame: Invalid column dtypes") return False @@ -77,18 +79,18 @@ def is_bedframe( if ((df[ek1] - df[sk1]) < 0).any(): if raise_errors: - raise ValueError( - f"Invalid bedframe: starts exceed ends for " - f"{sum((df[ek1] - df[sk1]) < 0)} intervals" - ) + raise ValueError(f"Invalid bedframe: starts exceed ends for " + f"{sum((df[ek1] - df[sk1]) < 0)} intervals") return False return True -def is_cataloged( - df, view_df, raise_errors=False, df_view_col="view_region", view_name_col="name" -): +def is_cataloged(df, + view_df, + raise_errors=False, + df_view_col="view_region", + view_name_col="name"): """ Tests if all region names in `df[df_view_col]` are present in `view_df[view_name_col]`. 
@@ -125,20 +127,19 @@ def is_cataloged( if not _verify_columns(view_df, [view_name_col], return_as_bool=True): if raise_errors: - raise ValueError(f"Could not find `{view_name_col}` column in view_df") + raise ValueError(f"Could not find \ + `{view_name_col}` \ + column in view_df") return False if not set(df[df_view_col].copy().dropna().values).issubset( - set(view_df[view_name_col].values) - ): + set(view_df[view_name_col].values)): if raise_errors: missing_regions = set(df[df_view_col].values).difference( - set(view_df[view_name_col].values) - ) + set(view_df[view_name_col].values)) raise ValueError( f"The following regions in df[df_view_col] not in " - f"view_df[view_name_col]: \n{missing_regions}" - ) + f"view_df[view_name_col]: \n{missing_regions}") return False return True @@ -171,7 +172,8 @@ def is_overlapping(df, cols=None): df_merged = merge(df, cols=cols) total_interval_len = np.sum((df[ek1] - df[sk1]).values) - total_interval_len_merged = np.sum((df_merged[ek1] - df_merged[sk1]).values) + total_interval_len_merged = np.sum( + (df_merged[ek1] - df_merged[sk1]).values) if total_interval_len > total_interval_len_merged: return True @@ -179,7 +181,10 @@ def is_overlapping(df, cols=None): return False -def is_viewframe(region_df, raise_errors=False, view_name_col="name", cols=None): +def is_viewframe(region_df, + raise_errors=False, + view_name_col="name", + cols=None): """ Checks that `region_df` is a valid viewFrame. 
@@ -218,9 +223,8 @@ def is_viewframe(region_df, raise_errors=False, view_name_col="name", cols=None) ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols - if not _verify_columns( - region_df, [ck1, sk1, ek1, view_name_col], return_as_bool=True - ): + if not _verify_columns(region_df, [ck1, sk1, ek1, view_name_col], + return_as_bool=True): if raise_errors: raise TypeError("Invalid view: invalid column names") return False @@ -235,11 +239,11 @@ def is_viewframe(region_df, raise_errors=False, view_name_col="name", cols=None) raise ValueError("Invalid view: cannot contain NAs") return False - if len(set(region_df[view_name_col])) < len(region_df[view_name_col].values): + if len(set(region_df[view_name_col])) < \ + len(region_df[view_name_col].values): if raise_errors: - raise ValueError( - "Invalid view: entries in region_df[view_name_col] must be unique" - ) + raise ValueError("Invalid view: entries in \ + region_df[view_name_col] must be unique") return False if is_overlapping(region_df, cols=cols): @@ -257,6 +261,7 @@ def is_contained( df_view_col=None, view_name_col="name", cols=None, + cols_view=None, ): """ Tests if all genomic intervals in a bioframe `df` are cataloged and do not @@ -277,8 +282,13 @@ def is_contained( Column from df used to associate interviews with view regions. Default `view_region`. + view_name_col: + Column from view_df with view region names. Default `name`. + cols: (str, str, str) Column names for chrom, start, end in df. + cols_view: (str, str, str) + Column names for chrom, start, end in view_df. 
Returns ------- @@ -286,16 +296,22 @@ def is_contained( """ from ..ops import trim - ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols - + ck2, sk2, ek2 = _get_default_colnames() if cols_view is None else cols_view if df_view_col is None: try: - df_view_assigned = ops.overlap(df, view_df) - assert (df_view_assigned["end_"].isna()).sum() == 0 - assert (df_view_assigned["start_"].isna()).sum() == 0 - assert (df_view_assigned["end"] <= df_view_assigned["end_"]).all() - assert (df_view_assigned["start"] >= df_view_assigned["start_"]).all() + df_view_assigned = ops.overlap(df, + view_df, + cols1=cols, + cols2=cols_view) + # ek2 = end_ is the default value + assert (df_view_assigned[ek2 + "_"].isna()).sum() == 0 + # sk2 = start_ is the default value + assert (df_view_assigned[sk2 + "_"].isna()).sum() == 0 + assert (df_view_assigned[ek1] <= df_view_assigned[ek2 + "_"]).all() + # ek1 = end is the default value + # sk1 = start is the default value + assert (df_view_assigned[sk1] >= df_view_assigned[sk2 + "_"]).all() except AssertionError: if raise_errors: raise AssertionError("df not contained in view_df") @@ -304,15 +320,18 @@ def is_contained( return True if not is_cataloged( - df, view_df, df_view_col=df_view_col, view_name_col=view_name_col - ): + df, view_df, df_view_col=df_view_col, view_name_col=view_name_col): if raise_errors: raise ValueError("df not cataloged in view_df") return False - df_trim = trim( - df, view_df=view_df, df_view_col=df_view_col, view_name_col=view_name_col - ) + df_trim = trim(df, + view_df=view_df, + df_view_col=df_view_col, + view_name_col=view_name_col, + cols=cols, + cols_view=cols_view) + is_start_trimmed = np.any(df[sk1].values != df_trim[sk1].values) is_end_trimmed = np.any(df[ek1].values != df_trim[ek1].values) @@ -324,7 +343,7 @@ def is_contained( return True -def is_covering(df, view_df, view_name_col="name", cols=None): +def is_covering(df, view_df, view_name_col="name", cols=None, cols_view=None): """ Tests if a 
view `view_df` is covered by the set of genomic intervals in the bedframe `df`. @@ -348,6 +367,12 @@ def is_covering(df, view_df, view_name_col="name", cols=None): genomic intervals, provided separately for each set. The default values are 'chrom', 'start', 'end'. + cols_view: (str, str, str) or None + The names of columns containing the chromosome, start and end of the + genomic intervals in view_df, provided separately for + each set. The default + values are 'chrom', 'start', 'end'. + Returns ------- is_covering:bool @@ -356,10 +381,11 @@ def is_covering(df, view_df, view_name_col="name", cols=None): from ..ops import complement if complement( - df, - view_df=view_df, - view_name_col=view_name_col, - cols=cols, + df, + view_df=view_df, + view_name_col=view_name_col, + cols=cols, + cols_view=cols_view, ).empty: return True else: @@ -373,6 +399,7 @@ def is_tiling( df_view_col="view_region", view_name_col="name", cols=None, + cols_view=None, ): """ Tests if a view `view_df` is tiled by the set of genomic intervals in the @@ -405,6 +432,11 @@ def is_tiling( The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set. The default values are 'chrom', 'start', 'end'. + cols_view: (str, str, str) or None + The names of columns containing the chromosome, start and end of the + genomic intervals in view_df, provided + separately for each set. The default + values are 'chrom', 'start', 'end'. 
Returns ------- @@ -412,21 +444,28 @@ def is_tiling( """ - view_df = construction.make_viewframe( - view_df, view_name_col=view_name_col, cols=cols - ) + view_df = construction.make_viewframe(view_df, + view_name_col=view_name_col, + cols=cols_view) - if is_overlapping(df): + if is_overlapping(df, cols=cols): if raise_errors: raise ValueError("overlaps") return False - if not is_covering(df, view_df, view_name_col=view_name_col, cols=None): + if not is_covering(df, + view_df, + view_name_col=view_name_col, + cols=cols, + cols_view=cols_view): if raise_errors: raise ValueError("not covered") return False - if not is_contained( - df, view_df, df_view_col=df_view_col, view_name_col=view_name_col, cols=None - ): + if not is_contained(df, + view_df, + df_view_col=df_view_col, + view_name_col=view_name_col, + cols=cols, + cols_view=cols_view): if raise_errors: raise ValueError("not contained") return False @@ -440,6 +479,7 @@ def is_sorted( df_view_col=None, view_name_col="name", cols=None, + cols_view=None, ): """ Tests if a bedframe is changed by sorting. @@ -472,6 +512,12 @@ def is_sorted( genomic intervals, provided separately for each set. The default values are 'chrom', 'start', 'end'. + cols_view: (str, str, str) or None + The names of columns containing the chromosome, start and end of the + genomic intervals in view_df, provided separately for each set. + The default + values are 'chrom', 'start', 'end'. 
+ Returns ------- is_sorted : bool @@ -486,6 +532,7 @@ def is_sorted( df_view_col=df_view_col, view_name_col=view_name_col, cols=cols, + cols_view=cols_view, ) if df.equals(df_sorted): diff --git a/tests/test_core_checks.py b/tests/test_core_checks.py index 03ce543..7f7ace4 100644 --- a/tests/test_core_checks.py +++ b/tests/test_core_checks.py @@ -18,7 +18,7 @@ def test_is_cataloged(): - ### chr2q is not in view + # chr2q is not in view view_df = pd.DataFrame( [ ["chr1", 0, 12, "chr1p"], @@ -38,7 +38,7 @@ def test_is_cataloged(): ) assert not is_cataloged(df, view_df) - ### chr1q is in view, df_view_col and view_name_col have funny labels. + # chr1q is in view, df_view_col and view_name_col have funny labels. view_df = pd.DataFrame( [ ["chr1", 0, 12, "chr1p"], @@ -56,7 +56,10 @@ def test_is_cataloged(): columns=["chrom", "start", "end", "funny_view_region"], ) assert is_cataloged( - df, view_df, df_view_col="funny_view_region", view_name_col="funny_name" + df, + view_df, + df_view_col="funny_view_region", + view_name_col="funny_name" ) @@ -71,7 +74,7 @@ def test_is_contained(): columns=["chrom", "start", "end", "name"], ) - ### not contained because chr2q is not cataloged + # not contained because chr2q is not cataloged df = pd.DataFrame( [ ["chr1", 0, 12, "chr1p"], @@ -82,7 +85,7 @@ def test_is_contained(): ) assert not is_contained(df, view_df, df_view_col="view_region") - ### not contained because second interval falls outside the view regions + # not contained because second interval falls outside the view regions df = pd.DataFrame( [ ["chr1", 14, 15, "chr1p"], @@ -104,10 +107,49 @@ def test_is_contained(): # is contained, because assignments are inferred assert is_contained(df, view_df) + # using previous assignments with non-standard column names + # in dataframes without argument + view_df = pd.DataFrame( + [ + ["chr1", 0, 12, "chr1p"], + ["chr1", 13, 26, "chr1q"], + ["chrX", 1, 8, "chrX_0"], + ], + columns=["CHROM", "START", "END", "NAME"], + ) + df = 
pd.DataFrame( + [ + ["chr1", 12, 12, "chr1p"], + ["chr1", 13, 14, "chr1q"], + ["chrX", 1, 8, "chrX_0"], + ], + columns=["chrom1", "start1", "end1", "VIEW_REGION"], + ) + assert is_contained( + df, + view_df, + cols=["chrom1", "start1", "end1"], + cols_view=["CHROM", "START", "END"], + df_view_col="VIEW_REGION", + view_name_col="NAME" + ) + + with pytest.raises(TypeError): + # cols and view_cols are not passed as an arguments + is_contained(df, view_df, raise_errors=True) + # is not contained, because assignments are not inferred + view_df = pd.DataFrame( + [ + ["chr1", 0, 12, "chr1p"], + ["chr1", 13, 26, "chr1q"], + ["chrX", 1, 8, "chrX_0"], + ], + columns=["chrom", "start", "end", "name"], + ) assert not is_contained(df, view_df, df_view_col="view_region") - ### second interval falls completely out of the view + # second interval falls completely out of the view df = pd.DataFrame( [ ["chr1", 12, 12, "chr1p"], @@ -125,7 +167,7 @@ def test_is_contained(): def test_is_overlapping(): - ### interval on chr1 overlaps + # interval on chr1 overlaps d = """chrom start end 0 chr1 3 6 1 chr1 5 10 @@ -133,7 +175,7 @@ def test_is_overlapping(): df = pd.read_csv(StringIO(d), sep=r"\s+") assert is_overlapping(df) - ### adjacent intervals do not overlap + # adjacent intervals do not overlap d = """chrom start end 0 chr1 3 6 1 chr1 6 10 @@ -143,8 +185,8 @@ def test_is_overlapping(): def test_is_covering(): - ### test is_covering where an interval from df completely overlaps - ### two different regions from view + # test is_covering where an interval from df completely overlaps + # two different regions from view df1 = pd.DataFrame( [ ["chr1", -5, 25], @@ -154,8 +196,8 @@ def test_is_covering(): chromsizes = [("chr1", 0, 9, "chr1p"), ("chr1", 11, 20, "chr1q")] assert is_covering(df1, chromsizes) - ### test is_covering where two intervals from df overlap - ### two different regions from view + # test is_covering where two intervals from df overlap + # two different regions from view 
df1 = pd.DataFrame( [ ["chr1", -5, 10], @@ -167,8 +209,36 @@ def test_is_covering(): chromsizes = [("chr1", 0, 9, "chr1p"), ("chr1", 11, 20, "chr1q")] assert is_covering(df1, chromsizes) - ### test is_covering where two intervals from df overlap - ### two different regions from view + # test is_covering with non-standard columns names + df1 = pd.DataFrame( + [ + ["chr1", -5, 10], + ["chr1", 11, 12], + ["chr1", 12, 20], + ], + columns=["chrom1", "start1", "end1"], + ) + chromsizes = pd.DataFrame( + [ + ["chr1", 0, 9, "chr1p"], + ["chr1", 11, 20, "chr1q"] + ], + columns=["CHROM", "START", "END", "NAME"], + ) + assert is_covering( + df1, + chromsizes, + cols=["chrom1", "start1", "end1"], + cols_view=["CHROM", "START", "END"], + view_name_col="NAME" + ) + + with pytest.raises(ValueError): + # cols and view_cols are not passed as an arguments + is_covering(df1, chromsizes) + + # test is_covering where two intervals from df overlap + # two different regions from view df1 = pd.DataFrame( [ ["chr1", -5, 10, "chr1q"], @@ -182,7 +252,7 @@ def test_is_covering(): def test_is_tiling(): - ### view region chr1p is tiled by one interval, chr1q is tiled by two + # view region chr1p is tiled by one interval, chr1q is tiled by two df1 = pd.DataFrame( [ ["chr1", 0, 9, "chr1p"], @@ -194,7 +264,35 @@ def test_is_tiling(): chromsizes = [("chr1", 0, 9, "chr1p"), ("chr1", 11, 20, "chr1q")] assert is_tiling(df1, chromsizes) - ### not tiling, since (chr1,0,9) is associated with chr1q + # testing for non-standard column names + df1 = pd.DataFrame( + [ + ["chr1", 0, 9, "chr1p"], + ["chr1", 11, 12, "chr1q"], + ["chr1", 12, 20, "chr1q"], + ], + columns=["chrom1", "start1", "end1", "view_region"], + ) + chromsizes = pd.DataFrame( + [ + ["chr1", 0, 9, "chr1p"], + ["chr1", 11, 20, "chr1q"] + ], + columns=["CHROM", "START", "END", "NAME"], + ) + assert is_tiling( + df1, + chromsizes, + cols=["chrom1", "start1", "end1"], + cols_view=["CHROM", "START", "END"], + view_name_col="NAME" + ) + + with 
pytest.raises(KeyError): + # cols and view_cols are not passed as an arguments + is_tiling(df1, chromsizes) + + # not tiling, since (chr1,0,9) is associated with chr1q df1 = pd.DataFrame( [ ["chr1", 0, 9, "chr1q"], @@ -206,7 +304,7 @@ def test_is_tiling(): chromsizes = [("chr1", 0, 9, "chr1p"), ("chr1", 11, 20, "chr1q")] assert not is_tiling(df1, chromsizes) - ### not tiling, contains overlaps + # not tiling, contains overlaps df1 = pd.DataFrame( [ ["chr1", 0, 9, "chr1p"], @@ -218,7 +316,7 @@ def test_is_tiling(): chromsizes = [("chr1", 0, 9, "chr1p"), ("chr1", 11, 20, "chr1q")] assert not is_tiling(df1, chromsizes) - ### not tiling, since it doesn't cover + # not tiling, since it doesn't cover df1 = pd.DataFrame( [ ["chr1", 11, 12, "chr1q"], @@ -231,7 +329,7 @@ def test_is_tiling(): def test_is_bedframe(): - ##missing a column + # missing a column df1 = pd.DataFrame( [ ["chr1", 11], @@ -241,7 +339,7 @@ def test_is_bedframe(): ) assert not is_bedframe(df1) - ### end column has invalid dtype + # end column has invalid dtype df1 = pd.DataFrame( [ ["chr1", 10, "20"], @@ -251,7 +349,7 @@ def test_is_bedframe(): ) assert not is_bedframe(df1) - ### second interval start > ends. + # second interval start > ends. 
df1 = pd.DataFrame( [ ["chr1", 10, 20], @@ -261,7 +359,7 @@ def test_is_bedframe(): ) assert not is_bedframe(df1) - ### third interval has a null in one column + # third interval has a null in one column df1 = pd.DataFrame( [ ["chr1", 10, 20, "first"], @@ -278,7 +376,7 @@ def test_is_bedframe(): with pytest.raises(ValueError): is_bedframe(df1, raise_errors=True) - ### first interval is completely NA + # first interval is completely NA df1 = pd.DataFrame( [ [pd.NA, pd.NA, pd.NA, "first"], @@ -363,6 +461,29 @@ def test_is_sorted(): view_df, view_df=view_df, view_name_col="fruit", df_view_col="fruit" ) + # testing for non-standard column names + view_df = pd.DataFrame( + [ + ["chrX", 1, 8, "oranges"], + ["chrX", 8, 20, "grapefruit"], + ["chr1", 0, 10, "apples"], + ], + columns=["CHROM", "START", "END", "FRUIT"], + ) + + assert is_sorted( + view_df, + view_df=view_df, + view_name_col="FRUIT", + df_view_col="FRUIT", + cols=["CHROM", "START", "END"], + cols_view=["CHROM", "START", "END"] + ) + + with pytest.raises(ValueError): + # cols and view_cols are not passed as an arguments + is_sorted(view_df, view_df=view_df) + df = pd.DataFrame( [ ["chr1", 0, 10, "+"], @@ -375,6 +496,14 @@ def test_is_sorted(): assert not is_sorted(df) + view_df = pd.DataFrame( + [ + ["chrX", 1, 8, "oranges"], + ["chrX", 8, 20, "grapefruit"], + ["chr1", 0, 10, "apples"], + ], + columns=["chrom", "start", "end", "fruit"], + ) bfs = sort_bedframe(df, view_df=view_df, view_name_col="fruit") assert is_sorted(bfs, view_df=view_df, view_name_col="fruit")