Skip to content

Commit

Permalink
core.checks functions now support custom cols, df_view_col, cols_view (
Browse files Browse the repository at this point in the history
#182)

* cols, df_view_col, and cols_view arguments are now passed to downstream functions in core.checks, enabling user-defined column arguments for: `is_covering`, `is_tiling`, `is_sorted`, `is_contained`
* positive and negative test cases covering these arguments added for: `is_covering`, `is_tiling`, `is_sorted`, `is_contained`
closes #126
  • Loading branch information
smitkadvani authored Jan 19, 2024
1 parent 933c92b commit bf0416a
Show file tree
Hide file tree
Showing 2 changed files with 249 additions and 73 deletions.
147 changes: 97 additions & 50 deletions bioframe/core/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def is_bedframe(
- chrom, start, end columns
- columns have valid dtypes
- for each interval, if any of chrom, start, end are null, then all are null
- for each interval, if any of chrom, start, end are null, then all are
null
- all starts < ends.
Parameters
Expand Down Expand Up @@ -61,7 +62,8 @@ def is_bedframe(
raise TypeError("Invalid bedFrame: Invalid column names")
return False

if not _verify_column_dtypes(df, cols=[ck1, sk1, ek1], return_as_bool=True):
if not _verify_column_dtypes(df, cols=[ck1, sk1, ek1],
return_as_bool=True):
if raise_errors:
raise TypeError("Invalid bedFrame: Invalid column dtypes")
return False
Expand All @@ -77,18 +79,18 @@ def is_bedframe(

if ((df[ek1] - df[sk1]) < 0).any():
if raise_errors:
raise ValueError(
f"Invalid bedframe: starts exceed ends for "
f"{sum((df[ek1] - df[sk1]) < 0)} intervals"
)
raise ValueError(f"Invalid bedframe: starts exceed ends for "
f"{sum((df[ek1] - df[sk1]) < 0)} intervals")
return False

return True


def is_cataloged(
df, view_df, raise_errors=False, df_view_col="view_region", view_name_col="name"
):
def is_cataloged(df,
view_df,
raise_errors=False,
df_view_col="view_region",
view_name_col="name"):
"""
Tests if all region names in `df[df_view_col]` are present in
`view_df[view_name_col]`.
Expand Down Expand Up @@ -125,20 +127,19 @@ def is_cataloged(

if not _verify_columns(view_df, [view_name_col], return_as_bool=True):
if raise_errors:
raise ValueError(f"Could not find `{view_name_col}` column in view_df")
raise ValueError(f"Could not find \
`{view_name_col}` \
column in view_df")
return False

if not set(df[df_view_col].copy().dropna().values).issubset(
set(view_df[view_name_col].values)
):
set(view_df[view_name_col].values)):
if raise_errors:
missing_regions = set(df[df_view_col].values).difference(
set(view_df[view_name_col].values)
)
set(view_df[view_name_col].values))
raise ValueError(
f"The following regions in df[df_view_col] not in "
f"view_df[view_name_col]: \n{missing_regions}"
)
f"view_df[view_name_col]: \n{missing_regions}")
return False

return True
Expand Down Expand Up @@ -171,15 +172,19 @@ def is_overlapping(df, cols=None):
df_merged = merge(df, cols=cols)

total_interval_len = np.sum((df[ek1] - df[sk1]).values)
total_interval_len_merged = np.sum((df_merged[ek1] - df_merged[sk1]).values)
total_interval_len_merged = np.sum(
(df_merged[ek1] - df_merged[sk1]).values)

if total_interval_len > total_interval_len_merged:
return True
else:
return False


def is_viewframe(region_df, raise_errors=False, view_name_col="name", cols=None):
def is_viewframe(region_df,
raise_errors=False,
view_name_col="name",
cols=None):
"""
Checks that `region_df` is a valid viewFrame.
Expand Down Expand Up @@ -218,9 +223,8 @@ def is_viewframe(region_df, raise_errors=False, view_name_col="name", cols=None)

ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols

if not _verify_columns(
region_df, [ck1, sk1, ek1, view_name_col], return_as_bool=True
):
if not _verify_columns(region_df, [ck1, sk1, ek1, view_name_col],
return_as_bool=True):
if raise_errors:
raise TypeError("Invalid view: invalid column names")
return False
Expand All @@ -235,11 +239,11 @@ def is_viewframe(region_df, raise_errors=False, view_name_col="name", cols=None)
raise ValueError("Invalid view: cannot contain NAs")
return False

if len(set(region_df[view_name_col])) < len(region_df[view_name_col].values):
if len(set(region_df[view_name_col])) < \
len(region_df[view_name_col].values):
if raise_errors:
raise ValueError(
"Invalid view: entries in region_df[view_name_col] must be unique"
)
raise ValueError("Invalid view: entries in \
region_df[view_name_col] must be unique")
return False

if is_overlapping(region_df, cols=cols):
Expand All @@ -257,6 +261,7 @@ def is_contained(
df_view_col=None,
view_name_col="name",
cols=None,
cols_view=None,
):
"""
Tests if all genomic intervals in a bioframe `df` are cataloged and do not
Expand All @@ -277,25 +282,36 @@ def is_contained(
Column from df used to associate intervals with view regions.
Default `view_region`.
view_name_col:
Column from view_df with view region names. Default `name`.
cols: (str, str, str)
Column names for chrom, start, end in df.
cols_view: (str, str, str)
Column names for chrom, start, end in view_df.
Returns
-------
is_contained:bool
"""
from ..ops import trim

ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols

ck2, sk2, ek2 = _get_default_colnames() if cols_view is None else cols_view
if df_view_col is None:
try:
df_view_assigned = ops.overlap(df, view_df)
assert (df_view_assigned["end_"].isna()).sum() == 0
assert (df_view_assigned["start_"].isna()).sum() == 0
assert (df_view_assigned["end"] <= df_view_assigned["end_"]).all()
assert (df_view_assigned["start"] >= df_view_assigned["start_"]).all()
df_view_assigned = ops.overlap(df,
view_df,
cols1=cols,
cols2=cols_view)
# with the default column names, ek2 + "_" is "end_"
assert (df_view_assigned[ek2 + "_"].isna()).sum() == 0
# with the default column names, sk2 + "_" is "start_"
assert (df_view_assigned[sk2 + "_"].isna()).sum() == 0
assert (df_view_assigned[ek1] <= df_view_assigned[ek2 + "_"]).all()
# ek1 = end is the default value
# sk1 = start is the default value
assert (df_view_assigned[sk1] >= df_view_assigned[sk2 + "_"]).all()
except AssertionError:
if raise_errors:
raise AssertionError("df not contained in view_df")
Expand All @@ -304,15 +320,18 @@ def is_contained(
return True

if not is_cataloged(
df, view_df, df_view_col=df_view_col, view_name_col=view_name_col
):
df, view_df, df_view_col=df_view_col, view_name_col=view_name_col):
if raise_errors:
raise ValueError("df not cataloged in view_df")
return False

df_trim = trim(
df, view_df=view_df, df_view_col=df_view_col, view_name_col=view_name_col
)
df_trim = trim(df,
view_df=view_df,
df_view_col=df_view_col,
view_name_col=view_name_col,
cols=cols,
cols_view=cols_view)

is_start_trimmed = np.any(df[sk1].values != df_trim[sk1].values)
is_end_trimmed = np.any(df[ek1].values != df_trim[ek1].values)

Expand All @@ -324,7 +343,7 @@ def is_contained(
return True


def is_covering(df, view_df, view_name_col="name", cols=None):
def is_covering(df, view_df, view_name_col="name", cols=None, cols_view=None):
"""
Tests if a view `view_df` is covered by the set of genomic intervals in
the bedframe `df`.
Expand All @@ -348,6 +367,12 @@ def is_covering(df, view_df, view_name_col="name", cols=None):
genomic intervals, provided separately for each set. The default
values are 'chrom', 'start', 'end'.
cols_view: (str, str, str) or None
The names of columns containing the chromosome, start and end of the
genomic intervals in view_df. The default values are 'chrom', 'start',
'end'.
Returns
-------
is_covering:bool
Expand All @@ -356,10 +381,11 @@ def is_covering(df, view_df, view_name_col="name", cols=None):
from ..ops import complement

if complement(
df,
view_df=view_df,
view_name_col=view_name_col,
cols=cols,
df,
view_df=view_df,
view_name_col=view_name_col,
cols=cols,
cols_view=cols_view,
).empty:
return True
else:
Expand All @@ -373,6 +399,7 @@ def is_tiling(
df_view_col="view_region",
view_name_col="name",
cols=None,
cols_view=None,
):
"""
Tests if a view `view_df` is tiled by the set of genomic intervals in the
Expand Down Expand Up @@ -405,28 +432,40 @@ def is_tiling(
The names of columns containing the chromosome, start and end of the
genomic intervals, provided separately for each set. The default
values are 'chrom', 'start', 'end'.
cols_view: (str, str, str) or None
The names of columns containing the chromosome, start and end of the
genomic intervals in view_df. The default values are 'chrom', 'start',
'end'.
Returns
-------
is_tiling:bool
"""

view_df = construction.make_viewframe(
view_df, view_name_col=view_name_col, cols=cols
)
view_df = construction.make_viewframe(view_df,
view_name_col=view_name_col,
cols=cols_view)

if is_overlapping(df):
if is_overlapping(df, cols=cols):
if raise_errors:
raise ValueError("overlaps")
return False
if not is_covering(df, view_df, view_name_col=view_name_col, cols=None):
if not is_covering(df,
view_df,
view_name_col=view_name_col,
cols=cols,
cols_view=cols_view):
if raise_errors:
raise ValueError("not covered")
return False
if not is_contained(
df, view_df, df_view_col=df_view_col, view_name_col=view_name_col, cols=None
):
if not is_contained(df,
view_df,
df_view_col=df_view_col,
view_name_col=view_name_col,
cols=cols,
cols_view=cols_view):
if raise_errors:
raise ValueError("not contained")
return False
Expand All @@ -440,6 +479,7 @@ def is_sorted(
df_view_col=None,
view_name_col="name",
cols=None,
cols_view=None,
):
"""
Tests if a bedframe is changed by sorting.
Expand Down Expand Up @@ -472,6 +512,12 @@ def is_sorted(
genomic intervals, provided separately for each set. The default
values are 'chrom', 'start', 'end'.
cols_view: (str, str, str) or None
The names of columns containing the chromosome, start and end of the
genomic intervals in view_df. The default values are 'chrom', 'start',
'end'.
Returns
-------
is_sorted : bool
Expand All @@ -486,6 +532,7 @@ def is_sorted(
df_view_col=df_view_col,
view_name_col=view_name_col,
cols=cols,
cols_view=cols_view,
)

if df.equals(df_sorted):
Expand Down
Loading

0 comments on commit bf0416a

Please sign in to comment.