From f44f6ff6c610554540204b912dc98101412b5974 Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Fri, 2 Feb 2024 13:01:17 -0800 Subject: [PATCH 01/18] convert point to pseudo segment when both groups are non-empty --- bioframe/core/arrops.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/bioframe/core/arrops.py b/bioframe/core/arrops.py index dc191f1..5c94913 100644 --- a/bioframe/core/arrops.py +++ b/bioframe/core/arrops.py @@ -264,6 +264,23 @@ def _overlap_intervals_legacy(starts1, ends1, starts2, ends2, closed=False, sort return overlap_ids +def get_pseudo_segment(starts, ends): + """ + Get pseudo-segment for overlapping intervals. + + Parameters + ---------- + starts, ends : numpy.ndarray + + Returns + ------- + pseudo_ends : numpy.ndarray + An array of pseudo-ends for overlapping intervals. + + """ + pseudo_ends = ends.copy() + pseudo_ends[ends == starts] += 1 + return [starts, pseudo_ends] def overlap_intervals(starts1, ends1, starts2, ends2, closed=False, sort=False): """ @@ -296,8 +313,11 @@ def overlap_intervals(starts1, ends1, starts2, ends2, closed=False, sort=False): starts1 = np.asarray(starts1) ends1 = np.asarray(ends1) + starts1, ends1 = get_pseudo_segment(starts1, ends1) + starts2 = np.asarray(starts2) ends2 = np.asarray(ends2) + starts2, ends2 = get_pseudo_segment(starts2, ends2) # Concatenate intervals lists n1 = len(starts1) From 35bc7c46a21674c7be267482142aec848de8cf24 Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Fri, 2 Feb 2024 13:25:08 -0800 Subject: [PATCH 02/18] debug statements removed --- tests/test_ops.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/tests/test_ops.py b/tests/test_ops.py index aacce4e..c53c128 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -427,6 +427,94 @@ def test_overlap(): ) assert len(b) == 3 + ### test overlap with point and segment data + df1 = pd.DataFrame( + [ + ['chr1', 1, 1] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + df2 = pd.DataFrame( + [ + ['chr1', 1, 2] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + b = bioframe.overlap( + df1, + df2, + on=None, + how="left", + return_index=True, + return_input=False, + ) + assert np.sum(pd.isna(b["index_"].values)) == 0 + b = bioframe.overlap( + df2, + df1, + on=None, + how="left", + return_index=True, + return_input=False, + ) + assert np.sum(pd.isna(b["index_"].values)) == 0 + + b = bioframe.overlap( + df1, + df2, + on=None, + how="right", + return_index=True, + return_input=False, + ) + assert np.sum(pd.isna(b["index"].values)) == 0 + b = bioframe.overlap( + df2, + df1, + on=None, + how="right", + return_index=True, + return_input=False, + ) + assert np.sum(pd.isna(b["index"].values)) == 0 + + ### Two adjacent point should not overlap with each other + df1 = pd.DataFrame( + [ + ['chr1', 1, 1] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + df2 = pd.DataFrame( + [ + ['chr1', 2, 2] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + b = bioframe.overlap( + df1, + df2, + on=None, + how="left", + return_index=True, + return_input=False, + ) + assert np.sum(pd.isna(b["index_"].values)) == 1 + b = bioframe.overlap( + df2, + df1, + on=None, + how="left", + return_index=True, + return_input=False, + ) + assert np.sum(pd.isna(b["index_"].values)) == 1 + + ### test keep_order and NA handling df1 = pd.DataFrame( [ From 424d3a1609f960612591bcf04a2e3a983874d99c Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Fri, 2 Feb 2024 13:42:38 -0800 Subject: [PATCH 03/18] Linting errors fixed --- bioframe/core/arrops.py | 5 ++--- tests/test_ops.py | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/bioframe/core/arrops.py b/bioframe/core/arrops.py index 5c94913..8fab73d 100644 --- a/bioframe/core/arrops.py +++ b/bioframe/core/arrops.py @@ -267,7 +267,7 @@ def _overlap_intervals_legacy(starts1, ends1, starts2, ends2, closed=False, sort def get_pseudo_segment(starts, ends): """ Get pseudo-segment for overlapping intervals. - + Parameters ---------- starts, ends : numpy.ndarray @@ -275,8 +275,7 @@ def get_pseudo_segment(starts, ends): Returns ------- pseudo_ends : numpy.ndarray - An array of pseudo-ends for overlapping intervals. - + An array of pseudo-ends for overlapping intervals. """ pseudo_ends = ends.copy() pseudo_ends[ends == starts] += 1 diff --git a/tests/test_ops.py b/tests/test_ops.py index c53c128..f13bc2e 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -431,14 +431,14 @@ def test_overlap(): df1 = pd.DataFrame( [ ['chr1', 1, 1] - ], + ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) df2 = pd.DataFrame( [ ['chr1', 1, 2] - ], + ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) @@ -484,14 +484,14 @@ def test_overlap(): df1 = pd.DataFrame( [ ['chr1', 1, 1] - ], + ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) df2 = pd.DataFrame( [ ['chr1', 2, 2] - ], + ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) From 0203910f7e561e0f84205ef7b1e8421e1c26150d Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Fri, 2 Feb 2024 17:34:09 -0800 Subject: [PATCH 04/18] test cased for subtractions added --- tests/test_ops.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/test_ops.py b/tests/test_ops.py index f13bc2e..3cc2612 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -1491,6 +1491,73 @@ def test_subtract(): .sort_values(["chrom", "start", "end"]) .reset_index(drop=True) ) + pd.testing.assert_frame_equal( + df_result.astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}), + bioframe.subtract(df1, df2) + .sort_values(["chrom", "start", "end"]) + .reset_index(drop=True), + ) + + # Test the case when substraction from point bioframe + df1 = pd.DataFrame( + [ + ['chr1', 1, 1] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + df2 = pd.DataFrame( + [ + ['chr1', 0, 2] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + df_result = ( + pd.DataFrame( + [ + ["chr1", 0, 1], + ["chr1", 1, 2], + ], + columns=["chrom", "start", "end"], + ) + .sort_values(["chrom", "start", "end"]) + .reset_index(drop=True) + ) + + pd.testing.assert_frame_equal( + df_result.astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}), + bioframe.subtract(df1, df2) + .sort_values(["chrom", "start", "end"]) + .reset_index(drop=True), + ) + + # Test the case when substraction from point is at the beginning bioframe + + df1 = pd.DataFrame( + [ + ['chr1', 1, 1] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + df2 = pd.DataFrame( + [ + ['chr1', 1, 2] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + df_result = ( + pd.DataFrame( + [ + ["chr1", 1, 2], + ], + columns=["chrom", "start", "end"], + ) + .sort_values(["chrom", "start", "end"]) + .reset_index(drop=True) + ) pd.testing.assert_frame_equal( df_result.astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}), From af63ad4664929b1bdb3c9e8a8480e829750d6443 Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Fri, 2 Feb 2024 17:44:13 -0800 Subject: [PATCH 05/18] typo in test cases --- tests/test_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_ops.py b/tests/test_ops.py index 3cc2612..b73ef43 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -1527,7 +1527,7 @@ def test_subtract(): pd.testing.assert_frame_equal( df_result.astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}), - bioframe.subtract(df1, df2) + bioframe.subtract(df2, df1) .sort_values(["chrom", "start", "end"]) .reset_index(drop=True), ) @@ -1561,7 +1561,7 @@ def test_subtract(): pd.testing.assert_frame_equal( df_result.astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}), - bioframe.subtract(df1, df2) + bioframe.subtract(df2, df1) .sort_values(["chrom", "start", "end"]) .reset_index(drop=True), ) From a6e124dc02c34daecd1be4674187bfdc8e314b88 Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Fri, 2 Feb 2024 17:48:08 -0800 Subject: [PATCH 06/18] linting errors fixed --- tests/test_ops.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_ops.py b/tests/test_ops.py index b73ef43..9760c66 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -1502,14 +1502,14 @@ def test_subtract(): df1 = pd.DataFrame( [ ['chr1', 1, 1] - ], + ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) df2 = pd.DataFrame( [ ['chr1', 0, 2] - ], + ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) @@ -1524,7 +1524,6 @@ def test_subtract(): .sort_values(["chrom", "start", "end"]) .reset_index(drop=True) ) - pd.testing.assert_frame_equal( df_result.astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}), bioframe.subtract(df2, df1) @@ -1537,14 +1536,14 @@ def test_subtract(): df1 = pd.DataFrame( [ ['chr1', 1, 1] - ], + ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) df2 = pd.DataFrame( [ ['chr1', 1, 2] - ], + ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) From fad2b306d2f13287ae8e0e50e6262493c0932349 Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Thu, 8 Feb 2024 19:12:31 -0800 Subject: [PATCH 07/18] function name updated --- bioframe/core/arrops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bioframe/core/arrops.py b/bioframe/core/arrops.py index 8fab73d..755454d 100644 --- a/bioframe/core/arrops.py +++ b/bioframe/core/arrops.py @@ -264,7 +264,7 @@ def _overlap_intervals_legacy(starts1, ends1, starts2, ends2, closed=False, sort return overlap_ids -def get_pseudo_segment(starts, ends): +def _convert_points_to_len1_segments(starts, ends): """ Get pseudo-segment for overlapping intervals. @@ -312,11 +312,11 @@ def overlap_intervals(starts1, ends1, starts2, ends2, closed=False, sort=False): starts1 = np.asarray(starts1) ends1 = np.asarray(ends1) - starts1, ends1 = get_pseudo_segment(starts1, ends1) + starts1, ends1 = _convert_points_to_len1_segments(starts1, ends1) starts2 = np.asarray(starts2) ends2 = np.asarray(ends2) - starts2, ends2 = get_pseudo_segment(starts2, ends2) + starts2, ends2 = _convert_points_to_len1_segments(starts2, ends2) # Concatenate intervals lists n1 = len(starts1) From e6df7563161658ccc3a15da72f1993760dde9a3c Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Thu, 8 Feb 2024 19:26:28 -0800 Subject: [PATCH 08/18] test case added to check overlap between point and segment adjacent to eachother --- tests/test_ops.py | 76 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tests/test_ops.py b/tests/test_ops.py index 9760c66..f29f35a 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -514,6 +514,78 @@ def test_overlap(): ) assert np.sum(pd.isna(b["index_"].values)) == 1 + ### Point adjacent to the end of the segment should not + ### overlap with the segment + df1 = pd.DataFrame( + [ + ['chr1', 1, 2] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + df2 = pd.DataFrame( + [ + ['chr1', 2, 2] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + b = bioframe.overlap( + df1, + df2, + on=None, + how="left", + return_index=True, + return_input=False, + ) + assert np.sum(pd.isna(b["index_"].values)) == 1 + + b = bioframe.overlap( + df2, + df1, + on=None, + how="left", + return_index=True, + return_input=False, + ) + assert np.sum(pd.isna(b["index_"].values)) == 1 + + ### Point adjacent to the start of the segment should + ### overlap with the segment + df1 = pd.DataFrame( + [ + ['chr1', 1, 1] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + df2 = pd.DataFrame( + [ + ['chr1', 1, 2] + ], + columns=['chrom','start','end'] + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + b = bioframe.overlap( + df1, + df2, + on=None, + how="left", + return_index=True, + return_input=False, + ) + assert np.sum(pd.isna(b["index_"].values)) == 0 + + b = bioframe.overlap( + df2, + df1, + on=None, + how="left", + return_index=True, + return_input=False, + ) + assert np.sum(pd.isna(b["index_"].values)) == 0 + ### test keep_order and NA handling df1 = pd.DataFrame( @@ -2043,3 +2115,7 @@ def test_sort_bedframe(): assert ( df.dtypes == bioframe.sort_bedframe(df, view_df, view_name_col="fruit").dtypes ).all() + + +if __name__ == '__main__': + test_overlap() \ No newline at end of file From e00becf34f1d61b928856e87cfd79af22a168a71 Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Thu, 8 Feb 2024 19:33:10 -0800 Subject: [PATCH 09/18] lint fixed --- tests/test_ops.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_ops.py b/tests/test_ops.py index f29f35a..138432c 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -550,7 +550,7 @@ def test_overlap(): ) assert np.sum(pd.isna(b["index_"].values)) == 1 - ### Point adjacent to the start of the segment should + ### Point adjacent to the start of the segment should ### overlap with the segment df1 = pd.DataFrame( [ @@ -2115,7 +2115,3 @@ def test_sort_bedframe(): assert ( df.dtypes == bioframe.sort_bedframe(df, view_df, view_name_col="fruit").dtypes ).all() - - -if __name__ == '__main__': - test_overlap() \ No newline at end of file From bf4d6c0a5b7e183b2bc7618a4e15477ba878714a Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Thu, 8 Feb 2024 20:14:21 -0800 Subject: [PATCH 10/18] comments added for reference : --- bioframe/core/arrops.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bioframe/core/arrops.py b/bioframe/core/arrops.py index 755454d..3db146b 100644 --- a/bioframe/core/arrops.py +++ b/bioframe/core/arrops.py @@ -312,10 +312,14 @@ def overlap_intervals(starts1, ends1, starts2, ends2, closed=False, sort=False): starts1 = np.asarray(starts1) ends1 = np.asarray(ends1) + ### Convert single points to length-1 segments to handle + ### expected behaviour semi-open intervals starts1, ends1 = _convert_points_to_len1_segments(starts1, ends1) starts2 = np.asarray(starts2) ends2 = np.asarray(ends2) + ### Convert single points to length-1 segments to handle + ### expected behaviour semi-open intervals starts2, ends2 = _convert_points_to_len1_segments(starts2, ends2) # Concatenate intervals lists From 3893ae5c6fcca5d249ab1457312d541ab4aeb7da Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Thu, 8 Feb 2024 20:15:16 -0800 Subject: [PATCH 11/18] intended usecase/problem for point to len1 segment conversion added in doc --- docs/guide-definitions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guide-definitions.rst b/docs/guide-definitions.rst index addc610..8119ff4 100644 --- a/docs/guide-definitions.rst +++ b/docs/guide-definitions.rst @@ -7,7 +7,7 @@ Interval: - An *interval* is a tuple of integers (start, end) with start <= end. - Coordinates are assumed to be 0-based and intervals half-open (1-based ends) i.e. [start, end). - An interval has a *length* equal to (end - start). - - A special case where start and end are the same, i.e. [X, X), is interpreted as a *point* (aka an *empty interval*, i.e. an edge between 1-bp bins). A point has zero length. + - A special case where start and end are the same, i.e. [X, X), is interpreted as a *point* (aka an *empty interval*, i.e. an edge between 1-bp bins). A point has zero length. For consistency of the operations such as subtract, overlap, etc. point intervals are treated as len1 segments. - Negative coordinates are permissible for both ends of an interval. Properties of a pair of intervals: From 42251765095c5e8adf219ad4df7e800a83d9d4ab Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Tue, 13 Feb 2024 16:32:08 -0800 Subject: [PATCH 12/18] comments removed at function call and comment corrected in function defination --- bioframe/core/arrops.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/bioframe/core/arrops.py b/bioframe/core/arrops.py index 3db146b..ac84daf 100644 --- a/bioframe/core/arrops.py +++ b/bioframe/core/arrops.py @@ -266,8 +266,9 @@ def _overlap_intervals_legacy(starts1, ends1, starts2, ends2, closed=False, sort def _convert_points_to_len1_segments(starts, ends): """ - Get pseudo-segment for overlapping intervals. - + Convert points to len1 segments for internal use in overlap(). + This enables desired overlap behavior for points and preserves + behavior for semi-open intervals of len>=1. Parameters ---------- starts, ends : numpy.ndarray @@ -312,14 +313,10 @@ def overlap_intervals(starts1, ends1, starts2, ends2, closed=False, sort=False): starts1 = np.asarray(starts1) ends1 = np.asarray(ends1) - ### Convert single points to length-1 segments to handle - ### expected behaviour semi-open intervals starts1, ends1 = _convert_points_to_len1_segments(starts1, ends1) starts2 = np.asarray(starts2) ends2 = np.asarray(ends2) - ### Convert single points to length-1 segments to handle - ### expected behaviour semi-open intervals starts2, ends2 = _convert_points_to_len1_segments(starts2, ends2) # Concatenate intervals lists From defb24cf279a2dba12b09a47de845d902ec2c63b Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Tue, 13 Feb 2024 16:38:25 -0800 Subject: [PATCH 13/18] doc modification reverted --- docs/guide-definitions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guide-definitions.rst b/docs/guide-definitions.rst index 8119ff4..addc610 100644 --- a/docs/guide-definitions.rst +++ b/docs/guide-definitions.rst @@ -7,7 +7,7 @@ Interval: - An *interval* is a tuple of integers (start, end) with start <= end. - Coordinates are assumed to be 0-based and intervals half-open (1-based ends) i.e. [start, end). - An interval has a *length* equal to (end - start). - - A special case where start and end are the same, i.e. [X, X), is interpreted as a *point* (aka an *empty interval*, i.e. an edge between 1-bp bins). A point has zero length. For consistency of the operations such as subtract, overlap, etc. point intervals are treated as len1 segments. + - A special case where start and end are the same, i.e. [X, X), is interpreted as a *point* (aka an *empty interval*, i.e. an edge between 1-bp bins). A point has zero length. - Negative coordinates are permissible for both ends of an interval. Properties of a pair of intervals: From 58d071d6a180568bcbea4ebc1a912576d8a0e910 Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Tue, 13 Feb 2024 16:41:36 -0800 Subject: [PATCH 14/18] lint corrected --- bioframe/core/arrops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bioframe/core/arrops.py b/bioframe/core/arrops.py index ac84daf..01933ce 100644 --- a/bioframe/core/arrops.py +++ b/bioframe/core/arrops.py @@ -266,8 +266,8 @@ def _overlap_intervals_legacy(starts1, ends1, starts2, ends2, closed=False, sort def _convert_points_to_len1_segments(starts, ends): """ - Convert points to len1 segments for internal use in overlap(). - This enables desired overlap behavior for points and preserves + Convert points to len1 segments for internal use in overlap(). + This enables desired overlap behavior for points and preserves behavior for semi-open intervals of len>=1. Parameters ---------- From 11c3352fc7210f0150261523195e78455d792c96 Mon Sep 17 00:00:00 2001 From: smitkadvani Date: Tue, 13 Feb 2024 17:48:12 -0800 Subject: [PATCH 15/18] readable variable names used --- tests/test_ops.py | 63 ++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/tests/test_ops.py b/tests/test_ops.py index 138432c..e473970 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -428,14 +428,14 @@ def test_overlap(): assert len(b) == 3 ### test overlap with point and segment data - df1 = pd.DataFrame( + df_point1 = pd.DataFrame( [ ['chr1', 1, 1] ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) - df2 = pd.DataFrame( + df_segment12 = pd.DataFrame( [ ['chr1', 1, 2] ], @@ -443,17 +443,19 @@ def test_overlap(): ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) b = bioframe.overlap( - df1, - df2, + df_point1, + df_segment12, on=None, how="left", return_index=True, return_input=False, ) assert np.sum(pd.isna(b["index_"].values)) == 0 + + ### test for changed order of input point and segment b = bioframe.overlap( - df2, - df1, + df_segment12, + df_point1, on=None, how="left", return_index=True, @@ -461,18 +463,21 @@ def test_overlap(): ) assert np.sum(pd.isna(b["index_"].values)) == 0 + ### test for overlap with point and segment with right method b = bioframe.overlap( - df1, - df2, + df_point1, + df_segment12, on=None, how="right", return_index=True, return_input=False, ) assert np.sum(pd.isna(b["index"].values)) == 0 + + ### test for swapped order of input point and segment b = bioframe.overlap( - df2, - df1, + df_segment12, + df_point1, on=None, how="right", return_index=True, @@ -481,14 +486,14 @@ def test_overlap(): assert np.sum(pd.isna(b["index"].values)) == 0 ### Two adjacent point should not overlap with each other - df1 = pd.DataFrame( + df_point1 = pd.DataFrame( [ ['chr1', 1, 1] ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) - df2 = pd.DataFrame( + df_point2 = pd.DataFrame( [ ['chr1', 2, 2] ], @@ -496,17 +501,19 @@ def test_overlap(): ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) b = bioframe.overlap( - df1, - df2, + df_point1, + df_point2, on=None, how="left", return_index=True, return_input=False, ) assert np.sum(pd.isna(b["index_"].values)) == 1 + + ### test for changed order of input point b = bioframe.overlap( - df2, - df1, + df_point2, + df_point1, on=None, how="left", return_index=True, @@ -516,14 +523,14 @@ def test_overlap(): ### Point adjacent to the end of the segment should not ### overlap with the segment - df1 = pd.DataFrame( + df_segment12 = pd.DataFrame( [ ['chr1', 1, 2] ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) - df2 = pd.DataFrame( + df_point2 = pd.DataFrame( [ ['chr1', 2, 2] ], @@ -531,8 +538,8 @@ def test_overlap(): ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) b = bioframe.overlap( - df1, - df2, + df_segment12, + df_point2, on=None, how="left", return_index=True, @@ -541,8 +548,8 @@ def test_overlap(): assert np.sum(pd.isna(b["index_"].values)) == 1 b = bioframe.overlap( - df2, - df1, + df_point2, + df_segment12, on=None, how="left", return_index=True, @@ -552,14 +559,14 @@ def test_overlap(): ### Point adjacent to the start of the segment should ### overlap with the segment - df1 = pd.DataFrame( + df_point1 = pd.DataFrame( [ ['chr1', 1, 1] ], columns=['chrom','start','end'] ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) - df2 = pd.DataFrame( + df_segment12 = pd.DataFrame( [ ['chr1', 1, 2] ], @@ -567,8 +574,8 @@ def test_overlap(): ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) b = bioframe.overlap( - df1, - df2, + df_point1, + df_segment12, on=None, how="left", return_index=True, @@ -577,8 +584,8 @@ def test_overlap(): assert np.sum(pd.isna(b["index_"].values)) == 0 b = bioframe.overlap( - df2, - df1, + df_segment12, + df_point1, on=None, how="left", return_index=True, From 36a973872ba346ef5c460bcac6054c3d5b369322 Mon Sep 17 00:00:00 2001 From: dns-smitk <162182212+dns-smitk@users.noreply.github.com> Date: Fri, 15 Mar 2024 09:29:40 -0500 Subject: [PATCH 16/18] test cases are more readable --- tests/test_ops.py | 128 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 33 deletions(-) diff --git a/tests/test_ops.py b/tests/test_ops.py index e473970..d516d31 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -8,6 +8,7 @@ import bioframe.core.checks as checks from bioframe.core.construction import make_viewframe + # import pyranges as pr # def bioframe_to_pyranges(df): @@ -433,7 +434,7 @@ def test_overlap(): ['chr1', 1, 1] ], columns=['chrom','start','end'] - ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + ).astype({ "chrom": "object", "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) df_segment12 = pd.DataFrame( [ @@ -447,10 +448,17 @@ def test_overlap(): df_segment12, on=None, how="left", - return_index=True, - return_input=False, + return_index=False, + return_input=True, ) - assert np.sum(pd.isna(b["index_"].values)) == 0 + df_expected = pd.DataFrame([ + ['chr1', 1, 1, None, pd.NA, pd.NA]], + columns=['chrom', 'start', + 'end', 'chrom_', 'start_', 'end_']).astype( + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) + pd.testing.assert_frame_equal(df_expected, b) + ### test for changed order of input point and segment b = bioframe.overlap( @@ -458,10 +466,17 @@ def test_overlap(): df_point1, on=None, how="left", - return_index=True, - return_input=False, + return_index=False, + return_input=True, ) - assert np.sum(pd.isna(b["index_"].values)) == 0 + + df_expected = pd.DataFrame([ + ['chr1', 1, 2, 'chr1', 1, 1]], + columns=['chrom', 'start', + 'end', 'chrom_', 'start_', 'end_']).astype( + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) + pd.testing.assert_frame_equal(df_expected, b) ### test for overlap with point and segment with right method b = bioframe.overlap( @@ -469,10 +484,16 @@ def test_overlap(): df_segment12, on=None, how="right", - return_index=True, - return_input=False, + return_index=False, + return_input=True, ) - assert np.sum(pd.isna(b["index"].values)) == 0 + df_expected = pd.DataFrame([ + [None, pd.NA, pd.NA, 'chr1', 1, 2]], + columns=['chrom', 'start', + 'end', 'chrom_', 'start_', 'end_']).astype( + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) + pd.testing.assert_frame_equal(df_expected, b) ### test for swapped order of input point and segment b = bioframe.overlap( @@ -480,10 +501,16 @@ def test_overlap(): df_point1, on=None, how="right", - return_index=True, - return_input=False, + return_index=False, + return_input=True, ) - assert np.sum(pd.isna(b["index"].values)) == 0 + df_expected = pd.DataFrame([ + ['chr1', 1, 2,'chr1', 1, 1]], + columns=['chrom', 'start', + 'end', 'chrom_', 'start_', 'end_']).astype( + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) + pd.testing.assert_frame_equal(df_expected, b) ### Two adjacent point should not overlap with each other df_point1 = pd.DataFrame( @@ -505,10 +532,16 @@ def test_overlap(): df_point2, on=None, how="left", - return_index=True, - return_input=False, + return_index=False, + return_input=True, ) - assert np.sum(pd.isna(b["index_"].values)) == 1 + df_expected = pd.DataFrame([ + ['chr1', 1, 1, None, pd.NA, pd.NA]], + columns=['chrom', 'start', + 'end', 'chrom_', 'start_', 'end_']).astype( + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) + pd.testing.assert_frame_equal(df_expected, b) ### test for changed order of input point b = bioframe.overlap( @@ -516,10 +549,16 @@ def test_overlap(): df_point1, on=None, how="left", - return_index=True, - return_input=False, + return_index=False, + return_input=True, ) - assert np.sum(pd.isna(b["index_"].values)) == 1 + df_expected = pd.DataFrame([ + ['chr1', 2, 2, None, pd.NA, pd.NA]], + columns=['chrom', 'start', + 'end', 'chrom_', 'start_', 'end_']).astype( + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) + pd.testing.assert_frame_equal(df_expected, b) ### Point adjacent to the end of the segment should not ### overlap with the segment @@ -542,20 +581,32 @@ def test_overlap(): df_point2, on=None, how="left", - return_index=True, - return_input=False, + return_index=False, + return_input=True, ) - assert np.sum(pd.isna(b["index_"].values)) == 1 + df_expected = pd.DataFrame([ + ['chr1', 1, 2, None, pd.NA, pd.NA]], + columns=['chrom', 'start', + 'end', 'chrom_', 'start_', 'end_']).astype( + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) + pd.testing.assert_frame_equal(df_expected, b) b = bioframe.overlap( df_point2, df_segment12, on=None, how="left", - return_index=True, - return_input=False, + return_index=False, + return_input=True, ) - assert np.sum(pd.isna(b["index_"].values)) == 1 + df_expected = pd.DataFrame([ + ['chr1', 2, 2, None, pd.NA, pd.NA]], + columns=['chrom', 'start', + 'end','chrom_', 'start_', 'end_']).astype( + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) + pd.testing.assert_frame_equal(df_expected, b) ### Point adjacent to the start of the segment should ### overlap with the segment @@ -578,21 +629,32 @@ def test_overlap(): df_segment12, on=None, how="left", - return_index=True, - return_input=False, + return_index=False, + return_input=True, ) - assert np.sum(pd.isna(b["index_"].values)) == 0 + df_expected = pd.DataFrame([ + ['chr1', 1, 1, None, pd.NA, pd.NA]], + columns=['chrom', 'start', + 'end', 'chrom_', 'start_', 'end_']).astype( + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) + pd.testing.assert_frame_equal(df_expected, b) b = bioframe.overlap( df_segment12, df_point1, on=None, how="left", - return_index=True, - return_input=False, - ) - assert np.sum(pd.isna(b["index_"].values)) == 0 - + return_index=False, + return_input=True, + ) + df_expected = pd.DataFrame([ + ['chr1', 1, 2, 'chr1', 1, 1]], + columns=['chrom', 'start', + 'end', 'chrom_', 'start_', 'end_']).astype( + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) + pd.testing.assert_frame_equal(df_expected, b) ### test keep_order and NA handling df1 = pd.DataFrame( From fd3574bb5e7e1dbdc8d107a8bd97c18201d5b6c2 Mon Sep 17 00:00:00 2001 From: dns-smitk <162182212+dns-smitk@users.noreply.github.com> Date: Fri, 15 Mar 2024 09:48:11 -0500 Subject: [PATCH 17/18] lint fixed: --- tests/test_ops.py | 62 +++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/tests/test_ops.py b/tests/test_ops.py index d516d31..a5cfcd7 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -8,7 +8,6 @@ import bioframe.core.checks as checks from bioframe.core.construction import make_viewframe - # import pyranges as pr # def bioframe_to_pyranges(df): @@ -452,14 +451,13 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - ['chr1', 1, 1, None, pd.NA, pd.NA]], - columns=['chrom', 'start', + ['chr1', 1, 1, None, pd.NA, pd.NA]], + columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( - {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) pd.testing.assert_frame_equal(df_expected, b) - ### test for changed order of input point and segment b = bioframe.overlap( df_segment12, @@ -471,10 +469,10 @@ def test_overlap(): ) df_expected = pd.DataFrame([ - ['chr1', 1, 2, 'chr1', 1, 1]], - columns=['chrom', 'start', + ['chr1', 1, 2, 'chr1', 1, 1]], + columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( - {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) pd.testing.assert_frame_equal(df_expected, b) @@ -488,10 +486,10 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - [None, pd.NA, pd.NA, 'chr1', 1, 2]], - columns=['chrom', 'start', + [None, pd.NA, pd.NA, 'chr1', 1, 2]], + columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( - {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) pd.testing.assert_frame_equal(df_expected, b) @@ -505,10 +503,10 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - ['chr1', 1, 2,'chr1', 1, 1]], - columns=['chrom', 'start', + ['chr1', 1, 2,'chr1', 1, 1]], + columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( - {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) pd.testing.assert_frame_equal(df_expected, b) @@ -536,10 +534,10 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - ['chr1', 1, 1, None, pd.NA, pd.NA]], - columns=['chrom', 'start', + ['chr1', 1, 1, None, pd.NA, pd.NA]], + columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( - {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) pd.testing.assert_frame_equal(df_expected, b) @@ -553,10 +551,10 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - ['chr1', 2, 2, None, pd.NA, pd.NA]], - columns=['chrom', 'start', + ['chr1', 2, 2, None, pd.NA, pd.NA]], + columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( - {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) pd.testing.assert_frame_equal(df_expected, b) @@ -585,10 +583,10 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - ['chr1', 1, 2, None, pd.NA, pd.NA]], - columns=['chrom', 'start', + ['chr1', 1, 2, None, pd.NA, pd.NA]], + columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( - {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) pd.testing.assert_frame_equal(df_expected, b) @@ -601,10 +599,10 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - ['chr1', 2, 2, None, pd.NA, pd.NA]], - columns=['chrom', 'start', + ['chr1', 2, 2, None, pd.NA, pd.NA]], + columns=['chrom', 'start', 'end','chrom_', 'start_', 'end_']).astype( - {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) pd.testing.assert_frame_equal(df_expected, b) @@ -633,10 +631,10 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - ['chr1', 1, 1, None, pd.NA, pd.NA]], - columns=['chrom', 'start', + ['chr1', 1, 1, None, pd.NA, pd.NA]], + columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( - {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) pd.testing.assert_frame_equal(df_expected, b) @@ -649,10 +647,10 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - ['chr1', 1, 2, 'chr1', 1, 1]], - columns=['chrom', 'start', + ['chr1', 1, 2, 'chr1', 1, 1]], + columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( - {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), + {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), "start_": pd.Int64Dtype(), "end_": pd.Int64Dtype()}) pd.testing.assert_frame_equal(df_expected, b) From ab80bf74a164fb869ab597115d3836cd4c0d7736 Mon Sep 17 00:00:00 2001 From: dns-smitk Date: Mon, 18 Mar 2024 02:03:50 -0500 Subject: [PATCH 18/18] test case expected result corrected --- tests/test_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_ops.py b/tests/test_ops.py index a5cfcd7..d132157 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -451,7 +451,7 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - ['chr1', 1, 1, None, pd.NA, pd.NA]], + ['chr1', 1, 1, 'chr1', 1, 2]], columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), @@ -486,7 +486,7 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - [None, pd.NA, pd.NA, 'chr1', 1, 2]], + ['chr1', 1, 1, 'chr1', 1, 2]], columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(), @@ -631,7 +631,7 @@ def test_overlap(): return_input=True, ) df_expected = pd.DataFrame([ - ['chr1', 1, 1, None, pd.NA, pd.NA]], + ['chr1', 1, 1, 'chr1', 1, 2]], columns=['chrom', 'start', 'end', 'chrom_', 'start_', 'end_']).astype( {"start": pd.Int64Dtype(), "end": pd.Int64Dtype(),