From 2a8a66bfc2cfa1cd8063666d00e89940ed3dc11e Mon Sep 17 00:00:00 2001 From: Justin Xu Date: Tue, 25 Jun 2024 07:51:47 +0100 Subject: [PATCH 1/8] First commit for event index --- src/aces/aggregate.py | 62 +++++++++++++++++++++++-------------- src/aces/extract_subtree.py | 31 +++++++++++-------- src/aces/predicates.py | 52 ++++++++++++++++++------------- src/aces/types.py | 7 +++++ tests/test_e2e.py | 8 +++++ 5 files changed, 102 insertions(+), 58 deletions(-) diff --git a/src/aces/aggregate.py b/src/aces/aggregate.py index 86dcbb9..769de1b 100644 --- a/src/aces/aggregate.py +++ b/src/aces/aggregate.py @@ -4,7 +4,12 @@ import polars as pl -from .types import PRED_CNT_TYPE, TemporalWindowBounds, ToEventWindowBounds +from .types import ( + EVENT_INDEX_COLUMN, + PRED_CNT_TYPE, + TemporalWindowBounds, + ToEventWindowBounds, +) def aggregate_temporal_window( @@ -79,7 +84,7 @@ def aggregate_temporal_window( ... "is_C": [1, 1, 0, 0, 1, 0], ... }) >>> aggregate_temporal_window(df, TemporalWindowBounds( - ... True, timedelta(days=7), True, None)) + ... True, timedelta(days=7), True, None)).drop("_EVENT_INDEX") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -94,7 +99,7 @@ def aggregate_temporal_window( │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-10 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ >>> aggregate_temporal_window(df, ( - ... True, timedelta(days=1), True, timedelta(days=0))) + ... True, timedelta(days=1), True, timedelta(days=0))).drop("_EVENT_INDEX") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -109,7 +114,7 @@ def aggregate_temporal_window( │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ >>> aggregate_temporal_window(df, ( - ... True, timedelta(days=1), False, timedelta(days=0))) + ... True, timedelta(days=1), False, timedelta(days=0))).drop("_EVENT_INDEX") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -124,7 +129,7 @@ def aggregate_temporal_window( │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ >>> aggregate_temporal_window(df, ( - ... False, timedelta(days=1), False, timedelta(days=0))) + ... False, timedelta(days=1), False, timedelta(days=0))).drop("_EVENT_INDEX") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -139,7 +144,7 @@ def aggregate_temporal_window( │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ >>> aggregate_temporal_window(df, ( - ... False, timedelta(days=-1), False, timedelta(days=0))) + ... False, timedelta(days=-1), False, timedelta(days=0))).drop("_EVENT_INDEX") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -154,7 +159,7 @@ def aggregate_temporal_window( │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-02 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ >>> aggregate_temporal_window(df, ( - ... False, timedelta(hours=12), False, timedelta(hours=12))) + ... False, timedelta(hours=12), False, timedelta(hours=12))).drop("_EVENT_INDEX") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -173,7 +178,7 @@ def aggregate_temporal_window( >>> # the earliest event in the aggregation window, regardless of whether that is earlier than the >>> # timestamp of the row. >>> aggregate_temporal_window(df, ( - ... False, timedelta(days=-1), True, timedelta(days=1))) + ... False, timedelta(days=-1), True, timedelta(days=1))).drop("_EVENT_INDEX") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -188,7 +193,7 @@ def aggregate_temporal_window( │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ >>> aggregate_temporal_window(df, ( - ... True, timedelta(days=-1), False, timedelta(days=1))) + ... True, timedelta(days=-1), False, timedelta(days=1))).drop("_EVENT_INDEX") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -206,7 +211,9 @@ def aggregate_temporal_window( if not isinstance(endpoint_expr, TemporalWindowBounds): endpoint_expr = TemporalWindowBounds(*endpoint_expr) - predicate_cols = [c for c in predicates_df.columns if c not in {"subject_id", "timestamp"}] + predicate_cols = [ + c for c in predicates_df.columns if c not in {"subject_id", "timestamp", EVENT_INDEX_COLUMN} + ] return ( predicates_df.rolling( @@ -216,6 +223,7 @@ def aggregate_temporal_window( ) .agg( *[pl.col(c).sum().cast(PRED_CNT_TYPE).alias(c) for c in predicate_cols], + pl.col(EVENT_INDEX_COLUMN).max(), ) .sort(by=["subject_id", "timestamp"]) .select( @@ -226,6 +234,7 @@ def aggregate_temporal_window( "timestamp_at_end" ), *predicate_cols, + EVENT_INDEX_COLUMN, ) ) @@ -302,11 +311,13 @@ def aggregate_event_bound_window( ... datetime(year=1989, month=12, day=8, hour=16, minute=22), ... datetime(year=1989, month=12, day=10, hour=3, minute=7), # HAS EVENT BOUND ... ], + ... "_EVENT_INDEX": [0, 1, 2, 0, 1, 2, 3, 4], ... "is_A": [1, 0, 1, 1, 1, 1, 0, 0], ... "is_B": [0, 1, 0, 1, 0, 1, 1, 1], ... "is_C": [0, 1, 0, 0, 0, 1, 0, 1], ... }) - >>> aggregate_event_bound_window(df, ToEventWindowBounds(True, "is_C", True, None)) + >>> aggregate_event_bound_window(df, ToEventWindowBounds(True, "is_C", True, None)).drop( + ... "_EVENT_INDEX") shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -322,7 +333,8 @@ def aggregate_event_bound_window( │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 │ │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ - >>> aggregate_event_bound_window(df, ToEventWindowBounds(True, "is_C", False, None)) + >>> aggregate_event_bound_window(df, ToEventWindowBounds(True, "is_C", False, None)).drop( + ... "_EVENT_INDEX") shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -338,7 +350,8 @@ def aggregate_event_bound_window( │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 │ │ 2 ┆ 1989-12-10 03:07:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ - >>> aggregate_event_bound_window(df, ToEventWindowBounds(False, "is_C", True, None)) + >>> aggregate_event_bound_window(df, ToEventWindowBounds(False, "is_C", True, None)).drop( + ... "_EVENT_INDEX") shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -354,7 +367,8 @@ def aggregate_event_bound_window( │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ - >>> aggregate_event_bound_window(df, ToEventWindowBounds(True, "is_C", True, timedelta(days=3))) + >>> aggregate_event_bound_window(df, ToEventWindowBounds( + ... True, "is_C", True, timedelta(days=3))).drop("_EVENT_INDEX") shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -495,6 +509,7 @@ def boolean_expr_bound_sum( ... datetime(year=1989, month=12, day=8, hour=16, minute=22), ... datetime(year=1989, month=12, day=10, hour=3, minute=7), # HAS EVENT BOUND ... ], + ... "_EVENT_INDEX": [0, 1, 2, 0, 1, 2, 3, 4], ... "idx": [0, 1, 2, 3, 4, 5, 6, 7], ... "is_A": [1, 0, 1, 1, 1, 1, 0, 0], ... "is_B": [0, 1, 0, 1, 0, 1, 1, 1], @@ -675,7 +690,7 @@ def boolean_expr_bound_sum( ... "bound_to_row", ... "both", ... offset = timedelta(days=3), - ... ).drop("idx") + ... ).drop(["idx", "_EVENT_INDEX"]) shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -697,7 +712,7 @@ def boolean_expr_bound_sum( ... "bound_to_row", ... "left", ... offset = timedelta(days=3), - ... ).drop("idx") + ... ).drop(["idx", "_EVENT_INDEX"]) shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -719,7 +734,7 @@ def boolean_expr_bound_sum( ... "bound_to_row", ... "none", ... timedelta(days=-3), - ... ).drop("idx") + ... ).drop(["idx", "_EVENT_INDEX"]) shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -741,7 +756,7 @@ def boolean_expr_bound_sum( ... "bound_to_row", ... "right", ... offset = timedelta(days=-3), - ... ).drop("idx") + ... ).drop(["idx", "_EVENT_INDEX"]) shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -763,7 +778,7 @@ def boolean_expr_bound_sum( ... "row_to_bound", ... "both", ... offset = timedelta(days=3), - ... ).drop("idx") + ... ).drop(["idx", "_EVENT_INDEX"]) shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -785,7 +800,7 @@ def boolean_expr_bound_sum( ... "row_to_bound", ... "left", ... offset = timedelta(days=3), - ... ).drop("idx") + ... ).drop(["idx", "_EVENT_INDEX"]) shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -807,7 +822,7 @@ def boolean_expr_bound_sum( ... "row_to_bound", ... "none", ... offset = timedelta(days=-3), - ... ).drop("idx") + ... ).drop(["idx", "_EVENT_INDEX"]) shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -829,7 +844,7 @@ def boolean_expr_bound_sum( ... "row_to_bound", ... "right", ... offset = timedelta(days=-3), - ... ).drop("idx") + ... ).drop(["idx", "_EVENT_INDEX"]) shape: (8, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -891,7 +906,7 @@ def boolean_expr_bound_sum( ), ) - cols = [c for c in df.columns if c not in {"subject_id", "timestamp"}] + cols = [c for c in df.columns if c not in {"subject_id", "timestamp", EVENT_INDEX_COLUMN}] cumsum_cols = {c: pl.col(c).cum_sum().over("subject_id").alias(f"{c}_cumsum_at_row") for c in cols} df = df.with_columns(*cumsum_cols.values()) @@ -1024,4 +1039,5 @@ def agg_offset_fn(c: str) -> pl.Expr: st_timestamp_expr.alias("timestamp_at_start"), end_timestamp_expr.alias("timestamp_at_end"), *(agg_offset_fn(c).cast(PRED_CNT_TYPE, strict=False).fill_null(0).alias(c) for c in cols), + EVENT_INDEX_COLUMN, ) diff --git a/src/aces/extract_subtree.py b/src/aces/extract_subtree.py index 334f197..bfb3592 100644 --- a/src/aces/extract_subtree.py +++ b/src/aces/extract_subtree.py @@ -8,6 +8,7 @@ from .aggregate import aggregate_event_bound_window, aggregate_temporal_window from .constraints import check_constraints +from .types import EVENT_INDEX_COLUMN, LAST_EVENT_INDEX_COLUMN def extract_subtree( @@ -133,6 +134,7 @@ def extract_subtree( ... datetime(year=1999, month=12, day=6, hour=15, minute=17), # Admission ... datetime(year=1999, month=12, day=6, hour=16, minute=22), # Discharge ... ], + ... "_EVENT_INDEX": [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2], ... "is_admission": [0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0], ... "is_discharge": [0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1], ... "is_death": [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], @@ -144,18 +146,18 @@ def extract_subtree( ... .rename({"timestamp": "subtree_anchor_timestamp"}) ... ).select("subject_id", "subtree_anchor_timestamp") >>> print(subtreee_anchor_realizations) - shape: (5, 2) - ┌────────────┬──────────────────────────┐ - │ subject_id ┆ subtree_anchor_timestamp │ - │ --- ┆ --- │ - │ i64 ┆ datetime[μs] │ - ╞════════════╪══════════════════════════╡ - │ 1 ┆ 1989-12-03 13:14:00 │ - │ 1 ┆ 1989-12-23 03:12:00 │ - │ 2 ┆ 1983-12-02 12:03:00 │ - │ 2 ┆ 1989-12-06 15:17:00 │ - │ 3 ┆ 1999-12-06 15:17:00 │ - └────────────┴──────────────────────────┘ + shape: (5, 3) + ┌────────────┬──────────────────────────┬──────────────┐ + │ subject_id ┆ subtree_anchor_timestamp ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ i64 │ + ╞════════════╪══════════════════════════╪══════════════╡ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1 │ + │ 1 ┆ 1989-12-23 03:12:00 ┆ 4 │ + │ 2 ┆ 1983-12-02 12:03:00 ┆ 1 │ + │ 2 ┆ 1989-12-06 15:17:00 ┆ 3 │ + │ 3 ┆ 1999-12-06 15:17:00 ┆ 1 │ + └────────────┴──────────────────────────┴──────────────┘ >>> out = extract_subtree(root, subtreee_anchor_realizations, predicates_df, timedelta(0)) >>> out.select( ... "subject_id", @@ -243,7 +245,9 @@ def extract_subtree( └─────────────────────┴─────────────────────┴──────────────┴──────────────┴──────────┴─────────────┘ """ recursive_results = [] - predicate_cols = [c for c in predicates_df.columns if c not in {"subject_id", "timestamp"}] + predicate_cols = [ + c for c in predicates_df.columns if c not in {"subject_id", "timestamp", EVENT_INDEX_COLUMN} + ] if not subtree.children: return subtree_anchor_realizations @@ -326,6 +330,7 @@ def extract_subtree( pl.lit(child.name).alias("window_name"), "timestamp_at_start", "timestamp_at_end", + pl.col(EVENT_INDEX_COLUMN).alias(LAST_EVENT_INDEX_COLUMN), *predicate_cols, ).alias(f"{child.name}_summary"), ) diff --git a/src/aces/predicates.py b/src/aces/predicates.py index c286650..7976840 100644 --- a/src/aces/predicates.py +++ b/src/aces/predicates.py @@ -10,6 +10,8 @@ from .types import ( ANY_EVENT_COLUMN, END_OF_RECORD_KEY, + EVENT_INDEX_COLUMN, + EVENT_INDEX_TYPE, PRED_CNT_TYPE, START_OF_RECORD_KEY, ) @@ -485,17 +487,17 @@ def get_predicates_df(cfg: TaskExtractorConfig, data_config: DictConfig) -> pl.D ... "path": str(data_path), "standard": "direct", "ts_format": "%m/%d/%Y %H:%M" ... }) ... get_predicates_df(config, data_config) - shape: (4, 7) - ┌────────────┬─────────────────────┬─────┬─────┬───────┬──────────────┬────────────┐ - │ subject_id ┆ timestamp ┆ adm ┆ dis ┆ death ┆ death_or_dis ┆ _ANY_EVENT │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════╪═════╪═══════╪══════════════╪════════════╡ - │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ - │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ - │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ - └────────────┴─────────────────────┴─────┴─────┴───────┴──────────────┴────────────┘ + shape: (4, 8) + ┌────────────┬─────────────────────┬─────┬─────┬───────┬──────────────┬────────────┬──────────────┐ + │ subject_id ┆ timestamp ┆ adm ┆ dis ┆ death ┆ death_or_dis ┆ _ANY_EVENT ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════╪═════╪═══════╪══════════════╪════════════╪══════════════╡ + │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │ + │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │ + │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └────────────┴─────────────────────┴─────┴─────┴───────┴──────────────┴────────────┴──────────────┘ >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f: ... data_path = Path(f.name) ... ( @@ -505,17 +507,17 @@ def get_predicates_df(cfg: TaskExtractorConfig, data_config: DictConfig) -> pl.D ... ) ... data_config = DictConfig({"path": str(data_path), "standard": "direct", "ts_format": None}) ... get_predicates_df(config, data_config) - shape: (4, 7) - ┌────────────┬─────────────────────┬─────┬─────┬───────┬──────────────┬────────────┐ - │ subject_id ┆ timestamp ┆ adm ┆ dis ┆ death ┆ death_or_dis ┆ _ANY_EVENT │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════╪═════╪═══════╪══════════════╪════════════╡ - │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ - │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ - │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ - └────────────┴─────────────────────┴─────┴─────┴───────┴──────────────┴────────────┘ + shape: (4, 8) + ┌────────────┬─────────────────────┬─────┬─────┬───────┬──────────────┬────────────┬──────────────┐ + │ subject_id ┆ timestamp ┆ adm ┆ dis ┆ death ┆ death_or_dis ┆ _ANY_EVENT ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════╪═════╪═══════╪══════════════╪════════════╪══════════════╡ + │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │ + │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │ + │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └────────────┴─────────────────────┴─────┴─────┴───────┴──────────────┴────────────┴──────────────┘ >>> any_event_trigger = EventConfig("_ANY_EVENT") >>> adm_only_predicates = {"adm": PlainPredicateConfig("adm")} >>> st_end_windows = { @@ -636,4 +638,10 @@ def get_predicates_df(cfg: TaskExtractorConfig, data_config: DictConfig) -> pl.D logger.info(f"Added predicate column '{END_OF_RECORD_KEY}'.") predicate_cols += special_predicates + # create a column for event_id + data = data.with_columns(pl.lit(1).alias(EVENT_INDEX_COLUMN)) + data = data.with_columns( + (pl.col(EVENT_INDEX_COLUMN).cum_sum().over("subject_id") - 1).cast(EVENT_INDEX_TYPE) + ) + return data diff --git a/src/aces/types.py b/src/aces/types.py index 20db78c..d74fc0a 100644 --- a/src/aces/types.py +++ b/src/aces/types.py @@ -12,6 +12,9 @@ # The type used for final aggregate counts of predicates. PRED_CNT_TYPE = pl.Int64 +# The type used for event indexing +EVENT_INDEX_TYPE = pl.Int64 + # The key used in the endpoint expression to indicate the window should be aggregated to the record start. START_OF_RECORD_KEY = "_RECORD_START" END_OF_RECORD_KEY = "_RECORD_END" @@ -19,6 +22,10 @@ # The key used to capture the count of events of any kind that occur in a window. ANY_EVENT_COLUMN = "_ANY_EVENT" +# The key used for the event index +EVENT_INDEX_COLUMN = "_EVENT_INDEX" +LAST_EVENT_INDEX_COLUMN = "_LAST_EVENT_INDEX" + @dataclasses.dataclass(order=True) class TemporalWindowBounds: diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 41ce398..763feee 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -129,6 +129,7 @@ "window_name": "input.end", "timestamp_at_start": "01/27/1991 23:32", "timestamp_at_end": "01/28/1991 23:32", + "_LAST_EVENT_INDEX": 15, "admission": 0, "discharge": 0, "death": 0, @@ -139,6 +140,7 @@ "window_name": "input.end", "timestamp_at_start": "06/05/1996 00:32", "timestamp_at_end": "06/06/1996 00:32", + "_LAST_EVENT_INDEX": 7, "admission": 0, "discharge": 0, "death": 0, @@ -151,6 +153,7 @@ "window_name": "input.start", "timestamp_at_start": "12/01/1989 12:03", "timestamp_at_end": "01/28/1991 23:32", + "_LAST_EVENT_INDEX": 15, "admission": 2, "discharge": 1, "death": 0, @@ -161,6 +164,7 @@ "window_name": "input.start", "timestamp_at_start": "03/08/1996 02:24", "timestamp_at_end": "06/06/1996 00:32", + "_LAST_EVENT_INDEX": 7, "admission": 2, "discharge": 1, "death": 0, @@ -173,6 +177,7 @@ "window_name": "gap.end", "timestamp_at_start": "01/27/1991 23:32", "timestamp_at_end": "01/29/1991 23:32", + "_LAST_EVENT_INDEX": 16, "admission": 0, "discharge": 0, "death": 0, @@ -183,6 +188,7 @@ "window_name": "gap.end", "timestamp_at_start": "06/05/1996 00:32", "timestamp_at_end": "06/07/1996 00:32", + "_LAST_EVENT_INDEX": 7, "admission": 0, "discharge": 0, "death": 0, @@ -195,6 +201,7 @@ "window_name": "target.end", "timestamp_at_start": "01/29/1991 23:32", "timestamp_at_end": "01/31/1991 02:15", + "_LAST_EVENT_INDEX": 23, "admission": 0, "discharge": 1, "death": 0, @@ -205,6 +212,7 @@ "window_name": "target.end", "timestamp_at_start": "06/07/1996 00:32", "timestamp_at_end": "06/08/1996 03:00", + "_LAST_EVENT_INDEX": 12, "admission": 0, "discharge": 0, "death": 1, From b28e85b40b7952b974dee8a82e423f2ae20e4334 Mon Sep 17 00:00:00 2001 From: Justin Xu Date: Mon, 22 Jul 2024 02:52:24 +0100 Subject: [PATCH 2/8] Update doctests --- src/aces/aggregate.py | 113 +++++++++++++++++++++-------------------- src/aces/predicates.py | 22 ++++---- 2 files changed, 68 insertions(+), 67 deletions(-) diff --git a/src/aces/aggregate.py b/src/aces/aggregate.py index 769de1b..98c9052 100644 --- a/src/aces/aggregate.py +++ b/src/aces/aggregate.py @@ -82,69 +82,70 @@ def aggregate_temporal_window( ... "is_A": [1, 0, 1, 0, 0, 0], ... "is_B": [0, 1, 0, 1, 1, 0], ... "is_C": [1, 1, 0, 0, 1, 0], + ... "_EVENT_INDEX": [0, 1, 2, 3, 0, 1], ... }) >>> aggregate_temporal_window(df, TemporalWindowBounds( - ... True, timedelta(days=7), True, None)).drop("_EVENT_INDEX") + ... True, timedelta(days=7), True, None)).drop("timestamp") shape: (6, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-08 12:03:00 ┆ 2 ┆ 2 ┆ 2 │ - │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-02 05:17:00 ┆ 1989-12-09 05:17:00 ┆ 1 ┆ 2 ┆ 1 │ - │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-09 12:03:00 ┆ 1 ┆ 1 ┆ 0 │ - │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-06 11:00:00 ┆ 1989-12-13 11:00:00 ┆ 0 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 1989-12-08 13:14:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-10 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-08 12:03:00 ┆ 2 ┆ 2 ┆ 2 ┆ 3 │ + │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-09 05:17:00 ┆ 1 ┆ 2 ┆ 1 ┆ 3 │ + │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-09 12:03:00 ┆ 1 ┆ 1 ┆ 0 ┆ 3 │ + │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-13 11:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-08 13:14:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-10 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> aggregate_temporal_window(df, ( - ... True, timedelta(days=1), True, timedelta(days=0))).drop("_EVENT_INDEX") + ... True, timedelta(days=1), True, timedelta(days=0))).drop("timestamp") shape: (6, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 2 ┆ 1 ┆ 2 │ - │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-02 05:17:00 ┆ 1989-12-03 05:17:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-03 12:03:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-06 11:00:00 ┆ 1989-12-07 11:00:00 ┆ 0 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 1989-12-02 13:14:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 2 ┆ 1 ┆ 2 ┆ 2 │ + │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-03 05:17:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-03 12:03:00 ┆ 1 ┆ 0 ┆ 0 ┆ 2 │ + │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-07 11:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-02 13:14:00 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ + │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> aggregate_temporal_window(df, ( - ... True, timedelta(days=1), False, timedelta(days=0))).drop("_EVENT_INDEX") + ... True, timedelta(days=1), False, timedelta(days=0))).drop("timestamp") shape: (6, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1 ┆ 1 ┆ 2 │ - │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-02 05:17:00 ┆ 1989-12-03 05:17:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-03 12:03:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-06 11:00:00 ┆ 1989-12-07 11:00:00 ┆ 0 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 1989-12-02 13:14:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1 ┆ 1 ┆ 2 ┆ 1 │ + │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-03 05:17:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-03 12:03:00 ┆ 1 ┆ 0 ┆ 0 ┆ 2 │ + │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-07 11:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-02 13:14:00 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ + │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> aggregate_temporal_window(df, ( - ... False, timedelta(days=1), False, timedelta(days=0))).drop("_EVENT_INDEX") + ... False, timedelta(days=1), False, timedelta(days=0))).drop("timestamp") shape: (6, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-02 05:17:00 ┆ 1989-12-03 05:17:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-03 12:03:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-06 11:00:00 ┆ 1989-12-07 11:00:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 1989-12-02 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-03 05:17:00 ┆ 1 ┆ 0 ┆ 0 ┆ 2 │ + │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-03 12:03:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-07 11:00:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-02 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> aggregate_temporal_window(df, ( - ... False, timedelta(days=-1), False, timedelta(days=0))).drop("_EVENT_INDEX") + ... False, timedelta(days=-1), False, timedelta(days=0))).drop("timestamp") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -159,7 +160,7 @@ def aggregate_temporal_window( │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-02 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ >>> aggregate_temporal_window(df, ( - ... False, timedelta(hours=12), False, timedelta(hours=12))).drop("_EVENT_INDEX") + ... False, timedelta(hours=12), False, timedelta(hours=12))).drop("timestamp") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -178,7 +179,7 @@ def aggregate_temporal_window( >>> # the earliest event in the aggregation window, regardless of whether that is earlier than the >>> # timestamp of the row. >>> aggregate_temporal_window(df, ( - ... False, timedelta(days=-1), True, timedelta(days=1))).drop("_EVENT_INDEX") + ... False, timedelta(days=-1), True, timedelta(days=1))).drop("timestamp") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ @@ -193,7 +194,7 @@ def aggregate_temporal_window( │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ >>> aggregate_temporal_window(df, ( - ... True, timedelta(days=-1), False, timedelta(days=1))).drop("_EVENT_INDEX") + ... True, timedelta(days=-1), False, timedelta(days=1))).drop("timestamp") shape: (6, 7) ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ diff --git a/src/aces/predicates.py b/src/aces/predicates.py index 7976840..c66d74b 100644 --- a/src/aces/predicates.py +++ b/src/aces/predicates.py @@ -542,17 +542,17 @@ def get_predicates_df(cfg: TaskExtractorConfig, data_config: DictConfig) -> pl.D ... "path": str(data_path), "standard": "direct", "ts_format": "%m/%d/%Y %H:%M" ... }) ... get_predicates_df(st_end_config, data_config) - shape: (4, 6) - ┌────────────┬─────────────────────┬─────┬────────────┬───────────────┬─────────────┐ - │ subject_id ┆ timestamp ┆ adm ┆ _ANY_EVENT ┆ _RECORD_START ┆ _RECORD_END │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════╪════════════╪═══════════════╪═════════════╡ - │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 1 ┆ 1 ┆ 0 │ - │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ - │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 1 ┆ 1 ┆ 0 │ - │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ - └────────────┴─────────────────────┴─────┴────────────┴───────────────┴─────────────┘ + shape: (4, 7) + ┌────────────┬─────────────────────┬─────┬────────────┬───────────────┬─────────────┬──────────────┐ + │ subject_id ┆ timestamp ┆ adm ┆ _ANY_EVENT ┆ _RECORD_START ┆ _RECORD_END ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════╪════════════╪═══════════════╪═════════════╪══════════════╡ + │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 1 ┆ 1 ┆ 0 ┆ 0 │ + │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 1 │ + │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 1 ┆ 1 ┆ 0 ┆ 0 │ + │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 1 │ + └────────────┴─────────────────────┴─────┴────────────┴───────────────┴─────────────┴──────────────┘ >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f: ... data_path = Path(f.name) ... data.write_csv(data_path) From 69181db1d76d9a17cd32449aa1ce6bdbb02ab526 Mon Sep 17 00:00:00 2001 From: Justin Xu Date: Mon, 22 Jul 2024 08:45:00 +0100 Subject: [PATCH 3/8] Fix rename for MEDS --- src/aces/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aces/__main__.py b/src/aces/__main__.py index b0a4a9e..80f94ab 100644 --- a/src/aces/__main__.py +++ b/src/aces/__main__.py @@ -46,7 +46,7 @@ def main(cfg: DictConfig): result = query.query(task_cfg, predicates_df) if cfg.data.standard.lower() == "meds": - result = result.rename(columns={"subject_id": "patient_id"}) + result = result.rename({"subject_id": "patient_id"}) # save results to parquet os.makedirs(os.path.dirname(cfg.output_filepath), exist_ok=True) From 81c70ae43bc91d629b8bb6db6dfe47315b79cada Mon Sep 17 00:00:00 2001 From: Justin Xu Date: Mon, 22 Jul 2024 09:14:44 +0100 Subject: [PATCH 4/8] Fix more doctests --- src/aces/aggregate.py | 96 +++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/src/aces/aggregate.py b/src/aces/aggregate.py index 98c9052..8dcdd98 100644 --- a/src/aces/aggregate.py +++ b/src/aces/aggregate.py @@ -147,33 +147,33 @@ def aggregate_temporal_window( >>> aggregate_temporal_window(df, ( ... False, timedelta(days=-1), False, timedelta(days=0))).drop("timestamp") shape: (6, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-11-30 12:03:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-02 05:17:00 ┆ 1989-12-01 05:17:00 ┆ 1 ┆ 0 ┆ 1 │ - │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-06 11:00:00 ┆ 1989-12-05 11:00:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 1989-11-30 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-02 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-11-30 12:03:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-01 05:17:00 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-05 11:00:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-11-30 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-02 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> aggregate_temporal_window(df, ( ... False, timedelta(hours=12), False, timedelta(hours=12))).drop("timestamp") shape: (6, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-02 00:03:00 ┆ 1989-12-02 12:03:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-02 17:17:00 ┆ 1989-12-03 05:17:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-03 00:03:00 ┆ 1989-12-03 12:03:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-06 23:00:00 ┆ 1989-12-07 11:00:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-02 01:14:00 ┆ 1989-12-02 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 03:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-02 00:03:00 ┆ 1989-12-02 12:03:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-02 17:17:00 ┆ 1989-12-03 05:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ 1989-12-03 00:03:00 ┆ 1989-12-03 12:03:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ 1989-12-06 23:00:00 ┆ 1989-12-07 11:00:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-02 01:14:00 ┆ 1989-12-02 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 03:17:00 ┆ 1989-12-04 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> # Note that left_inclusive and right_inclusive are relative to the temporal ordering of the window >>> # and not the timestamp of the row. E.g., if left_inclusive is False, the window will not include >>> # the earliest event in the aggregation window, regardless of whether that is earlier than the @@ -181,33 +181,33 @@ def aggregate_temporal_window( >>> aggregate_temporal_window(df, ( ... False, timedelta(days=-1), True, timedelta(days=1))).drop("timestamp") shape: (6, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-03 05:17:00 ┆ 1989-12-02 05:17:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-03 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-07 11:00:00 ┆ 1989-12-06 11:00:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-02 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 1 ┆ 1989-12-03 05:17:00 ┆ 1989-12-02 05:17:00 ┆ 1 ┆ 0 ┆ 0 ┆ 2 │ + │ 1 ┆ 1989-12-03 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ 1989-12-07 11:00:00 ┆ 1989-12-06 11:00:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-02 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> aggregate_temporal_window(df, ( ... True, timedelta(days=-1), False, timedelta(days=1))).drop("timestamp") shape: (6, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1 ┆ 1 ┆ 2 │ - │ 1 ┆ 1989-12-02 05:17:00 ┆ 1989-12-03 05:17:00 ┆ 1989-12-02 05:17:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-03 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-06 11:00:00 ┆ 1989-12-07 11:00:00 ┆ 1989-12-06 11:00:00 ┆ 0 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-02 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-02 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1 ┆ 1 ┆ 2 ┆ 1 │ + │ 1 ┆ 1989-12-03 05:17:00 ┆ 1989-12-02 05:17:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 1 ┆ 1989-12-03 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1 ┆ 0 ┆ 0 ┆ 2 │ + │ 1 ┆ 1989-12-07 11:00:00 ┆ 1989-12-06 11:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1989-12-02 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ + │ 2 ┆ 1989-12-04 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ """ if not isinstance(endpoint_expr, TemporalWindowBounds): endpoint_expr = TemporalWindowBounds(*endpoint_expr) From c36b5bddb1a2974d3db990877dcb0ab1d0a2cfc2 Mon Sep 17 00:00:00 2001 From: Justin Xu Date: Fri, 26 Jul 2024 08:15:30 +0100 Subject: [PATCH 5/8] Update test cases --- src/aces/aggregate.py | 610 ++++++++++++++++++------------------ src/aces/extract_subtree.py | 6 +- 2 files changed, 308 insertions(+), 308 deletions(-) diff --git a/src/aces/aggregate.py b/src/aces/aggregate.py index 8dcdd98..589d74e 100644 --- a/src/aces/aggregate.py +++ b/src/aces/aggregate.py @@ -312,79 +312,77 @@ def aggregate_event_bound_window( ... datetime(year=1989, month=12, day=8, hour=16, minute=22), ... datetime(year=1989, month=12, day=10, hour=3, minute=7), # HAS EVENT BOUND ... ], - ... "_EVENT_INDEX": [0, 1, 2, 0, 1, 2, 3, 4], ... "is_A": [1, 0, 1, 1, 1, 1, 0, 0], ... "is_B": [0, 1, 0, 1, 0, 1, 1, 1], ... "is_C": [0, 1, 0, 0, 0, 1, 0, 1], + ... "_EVENT_INDEX": [0, 1, 2, 0, 1, 2, 3, 4], ... }) - >>> aggregate_event_bound_window(df, ToEventWindowBounds(True, "is_C", True, None)).drop( - ... "_EVENT_INDEX") + >>> aggregate_event_bound_window(df, ToEventWindowBounds(True, "is_C", True, None)).drop("timestamp") shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-06 15:17:00 ┆ 3 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 2 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ - >>> aggregate_event_bound_window(df, ToEventWindowBounds(True, "is_C", False, None)).drop( - ... "_EVENT_INDEX") + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-06 15:17:00 ┆ 3 ┆ 2 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 2 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 ┆ 4 │ + │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ + >>> aggregate_event_bound_window(df, ToEventWindowBounds(True, "is_C", False, None)).drop("timestamp") shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-06 15:17:00 ┆ 2 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ - >>> aggregate_event_bound_window(df, ToEventWindowBounds(False, "is_C", True, None)).drop( - ... "_EVENT_INDEX") + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-06 15:17:00 ┆ 2 ┆ 1 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 ┆ 3 │ + │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ + >>> # unsure about event index for windows where start==end and no event (st_inclusive=False), null/# + >>> aggregate_event_bound_window(df, ToEventWindowBounds(False, "is_C", True, None)).drop("timestamp") shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-06 15:17:00 ┆ 2 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-06 15:17:00 ┆ 2 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> aggregate_event_bound_window(df, ToEventWindowBounds( - ... True, "is_C", True, timedelta(days=3))).drop("_EVENT_INDEX") + ... True, "is_C", True, timedelta(days=3))).drop("timestamp") shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-05 12:03:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-09 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-05 12:03:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-07 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 ┆ 4 │ + │ 2 ┆ 1989-12-09 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ """ if not isinstance(endpoint_expr, ToEventWindowBounds): endpoint_expr = ToEventWindowBounds(*endpoint_expr) @@ -510,180 +508,180 @@ def boolean_expr_bound_sum( ... datetime(year=1989, month=12, day=8, hour=16, minute=22), ... datetime(year=1989, month=12, day=10, hour=3, minute=7), # HAS EVENT BOUND ... ], - ... "_EVENT_INDEX": [0, 1, 2, 0, 1, 2, 3, 4], ... "idx": [0, 1, 2, 3, 4, 5, 6, 7], ... "is_A": [1, 0, 1, 1, 1, 1, 0, 0], ... "is_B": [0, 1, 0, 1, 0, 1, 1, 1], ... "is_C": [0, 1, 0, 0, 0, 1, 0, 1], + ... "_EVENT_INDEX": [0, 1, 2, 0, 1, 2, 3, 4], ... }) >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "bound_to_row", ... "both", - ... ).drop("idx") + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-05 15:17:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 2 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-08 16:22:00 ┆ 2 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-05 15:17:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 2 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-08 16:22:00 ┆ 2 ┆ 2 ┆ 1 ┆ 3 │ + │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "bound_to_row", ... "none", - ... ).drop("idx") + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-05 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-08 16:22:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-05 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-08 16:22:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 ┆ 3 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "bound_to_row", ... "left", - ... ).drop("idx") + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-05 15:17:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-08 16:22:00 ┆ 2 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-05 15:17:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-08 16:22:00 ┆ 2 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 ┆ 3 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "bound_to_row", ... "right", - ... ).drop("idx") + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-05 15:17:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-08 16:22:00 ┆ 1 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 3 ┆ 2 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-05 15:17:00 ┆ 1 ┆ 0 ┆ 0 ┆ 2 │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-06 15:17:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-08 16:22:00 ┆ 1 ┆ 2 ┆ 1 ┆ 3 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 3 ┆ 2 ┆ 4 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "row_to_bound", ... "both", - ... ).drop("idx") + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 2 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 3 ┆ 2 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 2 ┆ 1 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 3 ┆ 2 ┆ 4 │ + │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 ┆ 4 │ + │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "row_to_bound", ... "none", - ... ).drop("idx") + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 ┆ 3 │ + │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "row_to_bound", ... "left", - ... ).drop("idx") + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 2 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 1 ┆ 0 ┆ 0 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 2 ┆ 2 ┆ 1 ┆ 3 │ + │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 ┆ 3 │ + │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "row_to_bound", ... "right", - ... ).drop("idx") + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-02 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-06 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 ┆ 4 │ + │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> #### WITH OFFSET #### >>> boolean_expr_bound_sum( ... df, @@ -691,176 +689,176 @@ def boolean_expr_bound_sum( ... "bound_to_row", ... "both", ... offset = timedelta(days=3), - ... ).drop(["idx", "_EVENT_INDEX"]) + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-04 12:03:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-06 13:14:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-08 15:17:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-05 12:03:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 13:14:00 ┆ 2 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-09 15:17:00 ┆ 2 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-11 16:22:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-13 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-04 12:03:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-06 13:14:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-08 15:17:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-05 12:03:00 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 13:14:00 ┆ 2 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-09 15:17:00 ┆ 2 ┆ 2 ┆ 1 ┆ 3 │ + │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-11 16:22:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-13 03:07:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "bound_to_row", ... "left", ... offset = timedelta(days=3), - ... ).drop(["idx", "_EVENT_INDEX"]) + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-04 12:03:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-06 13:14:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ 1989-12-03 13:14:00 ┆ 1989-12-08 15:17:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-05 12:03:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 13:14:00 ┆ 2 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-09 15:17:00 ┆ 2 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-11 16:22:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 1989-12-13 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-04 12:03:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-06 13:14:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-12-08 15:17:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-05 12:03:00 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 13:14:00 ┆ 2 ┆ 1 ┆ 1 ┆ 2 │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-09 15:17:00 ┆ 2 ┆ 2 ┆ 1 ┆ 3 │ + │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-11 16:22:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-13 03:07:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "bound_to_row", ... "none", ... timedelta(days=-3), - ... ).drop(["idx", "_EVENT_INDEX"]) + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-05 16:22:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 03:07:00 ┆ 1 ┆ 1 ┆ 1 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-05 16:22:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 03:07:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "bound_to_row", ... "right", ... offset = timedelta(days=-3), - ... ).drop(["idx", "_EVENT_INDEX"]) + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-05 16:22:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 03:07:00 ┆ 1 ┆ 1 ┆ 1 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-05 16:22:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 03:07:00 ┆ 1 ┆ 1 ┆ 1 ┆ 2 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "row_to_bound", ... "both", ... offset = timedelta(days=3), - ... ).drop(["idx", "_EVENT_INDEX"]) + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-05 12:03:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 3 ┆ 2 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-09 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-05 12:03:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 3 ┆ 2 ┆ 4 │ + │ 2 ┆ 1989-12-07 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 ┆ 4 │ + │ 2 ┆ 1989-12-09 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "row_to_bound", ... "left", ... offset = timedelta(days=3), - ... ).drop(["idx", "_EVENT_INDEX"]) + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-12-05 12:03:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-07 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-09 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 1 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-05 12:03:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 ┆ 3 │ + │ 2 ┆ 1989-12-07 13:14:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1989-12-09 15:17:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ null ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ null │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "row_to_bound", ... "none", ... offset = timedelta(days=-3), - ... ).drop(["idx", "_EVENT_INDEX"]) + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-11-28 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-11-30 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ 1989-12-02 15:17:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-11-29 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 13:14:00 ┆ 0 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-05 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-07 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-11-28 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1989-11-30 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1989-12-02 15:17:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-11-29 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 1 ┆ 0 ┆ 0 │ + │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 1 ┆ 0 ┆ 0 │ + │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 13:14:00 ┆ 0 ┆ 0 ┆ 0 ┆ null │ + │ 2 ┆ 1989-12-05 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 2 ┆ 1 ┆ 3 │ + │ 2 ┆ 1989-12-07 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 1 ┆ 0 ┆ 3 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ >>> boolean_expr_bound_sum( ... df, ... pl.col("idx").is_in([1, 4, 7]), ... "row_to_bound", ... "right", ... offset = timedelta(days=-3), - ... ).drop(["idx", "_EVENT_INDEX"]) + ... ).drop(["idx", "timestamp"]) shape: (8, 7) - ┌────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┐ - │ subject_id ┆ timestamp ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╡ - │ 1 ┆ 1989-12-01 12:03:00 ┆ 1989-11-28 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-03 13:14:00 ┆ 1989-11-30 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 1 ┆ 1 │ - │ 1 ┆ 1989-12-05 15:17:00 ┆ 1989-12-02 15:17:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 │ - │ 2 ┆ 1989-12-02 12:03:00 ┆ 1989-11-29 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 2 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-04 13:14:00 ┆ 1989-12-01 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 2 ┆ 1 ┆ 0 │ - │ 2 ┆ 1989-12-06 15:17:00 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 0 ┆ 0 │ - │ 2 ┆ 1989-12-08 16:22:00 ┆ 1989-12-05 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 3 ┆ 2 │ - │ 2 ┆ 1989-12-10 03:07:00 ┆ 1989-12-07 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 │ - └────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┘ + ┌────────────┬─────────────────────┬─────────────────────┬──────┬──────┬──────┬──────────────┐ + │ subject_id ┆ timestamp_at_start ┆ timestamp_at_end ┆ is_A ┆ is_B ┆ is_C ┆ _EVENT_INDEX │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ datetime[μs] ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════════╪═════════════════════╪══════╪══════╪══════╪══════════════╡ + │ 1 ┆ 1989-11-28 12:03:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-11-30 13:14:00 ┆ 1989-12-03 13:14:00 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + │ 1 ┆ 1989-12-02 15:17:00 ┆ 1989-12-03 13:14:00 ┆ 0 ┆ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1989-11-29 12:03:00 ┆ 1989-12-04 13:14:00 ┆ 2 ┆ 1 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-01 13:14:00 ┆ 1989-12-04 13:14:00 ┆ 2 ┆ 1 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-03 15:17:00 ┆ 1989-12-04 13:14:00 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │ + │ 2 ┆ 1989-12-05 16:22:00 ┆ 1989-12-10 03:07:00 ┆ 1 ┆ 3 ┆ 2 ┆ 4 │ + │ 2 ┆ 1989-12-07 03:07:00 ┆ 1989-12-10 03:07:00 ┆ 0 ┆ 2 ┆ 1 ┆ 4 │ + └────────────┴─────────────────────┴─────────────────────┴──────┴──────┴──────┴──────────────┘ """ if mode not in ("bound_to_row", "row_to_bound"): raise ValueError(f"Mode '{mode}' invalid!") @@ -981,6 +979,7 @@ def boolean_expr_bound_sum( pl.col("timestamp_at_boundary").fill_null(strategy=fill_strategy).over("subject_id"), *sum_exprs.values(), "is_real", + EVENT_INDEX_COLUMN, ) .filter("is_real") .drop("is_real") @@ -1004,6 +1003,7 @@ def boolean_expr_bound_sum( st_timestamp_expr.alias("timestamp_at_start"), end_timestamp_expr.alias("timestamp_at_end"), *(pl.col(c).cast(PRED_CNT_TYPE).fill_null(0).alias(c) for c in cols), + EVENT_INDEX_COLUMN, ) if mode == "bound_to_row" and offset > timedelta(0): diff --git a/src/aces/extract_subtree.py b/src/aces/extract_subtree.py index bfb3592..1a5217c 100644 --- a/src/aces/extract_subtree.py +++ b/src/aces/extract_subtree.py @@ -8,7 +8,7 @@ from .aggregate import aggregate_event_bound_window, aggregate_temporal_window from .constraints import check_constraints -from .types import EVENT_INDEX_COLUMN, LAST_EVENT_INDEX_COLUMN +from .types import EVENT_INDEX_COLUMN def extract_subtree( @@ -144,7 +144,7 @@ def extract_subtree( >>> subtreee_anchor_realizations = ( ... predicates_df.filter(pl.col("is_admission") > 0) ... .rename({"timestamp": "subtree_anchor_timestamp"}) - ... ).select("subject_id", "subtree_anchor_timestamp") + ... ).select("subject_id", "subtree_anchor_timestamp", "_EVENT_INDEX") >>> print(subtreee_anchor_realizations) shape: (5, 3) ┌────────────┬──────────────────────────┬──────────────┐ @@ -330,7 +330,7 @@ def extract_subtree( pl.lit(child.name).alias("window_name"), "timestamp_at_start", "timestamp_at_end", - pl.col(EVENT_INDEX_COLUMN).alias(LAST_EVENT_INDEX_COLUMN), + pl.col(EVENT_INDEX_COLUMN), *predicate_cols, ).alias(f"{child.name}_summary"), ) From c8f578cd3089e219fc449a9efd900177ec3a45b0 Mon Sep 17 00:00:00 2001 From: Justin Xu Date: Fri, 2 Aug 2024 06:46:20 +0100 Subject: [PATCH 6/8] Modify index type and simplify index column creation --- src/aces/predicates.py | 4 +--- src/aces/types.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/aces/predicates.py b/src/aces/predicates.py index c66d74b..a6a82d8 100644 --- a/src/aces/predicates.py +++ b/src/aces/predicates.py @@ -639,9 +639,7 @@ def get_predicates_df(cfg: TaskExtractorConfig, data_config: DictConfig) -> pl.D predicate_cols += special_predicates # create a column for event_id - data = data.with_columns(pl.lit(1).alias(EVENT_INDEX_COLUMN)) data = data.with_columns( - (pl.col(EVENT_INDEX_COLUMN).cum_sum().over("subject_id") - 1).cast(EVENT_INDEX_TYPE) + pl.int_range(pl.len()).over("subject_id").cast(EVENT_INDEX_TYPE).alias(EVENT_INDEX_COLUMN) ) - return data diff --git a/src/aces/types.py b/src/aces/types.py index d74fc0a..5cdd70a 100644 --- a/src/aces/types.py +++ b/src/aces/types.py @@ -13,7 +13,7 @@ PRED_CNT_TYPE = pl.Int64 # The type used for event indexing -EVENT_INDEX_TYPE = pl.Int64 +EVENT_INDEX_TYPE = pl.UInt64 # The key used in the endpoint expression to indicate the window should be aggregated to the record start. START_OF_RECORD_KEY = "_RECORD_START" From 685c2aa923b3d96ada06e2a27d5c3c5d7174951f Mon Sep 17 00:00:00 2001 From: Justin Xu Date: Fri, 2 Aug 2024 06:48:21 +0100 Subject: [PATCH 7/8] Renamed to last event index column --- src/aces/extract_subtree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aces/extract_subtree.py b/src/aces/extract_subtree.py index 1a5217c..a6be0e7 100644 --- a/src/aces/extract_subtree.py +++ b/src/aces/extract_subtree.py @@ -8,7 +8,7 @@ from .aggregate import aggregate_event_bound_window, aggregate_temporal_window from .constraints import check_constraints -from .types import EVENT_INDEX_COLUMN +from .types import EVENT_INDEX_COLUMN, LAST_EVENT_INDEX_COLUMN def extract_subtree( @@ -330,7 +330,7 @@ def extract_subtree( pl.lit(child.name).alias("window_name"), "timestamp_at_start", "timestamp_at_end", - pl.col(EVENT_INDEX_COLUMN), + pl.col(LAST_EVENT_INDEX_COLUMN), *predicate_cols, ).alias(f"{child.name}_summary"), ) From 3a5a8b48a9ae21cde68816cf63bd2217fc45c3c6 Mon Sep 17 00:00:00 2001 From: Justin Xu Date: Fri, 2 Aug 2024 06:59:51 +0100 Subject: [PATCH 8/8] More renaming of event index columns --- src/aces/aggregate.py | 5 +++-- src/aces/extract_subtree.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/aces/aggregate.py b/src/aces/aggregate.py index 589d74e..f0e8f08 100644 --- a/src/aces/aggregate.py +++ b/src/aces/aggregate.py @@ -6,6 +6,7 @@ from .types import ( EVENT_INDEX_COLUMN, + LAST_EVENT_INDEX_COLUMN, PRED_CNT_TYPE, TemporalWindowBounds, ToEventWindowBounds, @@ -1003,7 +1004,7 @@ def boolean_expr_bound_sum( st_timestamp_expr.alias("timestamp_at_start"), end_timestamp_expr.alias("timestamp_at_end"), *(pl.col(c).cast(PRED_CNT_TYPE).fill_null(0).alias(c) for c in cols), - EVENT_INDEX_COLUMN, + pl.col(EVENT_INDEX_COLUMN).alias(LAST_EVENT_INDEX_COLUMN), ) if mode == "bound_to_row" and offset > timedelta(0): @@ -1040,5 +1041,5 @@ def agg_offset_fn(c: str) -> pl.Expr: st_timestamp_expr.alias("timestamp_at_start"), end_timestamp_expr.alias("timestamp_at_end"), *(agg_offset_fn(c).cast(PRED_CNT_TYPE, strict=False).fill_null(0).alias(c) for c in cols), - EVENT_INDEX_COLUMN, + pl.col(EVENT_INDEX_COLUMN).alias(LAST_EVENT_INDEX_COLUMN), ) diff --git a/src/aces/extract_subtree.py b/src/aces/extract_subtree.py index a6be0e7..84b42dd 100644 --- a/src/aces/extract_subtree.py +++ b/src/aces/extract_subtree.py @@ -330,7 +330,7 @@ def extract_subtree( pl.lit(child.name).alias("window_name"), "timestamp_at_start", "timestamp_at_end", - pl.col(LAST_EVENT_INDEX_COLUMN), + pl.col(EVENT_INDEX_COLUMN).alias(LAST_EVENT_INDEX_COLUMN), *predicate_cols, ).alias(f"{child.name}_summary"), )