From 239f4152e67399f16ae6f98704a55f78c9b0d60d Mon Sep 17 00:00:00 2001
From: Sijia Wang <wangsijia0628@gmail.com>
Date: Sun, 21 Apr 2024 18:16:32 -0400
Subject: [PATCH] Automatically drop unneeded columns in choosers table (#833)

* auto drop vars in choosers table

* formatting

* vars hardcoded in the custom chooser functions

* accidentally committed outputs

* skip dropping when tracing

* skip dropping if estimation mode

* test dropping before interaction df

* black

* revert drop inplace

* cleaning dup codes to function

* drop unused columns for logsum calculation

* blacken

* add drop_unused_columns to compute_settings

* check setting before dropping columns

* rename method; cherry-pick parking location changes from https://github.com/ActivitySim/activitysim/pull/849

* protect additional columns not in spec

---------

Co-authored-by: Jeff Newman <jeff@driftless.xyz>
---
 .../abm/models/joint_tour_participation.py    | 11 ++-
 .../abm/models/parking_location_choice.py     | 22 +++--
 activitysim/abm/models/util/logsums.py        |  1 +
 .../abm/models/util/tour_destination.py       |  1 +
 activitysim/abm/models/util/tour_od.py        |  1 +
 activitysim/core/configuration/base.py        | 11 +++
 activitysim/core/interaction_sample.py        | 19 +++++
 .../core/interaction_sample_simulate.py       | 26 +++++-
 activitysim/core/interaction_simulate.py      | 22 +++++
 activitysim/core/simulate.py                  | 42 ++++++++++
 activitysim/core/util.py                      | 82 +++++++++++++++++++
 11 files changed, 230 insertions(+), 8 deletions(-)

diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py
index 98a8c70c7..55d5367b3 100644
--- a/activitysim/abm/models/joint_tour_participation.py
+++ b/activitysim/abm/models/joint_tour_participation.py
@@ -18,7 +18,7 @@
     tracing,
     workflow,
 )
-from activitysim.core.configuration.base import PreprocessorSettings
+from activitysim.core.configuration.base import ComputeSettings, PreprocessorSettings
 from activitysim.core.configuration.logit import LogitComponentSettings
 from activitysim.core.util import assign_in_place, reindex
 
@@ -408,6 +408,15 @@ def joint_tour_participation(
     )
     candidates["chunk_id"] = reindex(household_chunk_ids, candidates.household_id)
 
+    # these hardcoded columns need to be protected from being dropped
+    assert model_settings is not None
+    if model_settings.compute_settings is None:
+        model_settings.compute_settings = ComputeSettings()
+    assert model_settings.compute_settings is not None
+    for i in ["person_is_preschool", "composition", "adult"]:
+        if i not in model_settings.compute_settings.protect_columns:
+            model_settings.compute_settings.protect_columns.append(i)
+
     choices = simulate.simple_simulate_by_chunk_id(
         state,
         choosers=candidates,
diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py
index d594dcfa3..f40c5d9fe 100644
--- a/activitysim/abm/models/parking_location_choice.py
+++ b/activitysim/abm/models/parking_location_choice.py
@@ -21,7 +21,7 @@
 from activitysim.core.configuration.logit import LogitComponentSettings
 from activitysim.core.interaction_sample_simulate import interaction_sample_simulate
 from activitysim.core.tracing import print_elapsed_time
-from activitysim.core.util import assign_in_place
+from activitysim.core.util import assign_in_place, drop_unused_columns
 
 logger = logging.getLogger(__name__)
 
@@ -106,6 +106,7 @@ def parking_destination_simulate(
     destination_sample,
     model_settings: ParkingLocationSettings,
     skims,
+    locals_dict,
     chunk_size,
     trace_hh_id,
     trace_label,
@@ -132,11 +133,6 @@ def parking_destination_simulate(
 
     logger.info("Running parking_destination_simulate with %d trips", len(trips))
 
-    locals_dict = config.get_model_constants(model_settings).copy()
-    locals_dict.update(skims)
-    locals_dict["timeframe"] = "trip"
-    locals_dict["PARKING"] = skims["op_skims"].dest_key
-
     parking_locations = interaction_sample_simulate(
         state,
         choosers=trips,
@@ -181,6 +177,19 @@ def choose_parking_location(
     t0 = print_elapsed_time()
 
     alt_dest_col_name = model_settings.ALT_DEST_COL_NAME
+
+    # remove trips and alts columns that are not used in spec
+    locals_dict = config.get_model_constants(model_settings).copy()
+    locals_dict.update(skims)
+    locals_dict["timeframe"] = "trip"
+    locals_dict["PARKING"] = skims["op_skims"].dest_key
+
+    spec = get_spec_for_segment(state, model_settings, segment_name)
+    trips = drop_unused_columns(trips, spec, locals_dict, custom_chooser=None)
+    alternatives = drop_unused_columns(
+        alternatives, spec, locals_dict, custom_chooser=None
+    )
+
     destination_sample = logit.interaction_dataset(
         state, trips, alternatives, alt_index_id=alt_dest_col_name
     )
@@ -194,6 +203,7 @@ def choose_parking_location(
         destination_sample=destination_sample,
         model_settings=model_settings,
         skims=skims,
+        locals_dict=locals_dict,
         chunk_size=chunk_size,
         trace_hh_id=trace_hh_id,
         trace_label=trace_label,
diff --git a/activitysim/abm/models/util/logsums.py b/activitysim/abm/models/util/logsums.py
index 328957965..82eef1c60 100644
--- a/activitysim/abm/models/util/logsums.py
+++ b/activitysim/abm/models/util/logsums.py
@@ -261,6 +261,7 @@ def compute_location_choice_logsums(
         chunk_size=chunk_size,
         chunk_tag=chunk_tag,
         trace_label=trace_label,
+        compute_settings=logsum_settings.compute_settings,
     )
 
     return logsums
diff --git a/activitysim/abm/models/util/tour_destination.py b/activitysim/abm/models/util/tour_destination.py
index 22b0744da..751301d6a 100644
--- a/activitysim/abm/models/util/tour_destination.py
+++ b/activitysim/abm/models/util/tour_destination.py
@@ -780,6 +780,7 @@ def run_destination_simulate(
         trace_choice_name="destination",
         estimator=estimator,
         skip_choice=skip_choice,
+        compute_settings=model_settings.compute_settings,
     )
 
     if not want_logsums:
diff --git a/activitysim/abm/models/util/tour_od.py b/activitysim/abm/models/util/tour_od.py
index 89dc7fdc3..06a9364ff 100644
--- a/activitysim/abm/models/util/tour_od.py
+++ b/activitysim/abm/models/util/tour_od.py
@@ -1057,6 +1057,7 @@ def run_od_simulate(
         trace_label=trace_label,
         trace_choice_name="origin_destination",
         estimator=estimator,
+        compute_settings=model_settings.compute_settings,
     )
 
     if not want_logsums:
diff --git a/activitysim/core/configuration/base.py b/activitysim/core/configuration/base.py
index 556dd9916..5b1cbc22f 100644
--- a/activitysim/core/configuration/base.py
+++ b/activitysim/core/configuration/base.py
@@ -201,6 +201,16 @@ class ComputeSettings(PydanticBase):
     for more information.
     """
 
+    drop_unused_columns: bool = True
+    """Drop unused columns in the choosers df.
+
+    Set to True to drop, or False to keep, unused columns in the data table for a
+    specific component. Defaults to True. If False, all columns will be kept.
+    """
+
+    protect_columns: list[str] = []
+    """Protect these columns from being dropped from the chooser table."""
+
     def should_skip(self, subcomponent: str) -> bool:
         """Check if sharrow should be skipped for a particular subcomponent."""
         if isinstance(self.sharrow_skip, dict):
@@ -232,6 +242,7 @@ def subcomponent_settings(self, subcomponent: str) -> ComputeSettings:
             use_bottleneck=self.use_bottleneck,
             use_numexpr=self.use_numexpr,
             use_numba=self.use_numba,
+            drop_unused_columns=self.drop_unused_columns,
         )
 
 
diff --git a/activitysim/core/interaction_sample.py b/activitysim/core/interaction_sample.py
index 91ea04f84..5a05b7e8b 100644
--- a/activitysim/core/interaction_sample.py
+++ b/activitysim/core/interaction_sample.py
@@ -14,6 +14,7 @@
     simulate,
     tracing,
     workflow,
+    util,
 )
 from activitysim.core.configuration.base import ComputeSettings
 from activitysim.core.skim_dataset import DatasetWrapper
@@ -240,6 +241,24 @@ def _interaction_sample(
 
     interaction_utilities = None
     interaction_utilities_sh = None
+
+    if compute_settings is None:
+        compute_settings = ComputeSettings()
+
+    # drop variables before the interaction dataframe is created
+
+    # check if tracing is enabled and if we have trace targets
+    # if there are none and the setting allows it, drop unused columns
+    if (not have_trace_targets) and (compute_settings.drop_unused_columns):
+
+        choosers = util.drop_unused_columns(
+            choosers,
+            spec,
+            locals_d,
+            custom_chooser=None,
+            sharrow_enabled=sharrow_enabled,
+        )
+
     if sharrow_enabled:
         (
             interaction_utilities,
diff --git a/activitysim/core/interaction_sample_simulate.py b/activitysim/core/interaction_sample_simulate.py
index 5dcda88b5..3d729ad49 100644
--- a/activitysim/core/interaction_sample_simulate.py
+++ b/activitysim/core/interaction_sample_simulate.py
@@ -7,7 +7,8 @@
 import numpy as np
 import pandas as pd
 
-from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow
+
+from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow, util
 from activitysim.core.configuration.base import ComputeSettings
 from activitysim.core.simulate import set_skim_wrapper_targets
 
@@ -136,6 +137,29 @@ def _interaction_sample_simulate(
     logger.info(
         f"{trace_label} start merging choosers and alternatives to create interaction_df"
     )
+
+    # drop variables before the interaction dataframe is created
+    sharrow_enabled = state.settings.sharrow
+
+    if compute_settings is None:
+        compute_settings = ComputeSettings()
+
+    # check if tracing is enabled and if we have trace targets
+    # if not estimation mode, drop unused columns
+    if (
+        (not have_trace_targets)
+        and (estimator is None)
+        and (compute_settings.drop_unused_columns)
+    ):
+
+        choosers = util.drop_unused_columns(
+            choosers,
+            spec,
+            locals_d,
+            custom_chooser=None,
+            sharrow_enabled=sharrow_enabled,
+        )
+
     interaction_df = alternatives.join(choosers, how="left", rsuffix="_chooser")
     logger.info(
         f"{trace_label} end merging choosers and alternatives to create interaction_df"
diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py
index 12433056d..1c145ae6c 100644
--- a/activitysim/core/interaction_simulate.py
+++ b/activitysim/core/interaction_simulate.py
@@ -13,6 +13,7 @@
 import pandas as pd
 
 from . import chunk, config, logit, simulate, tracing, workflow
+from activitysim.core import util
 from .configuration.base import ComputeSettings
 
 logger = logging.getLogger(__name__)
@@ -710,6 +711,27 @@ def _interaction_simulate(
         sharrow_enabled = state.settings.sharrow
     interaction_utilities = None
 
+    if compute_settings is None:
+        compute_settings = ComputeSettings()
+
+    # drop variables before the interaction dataframe is created
+
+    # check if tracing is enabled and if we have trace targets
+    # if not estimation mode, drop unused columns
+    if (
+        (not have_trace_targets)
+        and (estimator is None)
+        and (compute_settings.drop_unused_columns)
+    ):
+
+        choosers = util.drop_unused_columns(
+            choosers,
+            spec,
+            locals_d,
+            custom_chooser=None,
+            sharrow_enabled=sharrow_enabled,
+        )
+
     if (
         sharrow_enabled
         and skims is None
diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py
index 203c25668..d97b87a0d 100644
--- a/activitysim/core/simulate.py
+++ b/activitysim/core/simulate.py
@@ -1528,6 +1528,31 @@ def _simple_simulate(
     if skims is not None:
         set_skim_wrapper_targets(choosers, skims)
 
+    # check if tracing is enabled and if we have trace targets
+    have_trace_targets = state.tracing.has_trace_targets(choosers)
+
+    sharrow_enabled = state.settings.sharrow
+
+    if compute_settings is None:
+        compute_settings = ComputeSettings()
+
+    # if tracing is not enabled, drop unused columns
+    # if not estimation mode, drop unused columns
+    if (
+        (not have_trace_targets)
+        and (estimator is None)
+        and (compute_settings.drop_unused_columns)
+    ):
+        # drop unused variables in chooser table
+        choosers = util.drop_unused_columns(
+            choosers,
+            spec,
+            locals_d,
+            custom_chooser,
+            sharrow_enabled=sharrow_enabled,
+            additional_columns=compute_settings.protect_columns,
+        )
+
     if nest_spec is None:
         choices = eval_mnl(
             state,
@@ -1949,6 +1974,23 @@ def _simple_simulate_logsums(
     if skims is not None:
         set_skim_wrapper_targets(choosers, skims)
 
+    # check if tracing is enabled and if we have trace targets
+    have_trace_targets = state.tracing.has_trace_targets(choosers)
+
+    if compute_settings is None:
+        compute_settings = ComputeSettings()
+
+    # if tracing is not enabled, drop unused columns
+    if (not have_trace_targets) and (compute_settings.drop_unused_columns):
+        # drop unused variables in chooser table
+        choosers = util.drop_unused_columns(
+            choosers,
+            spec,
+            locals_d,
+            custom_chooser=None,
+            sharrow_enabled=state.settings.sharrow,
+        )
+
     if nest_spec is None:
         logsums = eval_mnl_logsums(
             state,
diff --git a/activitysim/core/util.py b/activitysim/core/util.py
index 3130b18f5..eb874f284 100644
--- a/activitysim/core/util.py
+++ b/activitysim/core/util.py
@@ -638,3 +638,85 @@ def zarr_file_modification_time(zarr_dir: Path):
     if t == 0:
         raise FileNotFoundError(zarr_dir)
     return t
+
+
+def drop_unused_columns(
+    choosers,
+    spec,
+    locals_d,
+    custom_chooser,
+    sharrow_enabled=False,
+    additional_columns=None,
+):
+    """
+    Return `choosers` restricted to the columns that may be needed downstream.
+
+    A column is kept when its name occurs as an identifier-like token in a spec
+    expression or in the source of `custom_chooser`, is listed in `additional_columns`,
+    or is one of the hardcoded skim-key and scheduling columns collected below.
+    If the source of `custom_chooser` cannot be read, `choosers` is returned unchanged.
+
+    Parameters
+    ----------
+    choosers : pandas.DataFrame
+    spec : pandas.DataFrame
+        Must expose "Expression" as a column or as an index level.
+    locals_d : dict or None
+        May name key columns ("orig_col_name", "dest_col_name", "ORIGIN",
+        "DESTINATION", "PARKING") or hold "od_skims"/"dp_skims"/"dnt_skims" wrappers.
+    custom_chooser : callable or None
+    sharrow_enabled : bool, default False
+    additional_columns : iterable of str, optional
+
+    Returns
+    -------
+    pandas.DataFrame
+        The kept columns, in their original order.
+    """
+    import inspect
+    import re
+
+    # identifier-like tokens; this also matches keywords and function names
+    pattern = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")
+
+    # update per expression: .sum() of lists is quadratic and is 0 on an empty spec
+    keep = set(additional_columns or [])
+    for expression in spec.reset_index()["Expression"]:
+        keep.update(pattern.findall(str(expression)))
+
+    if locals_d:
+        keep.update(locals_d.get(k) for k in ("orig_col_name", "dest_col_name"))
+        if locals_d.get("timeframe") == "trip":
+            orig_col_name = locals_d.get("ORIGIN")
+            dest_col_name = locals_d.get("DESTINATION")
+            if orig_col_name is None and "od_skims" in locals_d:
+                orig_col_name = locals_d["od_skims"].orig_key
+            if dest_col_name is None and "od_skims" in locals_d:
+                dest_col_name = locals_d["od_skims"].dest_key
+            keep.update((orig_col_name, dest_col_name, locals_d.get("PARKING")))
+            for skims_name in ("dp_skims", "dnt_skims"):
+                if skims_name in locals_d:
+                    keep.add(locals_d[skims_name].dest_key)
+            keep.add("trip_period")
+        # when using trip_scheduling_choice for trip scheduling
+        keep.update(("last_outbound_stop", "last_inbound_stop"))
+
+    # when sharrow mode, need to keep the following columns in the choosers table
+    if sharrow_enabled:
+        keep.update(("out_period", "in_period", "purpose_index_num"))
+
+    if custom_chooser:
+        try:
+            keep.update(pattern.findall(inspect.getsource(custom_chooser)))
+        except (OSError, TypeError):
+            # source unavailable, so the columns it reads are unknown
+            logger.warning("cannot inspect custom_chooser; keeping all columns")
+            return choosers
+    keep.discard(None)  # unset names from the lookups above
+
+    logger.info("Dropping unused variables in chooser table")
+    message = "%s dropping, the choosers table has %s columns: %s"
+    logger.info(message, "before", len(choosers.columns), choosers.columns)
+    choosers = choosers[[c for c in choosers.columns if c in keep]]
+    logger.info(message, "after", len(choosers.columns), choosers.columns)
+    return choosers