From 239f4152e67399f16ae6f98704a55f78c9b0d60d Mon Sep 17 00:00:00 2001 From: Sijia Wang Date: Sun, 21 Apr 2024 18:16:32 -0400 Subject: [PATCH] Automatically drop unneeded columns in choosers table (#833) * auto drop vars in choosers table * formatting * vars hardcoded in the custom chooser functions * accidentally committed outputs * skip dropping when tracing * skip dropping if estimation mode * test dropping before interaction df * black * revert drop inplace * cleaning dup codes to function * drop unused columns for logsum calculation * blacken * add drop_unused_columns to compute_settings * check setting before dropping columns * rename method; cherry-pick parking location changes from https://github.com/ActivitySim/activitysim/pull/849 * protect additional columns not in spec --------- Co-authored-by: Jeff Newman --- .../abm/models/joint_tour_participation.py | 11 ++- .../abm/models/parking_location_choice.py | 22 +++-- activitysim/abm/models/util/logsums.py | 1 + .../abm/models/util/tour_destination.py | 1 + activitysim/abm/models/util/tour_od.py | 1 + activitysim/core/configuration/base.py | 11 +++ activitysim/core/interaction_sample.py | 19 +++++ .../core/interaction_sample_simulate.py | 26 +++++- activitysim/core/interaction_simulate.py | 22 +++++ activitysim/core/simulate.py | 42 ++++++++++ activitysim/core/util.py | 82 +++++++++++++++++++ 11 files changed, 230 insertions(+), 8 deletions(-) diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index 98a8c70c7..55d5367b3 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -18,7 +18,7 @@ tracing, workflow, ) -from activitysim.core.configuration.base import PreprocessorSettings +from activitysim.core.configuration.base import ComputeSettings, PreprocessorSettings from activitysim.core.configuration.logit import LogitComponentSettings from activitysim.core.util import 
assign_in_place, reindex @@ -408,6 +408,15 @@ def joint_tour_participation( ) candidates["chunk_id"] = reindex(household_chunk_ids, candidates.household_id) + # these hardcoded columns need to be protected from being dropped + assert model_settings is not None + if model_settings.compute_settings is None: + model_settings.compute_settings = ComputeSettings() + assert model_settings.compute_settings is not None + for i in ["person_is_preschool", "composition", "adult"]: + if i not in model_settings.compute_settings.protect_columns: + model_settings.compute_settings.protect_columns.append(i) + choices = simulate.simple_simulate_by_chunk_id( state, choosers=candidates, diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py index d594dcfa3..f40c5d9fe 100644 --- a/activitysim/abm/models/parking_location_choice.py +++ b/activitysim/abm/models/parking_location_choice.py @@ -21,7 +21,7 @@ from activitysim.core.configuration.logit import LogitComponentSettings from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.tracing import print_elapsed_time -from activitysim.core.util import assign_in_place +from activitysim.core.util import assign_in_place, drop_unused_columns logger = logging.getLogger(__name__) @@ -106,6 +106,7 @@ def parking_destination_simulate( destination_sample, model_settings: ParkingLocationSettings, skims, + locals_dict, chunk_size, trace_hh_id, trace_label, @@ -132,11 +133,6 @@ def parking_destination_simulate( logger.info("Running parking_destination_simulate with %d trips", len(trips)) - locals_dict = config.get_model_constants(model_settings).copy() - locals_dict.update(skims) - locals_dict["timeframe"] = "trip" - locals_dict["PARKING"] = skims["op_skims"].dest_key - parking_locations = interaction_sample_simulate( state, choosers=trips, @@ -181,6 +177,19 @@ def choose_parking_location( t0 = print_elapsed_time() alt_dest_col_name = 
model_settings.ALT_DEST_COL_NAME + + # remove trips and alts columns that are not used in spec + locals_dict = config.get_model_constants(model_settings).copy() + locals_dict.update(skims) + locals_dict["timeframe"] = "trip" + locals_dict["PARKING"] = skims["op_skims"].dest_key + + spec = get_spec_for_segment(state, model_settings, segment_name) + trips = drop_unused_columns(trips, spec, locals_dict, custom_chooser=None) + alternatives = drop_unused_columns( + alternatives, spec, locals_dict, custom_chooser=None + ) + destination_sample = logit.interaction_dataset( state, trips, alternatives, alt_index_id=alt_dest_col_name ) @@ -194,6 +203,7 @@ def choose_parking_location( destination_sample=destination_sample, model_settings=model_settings, skims=skims, + locals_dict=locals_dict, chunk_size=chunk_size, trace_hh_id=trace_hh_id, trace_label=trace_label, diff --git a/activitysim/abm/models/util/logsums.py b/activitysim/abm/models/util/logsums.py index 328957965..82eef1c60 100644 --- a/activitysim/abm/models/util/logsums.py +++ b/activitysim/abm/models/util/logsums.py @@ -261,6 +261,7 @@ def compute_location_choice_logsums( chunk_size=chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, + compute_settings=logsum_settings.compute_settings, ) return logsums diff --git a/activitysim/abm/models/util/tour_destination.py b/activitysim/abm/models/util/tour_destination.py index 22b0744da..751301d6a 100644 --- a/activitysim/abm/models/util/tour_destination.py +++ b/activitysim/abm/models/util/tour_destination.py @@ -780,6 +780,7 @@ def run_destination_simulate( trace_choice_name="destination", estimator=estimator, skip_choice=skip_choice, + compute_settings=model_settings.compute_settings, ) if not want_logsums: diff --git a/activitysim/abm/models/util/tour_od.py b/activitysim/abm/models/util/tour_od.py index 89dc7fdc3..06a9364ff 100644 --- a/activitysim/abm/models/util/tour_od.py +++ b/activitysim/abm/models/util/tour_od.py @@ -1057,6 +1057,7 @@ def run_od_simulate( 
trace_label=trace_label, trace_choice_name="origin_destination", estimator=estimator, + compute_settings=model_settings.compute_settings, ) if not want_logsums: diff --git a/activitysim/core/configuration/base.py b/activitysim/core/configuration/base.py index 556dd9916..5b1cbc22f 100644 --- a/activitysim/core/configuration/base.py +++ b/activitysim/core/configuration/base.py @@ -201,6 +201,16 @@ class ComputeSettings(PydanticBase): for more information. """ + drop_unused_columns: bool = True + """Drop unused columns in the choosers df. + + Set to True or False to drop unused columns in data table for specific component. + Default to True. If set to False, all columns in the data table will be kept. + """ + + protect_columns: list[str] = [] + """Protect these columns from being dropped from the chooser table.""" + def should_skip(self, subcomponent: str) -> bool: """Check if sharrow should be skipped for a particular subcomponent.""" if isinstance(self.sharrow_skip, dict): @@ -232,6 +242,7 @@ def subcomponent_settings(self, subcomponent: str) -> ComputeSettings: use_bottleneck=self.use_bottleneck, use_numexpr=self.use_numexpr, use_numba=self.use_numba, + drop_unused_columns=self.drop_unused_columns, ) diff --git a/activitysim/core/interaction_sample.py b/activitysim/core/interaction_sample.py index 91ea04f84..5a05b7e8b 100644 --- a/activitysim/core/interaction_sample.py +++ b/activitysim/core/interaction_sample.py @@ -14,6 +14,7 @@ simulate, tracing, workflow, + util, ) from activitysim.core.configuration.base import ComputeSettings from activitysim.core.skim_dataset import DatasetWrapper @@ -240,6 +241,24 @@ def _interaction_sample( interaction_utilities = None interaction_utilities_sh = None + + if compute_settings is None: + compute_settings = ComputeSettings() + + # drop variables before the interaction dataframe is created + + # check if tracing is enabled and if we have trace targets + # if not estimation mode, drop unused columns + if (not have_trace_targets) 
and (compute_settings.drop_unused_columns): + + choosers = util.drop_unused_columns( + choosers, + spec, + locals_d, + custom_chooser=None, + sharrow_enabled=sharrow_enabled, + ) + if sharrow_enabled: ( interaction_utilities, diff --git a/activitysim/core/interaction_sample_simulate.py b/activitysim/core/interaction_sample_simulate.py index 5dcda88b5..3d729ad49 100644 --- a/activitysim/core/interaction_sample_simulate.py +++ b/activitysim/core/interaction_sample_simulate.py @@ -7,7 +7,8 @@ import numpy as np import pandas as pd -from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow + +from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow, util from activitysim.core.configuration.base import ComputeSettings from activitysim.core.simulate import set_skim_wrapper_targets @@ -136,6 +137,29 @@ def _interaction_sample_simulate( logger.info( f"{trace_label} start merging choosers and alternatives to create interaction_df" ) + + # drop variables before the interaction dataframe is created + sharrow_enabled = state.settings.sharrow + + if compute_settings is None: + compute_settings = ComputeSettings() + + # check if tracing is enabled and if we have trace targets + # if not estimation mode, drop unused columns + if ( + (not have_trace_targets) + and (estimator is None) + and (compute_settings.drop_unused_columns) + ): + + choosers = util.drop_unused_columns( + choosers, + spec, + locals_d, + custom_chooser=None, + sharrow_enabled=sharrow_enabled, + ) + interaction_df = alternatives.join(choosers, how="left", rsuffix="_chooser") logger.info( f"{trace_label} end merging choosers and alternatives to create interaction_df" diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py index 12433056d..1c145ae6c 100644 --- a/activitysim/core/interaction_simulate.py +++ b/activitysim/core/interaction_simulate.py @@ -13,6 +13,7 @@ import pandas as pd from . 
import chunk, config, logit, simulate, tracing, workflow +from activitysim.core import util from .configuration.base import ComputeSettings logger = logging.getLogger(__name__) @@ -710,6 +711,27 @@ def _interaction_simulate( sharrow_enabled = state.settings.sharrow interaction_utilities = None + if compute_settings is None: + compute_settings = ComputeSettings() + + # drop variables before the interaction dataframe is created + + # check if tracing is enabled and if we have trace targets + # if not estimation mode, drop unused columns + if ( + (not have_trace_targets) + and (estimator is None) + and (compute_settings.drop_unused_columns) + ): + + choosers = util.drop_unused_columns( + choosers, + spec, + locals_d, + custom_chooser=None, + sharrow_enabled=sharrow_enabled, + ) + if ( sharrow_enabled and skims is None diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index 203c25668..d97b87a0d 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -1528,6 +1528,31 @@ def _simple_simulate( if skims is not None: set_skim_wrapper_targets(choosers, skims) + # check if tracing is enabled and if we have trace targets + have_trace_targets = state.tracing.has_trace_targets(choosers) + + sharrow_enabled = state.settings.sharrow + + if compute_settings is None: + compute_settings = ComputeSettings() + + # if tracing is not enabled, drop unused columns + # if not estimation mode, drop unused columns + if ( + (not have_trace_targets) + and (estimator is None) + and (compute_settings.drop_unused_columns) + ): + # drop unused variables in chooser table + choosers = util.drop_unused_columns( + choosers, + spec, + locals_d, + custom_chooser, + sharrow_enabled=sharrow_enabled, + additional_columns=compute_settings.protect_columns, + ) + if nest_spec is None: choices = eval_mnl( state, @@ -1949,6 +1974,23 @@ def _simple_simulate_logsums( if skims is not None: set_skim_wrapper_targets(choosers, skims) + # check if tracing is enabled and if 
def drop_unused_columns(
    choosers,
    spec,
    locals_d,
    custom_chooser,
    sharrow_enabled=False,
    additional_columns=None,
):
    """
    Drop unused columns from the chooser table, based on the spec and custom_chooser function.

    A column is kept when its name matches any identifier-like token found in
    the spec expressions or in the source code of `custom_chooser`, when it is
    listed in `additional_columns`, or when it is one of the origin /
    destination / trip related columns named through `locals_d`.  The token
    scan is deliberately over-inclusive (keywords and function names match as
    well); tokens are only ever compared against existing column names, so the
    extra matches are harmless.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table whose columns are filtered.  It is not modified.
    spec : pandas.DataFrame
        Must expose an "Expression" column or index level holding the
        expression strings.
    locals_d : dict or None
        Optional local variables of the model.  The keys read here are
        "orig_col_name", "dest_col_name", "timeframe", "ORIGIN",
        "DESTINATION", "PARKING", "od_skims", "dp_skims" and "dnt_skims".
    custom_chooser : callable or None
        Optional chooser function whose source text is scanned for column
        names.
    sharrow_enabled : bool, default False
        When true, "out_period", "in_period" and "purpose_index_num" are
        kept as well.
    additional_columns : iterable of str, optional
        Extra column names that must never be dropped.

    Returns
    -------
    pandas.DataFrame
        A frame holding only the needed columns, in their original order.
        When the source of `custom_chooser` cannot be retrieved, `choosers`
        is returned unchanged because the required columns are unknown.
    """
    import re

    # identifier-like tokens: a letter or underscore, then letters/digits/underscores
    identifier = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")

    needed = set(additional_columns or [])

    # Collect tokens expression by expression.  Summing the per-row lists is
    # quadratic and yields the scalar 0 for an empty spec, which cannot be
    # turned into a set.
    for expression in spec.reset_index()["Expression"]:
        needed.update(identifier.findall(expression))

    if locals_d:
        needed.add(locals_d.get("orig_col_name"))
        needed.add(locals_d.get("dest_col_name"))

        if locals_d.get("timeframe") == "trip":
            orig_col_name = locals_d.get("ORIGIN")
            dest_col_name = locals_d.get("DESTINATION")
            if "od_skims" in locals_d:
                # fall back to the keys of the origin-destination skim wrapper
                if orig_col_name is None:
                    orig_col_name = locals_d["od_skims"].orig_key
                if dest_col_name is None:
                    dest_col_name = locals_d["od_skims"].dest_key
            needed.add(orig_col_name)
            needed.add(dest_col_name)
            needed.add(locals_d.get("PARKING"))
            if "dnt_skims" in locals_d:
                # primary origin column
                needed.add(locals_d["dnt_skims"].dest_key)
            if "dp_skims" in locals_d:
                # stop column
                needed.add(locals_d["dp_skims"].dest_key)
            needed.add("trip_period")

        # when using trip_scheduling_choice for trip scheduling
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source -- confirm these two belong at this level rather than inside
        # the trip branch above (this placement keeps a superset of columns).
        needed.add("last_outbound_stop")
        needed.add("last_inbound_stop")

    # when sharrow mode, need to keep the following columns in the choosers table
    if sharrow_enabled:
        needed.update(("out_period", "in_period", "purpose_index_num"))

    if custom_chooser:
        import inspect

        try:
            chooser_source = inspect.getsource(custom_chooser)
        except (OSError, TypeError):
            # Built-ins, partials and callables without retrievable source
            # cannot be scanned; dropping is only an optimisation, so keep
            # every column instead of failing or dropping one that is needed.
            logger.warning(
                "Cannot inspect source of custom chooser %r; "
                "keeping all columns in chooser table",
                custom_chooser,
            )
            return choosers
        needed.update(identifier.findall(chooser_source))

    # missing optional keys contribute None, which is not a column name
    needed.discard(None)

    logger.info("Dropping unused variables in chooser table")

    logger.info(
        "before dropping, the choosers table has %d columns: %s",
        len(choosers.columns),
        choosers.columns,
    )

    # keep only variables needed for spec
    choosers = choosers[[c for c in choosers.columns if c in needed]]

    logger.info(
        "after dropping, the choosers table has %d columns: %s",
        len(choosers.columns),
        choosers.columns,
    )

    return choosers