Skip to content

Commit

Permalink
Automatically drop unneeded columns in choosers table (#833)
Browse files Browse the repository at this point in the history
* auto drop vars in choosers table

* formatting

* vars hardcoded in the custom chooser functions

* accidentally committed outputs

* skip dropping when tracing

* skip dropping if estimation mode

* test dropping before interaction df

* black

* revert drop inplace

* cleaning dup codes to function

* drop unused columns for logsum calculation

* blacken

* add drop_unused_columns to compute_settings

* check setting before dropping columns

* rename method; cherry-pick parking location changes from #849

* protect additional columns not in spec

---------

Co-authored-by: Jeff Newman <jeff@driftless.xyz>
  • Loading branch information
i-am-sijia and jpn-- authored Apr 21, 2024
1 parent 8e1fbcd commit 239f415
Show file tree
Hide file tree
Showing 11 changed files with 230 additions and 8 deletions.
11 changes: 10 additions & 1 deletion activitysim/abm/models/joint_tour_participation.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
tracing,
workflow,
)
from activitysim.core.configuration.base import PreprocessorSettings
from activitysim.core.configuration.base import ComputeSettings, PreprocessorSettings
from activitysim.core.configuration.logit import LogitComponentSettings
from activitysim.core.util import assign_in_place, reindex

Expand Down Expand Up @@ -408,6 +408,15 @@ def joint_tour_participation(
)
candidates["chunk_id"] = reindex(household_chunk_ids, candidates.household_id)

# these hardcoded columns need to be protected from being dropped
assert model_settings is not None
if model_settings.compute_settings is None:
model_settings.compute_settings = ComputeSettings()
assert model_settings.compute_settings is not None
for i in ["person_is_preschool", "composition", "adult"]:
if i not in model_settings.compute_settings.protect_columns:
model_settings.compute_settings.protect_columns.append(i)

choices = simulate.simple_simulate_by_chunk_id(
state,
choosers=candidates,
Expand Down
22 changes: 16 additions & 6 deletions activitysim/abm/models/parking_location_choice.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from activitysim.core.configuration.logit import LogitComponentSettings
from activitysim.core.interaction_sample_simulate import interaction_sample_simulate
from activitysim.core.tracing import print_elapsed_time
from activitysim.core.util import assign_in_place
from activitysim.core.util import assign_in_place, drop_unused_columns

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -106,6 +106,7 @@ def parking_destination_simulate(
destination_sample,
model_settings: ParkingLocationSettings,
skims,
locals_dict,
chunk_size,
trace_hh_id,
trace_label,
Expand All @@ -132,11 +133,6 @@ def parking_destination_simulate(

logger.info("Running parking_destination_simulate with %d trips", len(trips))

locals_dict = config.get_model_constants(model_settings).copy()
locals_dict.update(skims)
locals_dict["timeframe"] = "trip"
locals_dict["PARKING"] = skims["op_skims"].dest_key

parking_locations = interaction_sample_simulate(
state,
choosers=trips,
Expand Down Expand Up @@ -181,6 +177,19 @@ def choose_parking_location(
t0 = print_elapsed_time()

alt_dest_col_name = model_settings.ALT_DEST_COL_NAME

# remove trips and alts columns that are not used in spec
locals_dict = config.get_model_constants(model_settings).copy()
locals_dict.update(skims)
locals_dict["timeframe"] = "trip"
locals_dict["PARKING"] = skims["op_skims"].dest_key

spec = get_spec_for_segment(state, model_settings, segment_name)
trips = drop_unused_columns(trips, spec, locals_dict, custom_chooser=None)
alternatives = drop_unused_columns(
alternatives, spec, locals_dict, custom_chooser=None
)

destination_sample = logit.interaction_dataset(
state, trips, alternatives, alt_index_id=alt_dest_col_name
)
Expand All @@ -194,6 +203,7 @@ def choose_parking_location(
destination_sample=destination_sample,
model_settings=model_settings,
skims=skims,
locals_dict=locals_dict,
chunk_size=chunk_size,
trace_hh_id=trace_hh_id,
trace_label=trace_label,
Expand Down
1 change: 1 addition & 0 deletions activitysim/abm/models/util/logsums.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ def compute_location_choice_logsums(
chunk_size=chunk_size,
chunk_tag=chunk_tag,
trace_label=trace_label,
compute_settings=logsum_settings.compute_settings,
)

return logsums
1 change: 1 addition & 0 deletions activitysim/abm/models/util/tour_destination.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,7 @@ def run_destination_simulate(
trace_choice_name="destination",
estimator=estimator,
skip_choice=skip_choice,
compute_settings=model_settings.compute_settings,
)

if not want_logsums:
Expand Down
1 change: 1 addition & 0 deletions activitysim/abm/models/util/tour_od.py
Original file line number Diff line number Diff line change
Expand Up @@ -1057,6 +1057,7 @@ def run_od_simulate(
trace_label=trace_label,
trace_choice_name="origin_destination",
estimator=estimator,
compute_settings=model_settings.compute_settings,
)

if not want_logsums:
Expand Down
11 changes: 11 additions & 0 deletions activitysim/core/configuration/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,16 @@ class ComputeSettings(PydanticBase):
for more information.
"""

drop_unused_columns: bool = True
"""Drop unused columns in the choosers df.
Set to True or False to drop unused columns in data table for specific component.
Default to True. If set to False, all columns in the data table will be kept.
"""

protect_columns: list[str] = []
"""Protect these columns from being dropped from the chooser table."""

def should_skip(self, subcomponent: str) -> bool:
"""Check if sharrow should be skipped for a particular subcomponent."""
if isinstance(self.sharrow_skip, dict):
Expand Down Expand Up @@ -232,6 +242,7 @@ def subcomponent_settings(self, subcomponent: str) -> ComputeSettings:
use_bottleneck=self.use_bottleneck,
use_numexpr=self.use_numexpr,
use_numba=self.use_numba,
drop_unused_columns=self.drop_unused_columns,
)


Expand Down
19 changes: 19 additions & 0 deletions activitysim/core/interaction_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
simulate,
tracing,
workflow,
util,
)
from activitysim.core.configuration.base import ComputeSettings
from activitysim.core.skim_dataset import DatasetWrapper
Expand Down Expand Up @@ -240,6 +241,24 @@ def _interaction_sample(

interaction_utilities = None
interaction_utilities_sh = None

if compute_settings is None:
compute_settings = ComputeSettings()

# drop variables before the interaction dataframe is created

# drop unused columns unless there are trace targets or the
# drop_unused_columns compute setting is turned off
if (not have_trace_targets) and (compute_settings.drop_unused_columns):

choosers = util.drop_unused_columns(
choosers,
spec,
locals_d,
custom_chooser=None,
sharrow_enabled=sharrow_enabled,
)

if sharrow_enabled:
(
interaction_utilities,
Expand Down
26 changes: 25 additions & 1 deletion activitysim/core/interaction_sample_simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import numpy as np
import pandas as pd

from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow

from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow, util
from activitysim.core.configuration.base import ComputeSettings
from activitysim.core.simulate import set_skim_wrapper_targets

Expand Down Expand Up @@ -136,6 +137,29 @@ def _interaction_sample_simulate(
logger.info(
f"{trace_label} start merging choosers and alternatives to create interaction_df"
)

# drop variables before the interaction dataframe is created
sharrow_enabled = state.settings.sharrow

if compute_settings is None:
compute_settings = ComputeSettings()

# check if tracing is enabled and if we have trace targets
# if not estimation mode, drop unused columns
if (
(not have_trace_targets)
and (estimator is None)
and (compute_settings.drop_unused_columns)
):

choosers = util.drop_unused_columns(
choosers,
spec,
locals_d,
custom_chooser=None,
sharrow_enabled=sharrow_enabled,
)

interaction_df = alternatives.join(choosers, how="left", rsuffix="_chooser")
logger.info(
f"{trace_label} end merging choosers and alternatives to create interaction_df"
Expand Down
22 changes: 22 additions & 0 deletions activitysim/core/interaction_simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pandas as pd

from . import chunk, config, logit, simulate, tracing, workflow
from activitysim.core import util
from .configuration.base import ComputeSettings

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -710,6 +711,27 @@ def _interaction_simulate(
sharrow_enabled = state.settings.sharrow
interaction_utilities = None

if compute_settings is None:
compute_settings = ComputeSettings()

# drop variables before the interaction dataframe is created

# check if tracing is enabled and if we have trace targets
# if not estimation mode, drop unused columns
if (
(not have_trace_targets)
and (estimator is None)
and (compute_settings.drop_unused_columns)
):

choosers = util.drop_unused_columns(
choosers,
spec,
locals_d,
custom_chooser=None,
sharrow_enabled=sharrow_enabled,
)

if (
sharrow_enabled
and skims is None
Expand Down
42 changes: 42 additions & 0 deletions activitysim/core/simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1528,6 +1528,31 @@ def _simple_simulate(
if skims is not None:
set_skim_wrapper_targets(choosers, skims)

# check if tracing is enabled and if we have trace targets
have_trace_targets = state.tracing.has_trace_targets(choosers)

sharrow_enabled = state.settings.sharrow

if compute_settings is None:
compute_settings = ComputeSettings()

# if tracing is not enabled, drop unused columns
# if not estimation mode, drop unused columns
if (
(not have_trace_targets)
and (estimator is None)
and (compute_settings.drop_unused_columns)
):
# drop unused variables in chooser table
choosers = util.drop_unused_columns(
choosers,
spec,
locals_d,
custom_chooser,
sharrow_enabled=sharrow_enabled,
additional_columns=compute_settings.protect_columns,
)

if nest_spec is None:
choices = eval_mnl(
state,
Expand Down Expand Up @@ -1949,6 +1974,23 @@ def _simple_simulate_logsums(
if skims is not None:
set_skim_wrapper_targets(choosers, skims)

# check if tracing is enabled and if we have trace targets
have_trace_targets = state.tracing.has_trace_targets(choosers)

if compute_settings is None:
compute_settings = ComputeSettings()

# if tracing is not enabled, drop unused columns
if (not have_trace_targets) and (compute_settings.drop_unused_columns):
# drop unused variables in chooser table
choosers = util.drop_unused_columns(
choosers,
spec,
locals_d,
custom_chooser=None,
sharrow_enabled=state.settings.sharrow,
)

if nest_spec is None:
logsums = eval_mnl_logsums(
state,
Expand Down
82 changes: 82 additions & 0 deletions activitysim/core/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,3 +638,85 @@ def zarr_file_modification_time(zarr_dir: Path):
if t == 0:
raise FileNotFoundError(zarr_dir)
return t


def drop_unused_columns(
    choosers,
    spec,
    locals_d,
    custom_chooser,
    sharrow_enabled=False,
    additional_columns=None,
):
    """
    Drop unused columns from the chooser table, based on the spec and custom_chooser function.

    A column is kept when its name matches any identifier-like token found in
    the spec's ``Expression`` values, any token found in the source code of
    ``custom_chooser``, any entry of ``additional_columns``, or one of the
    column names this function derives from ``locals_d`` (see below).

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table whose columns are to be filtered.
    spec : pandas.DataFrame
        Must expose an ``Expression`` column after ``reset_index()``.
    locals_d : dict or None
        When truthy, the values under ``orig_col_name`` and ``dest_col_name``
        are kept.  When ``locals_d["timeframe"] == "trip"``, the values under
        ``ORIGIN``, ``DESTINATION`` and ``PARKING`` are kept too, falling back
        to the ``orig_key`` / ``dest_key`` attributes of ``od_skims`` for a
        missing origin / destination; the ``dest_key`` of ``dp_skims`` and
        ``dnt_skims`` and a few hard-coded trip columns are also kept.
    custom_chooser : callable or None
        If given, every identifier appearing in its source text is kept.
    sharrow_enabled : bool, default False
        If True, ``out_period``, ``in_period`` and ``purpose_index_num`` are
        kept as well.
    additional_columns : iterable of str, optional
        Extra column names to protect from being dropped.

    Returns
    -------
    pandas.DataFrame
        ``choosers`` restricted to the retained columns, in their original
        order.  If the source of ``custom_chooser`` cannot be retrieved the
        input table is returned unchanged, because it is then impossible to
        know which columns the chooser needs.
    """
    import re

    # identifier-like tokens; this deliberately over-matches (keywords,
    # function names, ...) since extra names in the keep-set are harmless
    identifier = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")

    keep = set(additional_columns or [])

    # collect tokens expression by expression: unlike summing per-row lists,
    # this is linear in the spec size and also works when the spec is empty
    for expression in spec.reset_index()["Expression"]:
        keep.update(identifier.findall(str(expression)))

    if locals_d:
        keep.add(locals_d.get("orig_col_name"))
        keep.add(locals_d.get("dest_col_name"))

        if locals_d.get("timeframe") == "trip":
            od_skims = locals_d.get("od_skims")
            dp_skims = locals_d.get("dp_skims")
            dnt_skims = locals_d.get("dnt_skims")

            orig_col_name = locals_d.get("ORIGIN")
            if orig_col_name is None and "od_skims" in locals_d:
                orig_col_name = od_skims.orig_key

            dest_col_name = locals_d.get("DESTINATION")
            if dest_col_name is None and "od_skims" in locals_d:
                dest_col_name = od_skims.dest_key

            stop_col_name = dp_skims.dest_key if "dp_skims" in locals_d else None
            primary_origin_col_name = (
                dnt_skims.dest_key if "dnt_skims" in locals_d else None
            )

            keep.update(
                [
                    orig_col_name,
                    dest_col_name,
                    locals_d.get("PARKING"),
                    primary_origin_col_name,
                    stop_col_name,
                    "trip_period",
                    # when using trip_scheduling_choice for trip scheduling
                    "last_outbound_stop",
                    "last_inbound_stop",
                ]
            )

    # when sharrow mode, need to keep the following columns in the choosers table
    if sharrow_enabled:
        keep.update(["out_period", "in_period", "purpose_index_num"])

    if custom_chooser:
        import inspect

        try:
            custom_chooser_lines = inspect.getsource(custom_chooser)
        except (OSError, TypeError):
            # source is unavailable (e.g. a partial, a builtin, or dynamically
            # created code): dropping anything could break the chooser, so
            # skip this optimisation rather than abort the run
            logger.warning(
                "cannot inspect source of custom_chooser %r; keeping all columns",
                custom_chooser,
            )
            return choosers
        keep.update(identifier.findall(custom_chooser_lines))

    # lookups above that found nothing contributed None; it is not a column name
    keep.discard(None)

    logger.info("Dropping unused variables in chooser table")

    logger.info(
        "before dropping, the choosers table has %d columns: %s",
        len(choosers.columns),
        choosers.columns,
    )

    # keep only variables needed for spec
    choosers = choosers[[c for c in choosers.columns if c in keep]]

    logger.info(
        "after dropping, the choosers table has %d columns: %s",
        len(choosers.columns),
        choosers.columns,
    )

    return choosers

0 comments on commit 239f415

Please sign in to comment.