Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically drop unneeded columns in choosers table #833

Merged
merged 19 commits into from
Apr 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion activitysim/abm/models/joint_tour_participation.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
tracing,
workflow,
)
from activitysim.core.configuration.base import PreprocessorSettings
from activitysim.core.configuration.base import ComputeSettings, PreprocessorSettings
from activitysim.core.configuration.logit import LogitComponentSettings
from activitysim.core.util import assign_in_place, reindex

Expand Down Expand Up @@ -408,6 +408,15 @@ def joint_tour_participation(
)
candidates["chunk_id"] = reindex(household_chunk_ids, candidates.household_id)

# these hardcoded columns need to be protected from being dropped
assert model_settings is not None
if model_settings.compute_settings is None:
model_settings.compute_settings = ComputeSettings()
assert model_settings.compute_settings is not None
for i in ["person_is_preschool", "composition", "adult"]:
if i not in model_settings.compute_settings.protect_columns:
model_settings.compute_settings.protect_columns.append(i)

choices = simulate.simple_simulate_by_chunk_id(
state,
choosers=candidates,
Expand Down
22 changes: 16 additions & 6 deletions activitysim/abm/models/parking_location_choice.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from activitysim.core.configuration.logit import LogitComponentSettings
from activitysim.core.interaction_sample_simulate import interaction_sample_simulate
from activitysim.core.tracing import print_elapsed_time
from activitysim.core.util import assign_in_place
from activitysim.core.util import assign_in_place, drop_unused_columns

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -106,6 +106,7 @@ def parking_destination_simulate(
destination_sample,
model_settings: ParkingLocationSettings,
skims,
locals_dict,
chunk_size,
trace_hh_id,
trace_label,
Expand All @@ -132,11 +133,6 @@ def parking_destination_simulate(

logger.info("Running parking_destination_simulate with %d trips", len(trips))

locals_dict = config.get_model_constants(model_settings).copy()
locals_dict.update(skims)
locals_dict["timeframe"] = "trip"
locals_dict["PARKING"] = skims["op_skims"].dest_key

parking_locations = interaction_sample_simulate(
state,
choosers=trips,
Expand Down Expand Up @@ -181,6 +177,19 @@ def choose_parking_location(
t0 = print_elapsed_time()

alt_dest_col_name = model_settings.ALT_DEST_COL_NAME

# remove trips and alts columns that are not used in spec
locals_dict = config.get_model_constants(model_settings).copy()
locals_dict.update(skims)
locals_dict["timeframe"] = "trip"
locals_dict["PARKING"] = skims["op_skims"].dest_key

spec = get_spec_for_segment(state, model_settings, segment_name)
trips = drop_unused_columns(trips, spec, locals_dict, custom_chooser=None)
alternatives = drop_unused_columns(
alternatives, spec, locals_dict, custom_chooser=None
)

destination_sample = logit.interaction_dataset(
state, trips, alternatives, alt_index_id=alt_dest_col_name
)
Expand All @@ -194,6 +203,7 @@ def choose_parking_location(
destination_sample=destination_sample,
model_settings=model_settings,
skims=skims,
locals_dict=locals_dict,
chunk_size=chunk_size,
trace_hh_id=trace_hh_id,
trace_label=trace_label,
Expand Down
1 change: 1 addition & 0 deletions activitysim/abm/models/util/logsums.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ def compute_location_choice_logsums(
chunk_size=chunk_size,
chunk_tag=chunk_tag,
trace_label=trace_label,
compute_settings=logsum_settings.compute_settings,
)

return logsums
1 change: 1 addition & 0 deletions activitysim/abm/models/util/tour_destination.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,7 @@ def run_destination_simulate(
trace_choice_name="destination",
estimator=estimator,
skip_choice=skip_choice,
compute_settings=model_settings.compute_settings,
)

if not want_logsums:
Expand Down
1 change: 1 addition & 0 deletions activitysim/abm/models/util/tour_od.py
Original file line number Diff line number Diff line change
Expand Up @@ -1057,6 +1057,7 @@ def run_od_simulate(
trace_label=trace_label,
trace_choice_name="origin_destination",
estimator=estimator,
compute_settings=model_settings.compute_settings,
)

if not want_logsums:
Expand Down
11 changes: 11 additions & 0 deletions activitysim/core/configuration/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,16 @@ class ComputeSettings(PydanticBase):
for more information.
"""

drop_unused_columns: bool = True
"""Drop unused columns in the choosers df.

Set to True to drop columns that the component's spec does not use, or to
False to keep every column in the data table. Defaults to True.
"""

protect_columns: list[str] = []
"""Protect these columns from being dropped from the chooser table."""

def should_skip(self, subcomponent: str) -> bool:
"""Check if sharrow should be skipped for a particular subcomponent."""
if isinstance(self.sharrow_skip, dict):
Expand Down Expand Up @@ -232,6 +242,7 @@ def subcomponent_settings(self, subcomponent: str) -> ComputeSettings:
use_bottleneck=self.use_bottleneck,
use_numexpr=self.use_numexpr,
use_numba=self.use_numba,
drop_unused_columns=self.drop_unused_columns,
)


Expand Down
19 changes: 19 additions & 0 deletions activitysim/core/interaction_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
simulate,
tracing,
workflow,
util,
)
from activitysim.core.configuration.base import ComputeSettings
from activitysim.core.skim_dataset import DatasetWrapper
Expand Down Expand Up @@ -240,6 +241,24 @@ def _interaction_sample(

interaction_utilities = None
interaction_utilities_sh = None

if compute_settings is None:
compute_settings = ComputeSettings()

# drop variables before the interaction dataframe is created

# only drop unused columns when there are no trace targets and dropping is
# enabled in compute_settings (estimation mode is not checked here)
if (not have_trace_targets) and (compute_settings.drop_unused_columns):

choosers = util.drop_unused_columns(
choosers,
spec,
locals_d,
custom_chooser=None,
sharrow_enabled=sharrow_enabled,
)

if sharrow_enabled:
(
interaction_utilities,
Expand Down
26 changes: 25 additions & 1 deletion activitysim/core/interaction_sample_simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import numpy as np
import pandas as pd

from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow

from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow, util
from activitysim.core.configuration.base import ComputeSettings
from activitysim.core.simulate import set_skim_wrapper_targets

Expand Down Expand Up @@ -136,6 +137,29 @@ def _interaction_sample_simulate(
logger.info(
f"{trace_label} start merging choosers and alternatives to create interaction_df"
)

# drop variables before the interaction dataframe is created
sharrow_enabled = state.settings.sharrow

if compute_settings is None:
compute_settings = ComputeSettings()

# check if tracing is enabled and if we have trace targets
# if not estimation mode, drop unused columns
if (
(not have_trace_targets)
and (estimator is None)
and (compute_settings.drop_unused_columns)
):

choosers = util.drop_unused_columns(
choosers,
spec,
locals_d,
custom_chooser=None,
sharrow_enabled=sharrow_enabled,
)

interaction_df = alternatives.join(choosers, how="left", rsuffix="_chooser")
logger.info(
f"{trace_label} end merging choosers and alternatives to create interaction_df"
Expand Down
22 changes: 22 additions & 0 deletions activitysim/core/interaction_simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pandas as pd

from . import chunk, config, logit, simulate, tracing, workflow
from activitysim.core import util
from .configuration.base import ComputeSettings

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -710,6 +711,27 @@ def _interaction_simulate(
sharrow_enabled = state.settings.sharrow
interaction_utilities = None

if compute_settings is None:
compute_settings = ComputeSettings()

# drop variables before the interaction dataframe is created

# check if tracing is enabled and if we have trace targets
# if not estimation mode, drop unused columns
if (
(not have_trace_targets)
and (estimator is None)
and (compute_settings.drop_unused_columns)
):

choosers = util.drop_unused_columns(
choosers,
spec,
locals_d,
custom_chooser=None,
sharrow_enabled=sharrow_enabled,
)

if (
sharrow_enabled
and skims is None
Expand Down
42 changes: 42 additions & 0 deletions activitysim/core/simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1528,6 +1528,31 @@ def _simple_simulate(
if skims is not None:
set_skim_wrapper_targets(choosers, skims)

# check if tracing is enabled and if we have trace targets
have_trace_targets = state.tracing.has_trace_targets(choosers)

sharrow_enabled = state.settings.sharrow

if compute_settings is None:
compute_settings = ComputeSettings()

# drop unused columns only when there are no trace targets, we are not in
# estimation mode, and dropping is enabled in compute_settings
if (
(not have_trace_targets)
and (estimator is None)
and (compute_settings.drop_unused_columns)
):
# drop unused variables in chooser table
choosers = util.drop_unused_columns(
choosers,
spec,
locals_d,
custom_chooser,
sharrow_enabled=sharrow_enabled,
additional_columns=compute_settings.protect_columns,
)

if nest_spec is None:
choices = eval_mnl(
state,
Expand Down Expand Up @@ -1949,6 +1974,23 @@ def _simple_simulate_logsums(
if skims is not None:
set_skim_wrapper_targets(choosers, skims)

# check if tracing is enabled and if we have trace targets
have_trace_targets = state.tracing.has_trace_targets(choosers)

if compute_settings is None:
compute_settings = ComputeSettings()

# if tracing is not enabled, drop unused columns
if (not have_trace_targets) and (compute_settings.drop_unused_columns):
# drop unused variables in chooser table
choosers = util.drop_unused_columns(
choosers,
spec,
locals_d,
custom_chooser=None,
sharrow_enabled=state.settings.sharrow,
)

if nest_spec is None:
logsums = eval_mnl_logsums(
state,
Expand Down
82 changes: 82 additions & 0 deletions activitysim/core/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,3 +638,85 @@ def zarr_file_modification_time(zarr_dir: Path):
if t == 0:
raise FileNotFoundError(zarr_dir)
return t


def drop_unused_columns(
    choosers,
    spec,
    locals_d,
    custom_chooser,
    sharrow_enabled=False,
    additional_columns=None,
):
    """
    Drop unused columns from the chooser table, based on the spec and custom_chooser function.

    A column is kept when its name appears as an identifier in any spec
    expression, in the source code of ``custom_chooser``, in
    ``additional_columns``, or when it is one of the origin / destination /
    trip / sharrow helper columns collected below.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table whose columns are filtered.
    spec : pandas.DataFrame
        Model spec; must expose an ``Expression`` index level or column.
    locals_d : dict or None
        Constants and skim wrappers made available to the spec expressions.
        Column names are read from the ``orig_col_name``, ``dest_col_name``,
        ``ORIGIN``, ``DESTINATION`` and ``PARKING`` entries and, for
        ``timeframe == "trip"``, from the ``od_skims`` / ``dp_skims`` /
        ``dnt_skims`` wrappers.
    custom_chooser : callable or None
        Optional custom chooser function; every identifier found in its
        source code is treated as a potentially needed column.
    sharrow_enabled : bool, default False
        When True, also keep ``out_period``, ``in_period`` and
        ``purpose_index_num``.
    additional_columns : iterable of str, optional
        Extra column names that must never be dropped.

    Returns
    -------
    pandas.DataFrame
        ``choosers`` restricted to the columns that are kept, in their
        original order.  If the source of ``custom_chooser`` cannot be
        inspected, ``choosers`` is returned unchanged.
    """
    import inspect
    import logging
    import re

    # resolved locally so the function does not rely on a module-level name
    log = logging.getLogger(__name__)

    # anything that looks like a python identifier is a candidate column name
    identifier = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")

    keep = set(additional_columns or [])

    # Accumulate into a set directly: summing a Series of lists is quadratic
    # and yields the integer 0 (not a list) when the spec has no rows.
    # str() guards against non-string cells such as NaN from blank rows.
    for expression in spec.reset_index()["Expression"]:
        keep.update(identifier.findall(str(expression)))

    if locals_d:
        keep.add(locals_d.get("orig_col_name"))
        keep.add(locals_d.get("dest_col_name"))

        if locals_d.get("timeframe") == "trip":
            orig_col_name = locals_d.get("ORIGIN")
            dest_col_name = locals_d.get("DESTINATION")
            parking_col_name = locals_d.get("PARKING")
            stop_col_name = None
            primary_origin_col_name = None

            # fall back on the keys of the skim wrappers when the column
            # names are not given explicitly
            if orig_col_name is None and "od_skims" in locals_d:
                orig_col_name = locals_d["od_skims"].orig_key
            if dest_col_name is None and "od_skims" in locals_d:
                dest_col_name = locals_d["od_skims"].dest_key
            if "dp_skims" in locals_d:
                stop_col_name = locals_d["dp_skims"].dest_key
            if "dnt_skims" in locals_d:
                primary_origin_col_name = locals_d["dnt_skims"].dest_key

            keep.update(
                (
                    orig_col_name,
                    dest_col_name,
                    parking_col_name,
                    primary_origin_col_name,
                    stop_col_name,
                    "trip_period",
                )
            )

        # when using trip_scheduling_choice for trip scheduling
        keep.add("last_outbound_stop")
        keep.add("last_inbound_stop")

    # when sharrow mode, need to keep the following columns in the choosers table
    if sharrow_enabled:
        keep.update(("out_period", "in_period", "purpose_index_num"))

    if custom_chooser:
        try:
            custom_chooser_lines = inspect.getsource(custom_chooser)
        except (OSError, TypeError):
            # Without the source we cannot tell which columns the custom
            # chooser reads, so dropping anything would be unsafe.
            log.warning(
                "Cannot inspect source of custom_chooser %r; "
                "keeping all columns in chooser table",
                custom_chooser,
            )
            return choosers
        keep.update(identifier.findall(custom_chooser_lines))

    # lookups that found nothing contribute None, which is never a column name
    keep.discard(None)

    log.info("Dropping unused variables in chooser table")
    log.info(
        "before dropping, the choosers table has %d columns: %s",
        len(choosers.columns),
        choosers.columns,
    )

    # keep only variables needed for spec
    choosers = choosers[[c for c in choosers.columns if c in keep]]

    log.info(
        "after dropping, the choosers table has %d columns: %s",
        len(choosers.columns),
        choosers.columns,
    )

    return choosers
Loading