Update to use pandas 2.x #838

Draft: wants to merge 18 commits into base: main

Changes from all commits
30 changes: 12 additions & 18 deletions .github/workflows/core_tests.yml

@@ -51,13 +51,12 @@ jobs:
       "psutil=5.9.5" \
       "pydantic=2.6.1" \
       "pypyr=5.8.0" \
-      "pytables=3.6.1" \
+      "pytables=3.9.2" \
       "pytest-cov" \
       "pytest-regressions=2.5.0" \
       "scikit-learn=1.2.2" \
-      "sharrow>=2.6.0" \
+      "sharrow>=2.7.0" \
       "simwrapper=1.8.5" \
-      "xarray=2023.2.0" \
       "zarr=2.14.2" \
       "zstandard=0.21.0"
     if: steps.cache.outputs.cache-hit != 'true'
@@ -151,13 +150,12 @@ jobs:
       "psutil=5.9.5" \
       "pydantic=2.6.1" \
       "pypyr=5.8.0" \
-      "pytables=3.6.1" \
+      "pytables=3.9.2" \
      "pytest-cov" \
       "pytest-regressions=2.5.0" \
       "scikit-learn=1.2.2" \
-      "sharrow>=2.6.0" \
+      "sharrow>=2.7.0" \
       "simwrapper=1.8.5" \
-      "xarray=2023.2.0" \
       "zarr=2.14.2" \
       "zstandard=0.21.0"
     if: steps.cache.outputs.cache-hit != 'true'
@@ -249,13 +247,12 @@ jobs:
       "psutil=5.9.5" \
       "pydantic=2.6.1" \
       "pypyr=5.8.0" \
-      "pytables=3.6.1" \
+      "pytables=3.9.2" \
       "pytest-cov" \
       "pytest-regressions=2.5.0" \
       "scikit-learn=1.2.2" \
-      "sharrow>=2.6.0" \
+      "sharrow>=2.7.0" \
       "simwrapper=1.8.5" \
-      "xarray=2023.2.0" \
       "zarr=2.14.2" \
       "zstandard=0.21.0"
     if: steps.cache.outputs.cache-hit != 'true'
@@ -346,13 +343,12 @@ jobs:
       "psutil=5.9.5" \
       "pydantic=2.6.1" \
       "pypyr=5.8.0" \
-      "pytables=3.6.1" \
+      "pytables=3.9.2" \
       "pytest-cov" \
       "pytest-regressions=2.5.0" \
       "scikit-learn=1.2.2" \
-      "sharrow>=2.6.0" \
+      "sharrow>=2.7.0" \
       "simwrapper=1.8.5" \
-      "xarray=2023.2.0" \
       "zarr=2.14.2" \
       "zstandard=0.21.0"
     if: steps.cache.outputs.cache-hit != 'true'
@@ -413,13 +409,12 @@ jobs:
       "psutil=5.9.5" \
       "pydantic=2.6.1" \
       "pypyr=5.8.0" \
-      "pytables=3.6.1" \
+      "pytables=3.9.2" \
       "pytest-cov" \
       "pytest-regressions=2.5.0" \
       "scikit-learn=1.2.2" \
-      "sharrow>=2.6.0" \
+      "sharrow>=2.7.0" \
       "simwrapper=1.8.5" \
-      "xarray=2023.2.0" \
       "zarr=2.14.2" \
       "zstandard=0.21.0"
     if: steps.cache.outputs.cache-hit != 'true'
@@ -479,13 +474,12 @@ jobs:
       "psutil=5.9.5" \
       "pydantic=2.6.1" \
       "pypyr=5.8.0" \
-      "pytables=3.6.1" \
+      "pytables=3.9.2" \
       "pytest-cov" \
       "pytest-regressions=2.5.0" \
       "scikit-learn=1.2.2" \
-      "sharrow>=2.6.0" \
+      "sharrow>=2.7.0" \
       "simwrapper=1.8.5" \
-      "xarray=2023.2.0" \
       "zarr=2.14.2" \
       "zstandard=0.21.0"
     if: steps.cache.outputs.cache-hit != 'true'
7 changes: 5 additions & 2 deletions activitysim/abm/models/disaggregate_accessibility.py

@@ -569,7 +569,7 @@ def expand_template_zones(self, tables):
         _expanded = pd.DataFrame(util.named_product(**index_params)).set_index("index")

         # Use result to join template onto expanded table of zones
-        ex_table = _expanded.join(master_template).reset_index()
+        ex_table = _expanded.join(master_template).sort_index().reset_index()

         # Concatenate a new unique set of ids
         cols = ["home_zone_id", "proto_household_id", "proto_person_id"]
@@ -642,7 +642,9 @@ def create_proto_pop(self):
             .set_index("index")
             .rename(columns={"hhid": hhid})
         )
-        persons = rep.join(persons).sort_values(hhid).reset_index(drop=True)
+        persons = (
+            rep.join(persons, sort=True).sort_values(hhid).reset_index(drop=True)
+        )
         persons[perid] = persons.index + 1

         # Assign persons to tours
@@ -718,6 +720,7 @@ def merge_persons(self):

         perid = self.params["proto_persons"]["index_col"]
         persons_merged.set_index(perid, inplace=True, drop=True)
+        persons_merged = persons_merged.sort_index()
         self.proto_pop["proto_persons_merged"] = persons_merged

         # Store in pipeline
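Background on these changes, as a minimal sketch (the frames below are made-up stand-ins for `_expanded` and `master_template`, not the model's real tables): `DataFrame.join` returns rows in the left frame's order and does not guarantee an index-sorted result, and the row order coming out of the join is what later becomes sequential proto-population IDs. Sorting explicitly keeps those IDs reproducible across pandas versions.

```python
import pandas as pd

# Hypothetical stand-ins for _expanded and master_template.
expanded = pd.DataFrame({"home_zone_id": [3, 1, 2]}, index=[30, 10, 20])
template = pd.DataFrame({"attr": ["c", "a", "b"]}, index=[30, 10, 20])

# join() preserves the left frame's (unsorted) order; without an explicit
# sort_index(), position-based IDs assigned afterwards could differ from a
# pandas-1.x run that happened to come back sorted.
ex_table = expanded.join(template).sort_index().reset_index()
ex_table["proto_household_id"] = ex_table.index + 1  # now deterministic
print(ex_table)
```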
60 changes: 34 additions & 26 deletions activitysim/abm/models/input_checker.py

@@ -300,34 +300,42 @@ def report_errors(state, input_checker_settings, v_warnings, v_errors):

             for warn in warns:
                 if "dataframe validator" in str(warn.message):
-                    file_logger.warning(
-                        "Failed dataframe validator: "
-                        + str(warn.message).split("\n")[-1]
-                    )
+                    try:
+                        file_logger.warning(
+                            "Failed dataframe validator: "
+                            + str(warn.message).split("\n")[-1]
+                        )
+                    except Exception:
+                        file_logger.warning(warn)
                 elif "element-wise validator" in str(warn.message):
-                    if "DataFrameSchema" in str(warn.message):
-                        file_logger.warning(
-                            "Failed element-wise validator: <"
-                            + str(warn.message).split("\n")[0].split(" ")[1]
-                            + table_name
-                            + ")>\n\t"
-                            + str(warn.message)
-                            .split("failure cases:\n")[0]
-                            .split("\n")[-2]
-                            + "\n\tfailure cases:\n\t"
-                            + "\n\t".join(
-                                str(warn.message)
-                                .split("failure cases:\n")[1]
-                                .split("\n")
-                            )
-                        )
-                    else:
-                        file_logger.warning(
-                            "Failed element-wise validator: <"
-                            + " ".join(str(warn.message).split("\n")[0].split(" ")[1:3])
-                            + "\n\t"
-                            + "\n\t".join(str(warn.message).split("\n")[1:])
-                        )
+                    try:
+                        if "DataFrameSchema" in str(warn.message):
+                            file_logger.warning(
+                                "Failed element-wise validator: <"
+                                + str(warn.message).split("\n")[0].split(" ")[1]
+                                + table_name
+                                + ")>\n\t"
+                                + str(warn.message)
+                                .split("failure cases:\n")[0]
+                                .split("\n")[-2]
+                                + "\n\tfailure cases:\n\t"
+                                + "\n\t".join(
+                                    str(warn.message)
+                                    .split("failure cases:\n")[1]
+                                    .split("\n")
+                                )
+                            )
+                        else:
+                            file_logger.warning(
+                                "Failed element-wise validator: <"
+                                + " ".join(
+                                    str(warn.message).split("\n")[0].split(" ")[1:3]
+                                )
+                                + "\n\t"
+                                + "\n\t".join(str(warn.message).split("\n")[1:])
+                            )
+                    except Exception:
+                        file_logger.warning(warn)
                 else:
                     file_logger.warning(warn)
                 file_logger.warning("\n")
2 changes: 1 addition & 1 deletion activitysim/abm/models/school_escorting.py

@@ -634,7 +634,7 @@ def school_escorting(
     state.add_table("tours", tours)
     state.get_rn_generator().drop_channel("tours")
     state.get_rn_generator().add_channel("tours", tours)
-    state.add_table("escort_bundles", escort_bundles)
+    state.add_table("escort_bundles", escort_bundles.reset_index(drop=True))
     # save school escorting tours and trips in pipeline so we can overwrite results from downstream models
     state.add_table("school_escort_tours", school_escort_tours)
     state.add_table("school_escort_trips", school_escort_trips)
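For context, a small sketch (with made-up data) of what `reset_index(drop=True)` buys here: `escort_bundles` can reach this point carrying a duplicated, non-sequential index left over from earlier concat/explode steps, and storing a clean `RangeIndex` in the pipeline avoids index-alignment surprises downstream.

```python
import pandas as pd

# made-up escort_bundles with a leftover duplicated index
escort_bundles = pd.DataFrame({"tour_id": [5, 5, 7]}, index=[2, 2, 9])
clean = escort_bundles.reset_index(drop=True)
print(clean.index)  # RangeIndex(start=0, stop=3, step=1)
```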
2 changes: 1 addition & 1 deletion activitysim/abm/models/trip_departure_choice.py

@@ -386,7 +386,7 @@ def choose_tour_leg_pattern(


 def apply_stage_two_model(state, omnibus_spec, trips, chunk_size, trace_label):
-    if not trips.index.is_monotonic:
+    if not trips.index.is_monotonic_increasing:
         trips = trips.sort_index()

     # Assign the duration of the appropriate leg to the trip
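This one is a straight API rename: `Index.is_monotonic` was deprecated in pandas 1.5 and removed in 2.0 in favor of the explicit spelling. A one-liner to confirm the replacement:

```python
import pandas as pd

idx = pd.Index([10, 20, 30])
# pandas < 2.0: idx.is_monotonic was an alias for is_monotonic_increasing;
# only the explicit spelling survives in 2.x.
assert idx.is_monotonic_increasing
```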
4 changes: 2 additions & 2 deletions activitysim/abm/models/util/school_escort_tours_trips.py

@@ -100,7 +100,7 @@ def create_chauf_escort_trips(bundles):
             "outbound",
             "purpose",
         ]
-    ).reset_index()
+    ).reset_index(drop=True)

     # numbering trips such that outbound escorting trips must come first and inbound trips must come last
     outbound_trip_num = -1 * (
@@ -240,7 +240,7 @@ def create_escortee_trips(bundles):
     # create a new trip for each escortee destination
     escortee_trips = escortee_trips.explode(
         ["destination", "escort_participants", "school_escort_trip_num", "purpose"]
-    ).reset_index(drop=True)

     # numbering trips such that outbound escorting trips must come first and inbound trips must come last
     # this comes in handy when merging trips to others in the tour decided downstream
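Why `drop=True` matters after `explode()`, sketched with toy data: explode repeats the parent row's index label for each list element, so a plain `reset_index()` would materialize that duplicated label as a stray `index` column that can collide with later merges.

```python
import pandas as pd

trips = pd.DataFrame(
    {
        "destination": [[101, 102], [201]],
        "purpose": [["school", "school"], ["escort"]],
    }
)
# explode() repeats the parent index label for each list element ...
exploded = trips.explode(["destination", "purpose"])
print(exploded.index.tolist())  # [0, 0, 1]
# ... so reset_index(drop=True) discards it instead of adding an
# "index" column to the trips table.
escortee_trips = exploded.reset_index(drop=True)
```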
3 changes: 3 additions & 0 deletions activitysim/abm/models/vehicle_allocation.py

@@ -236,6 +236,8 @@ def vehicle_allocation(
         logger.info("Running for occupancy = %d", occup)
         # setting occup for access in spec expressions
         locals_dict.update({"occup": occup})
+        if model_settings.sharrow_skip:
+            locals_dict["disable_sharrow"] = True

Review comment (Contributor):
My memory might be sloppy. Why possibly opting out sharrow for vehicle allocation?

         choices = simulate.simple_simulate(
             state,
@@ -258,6 +260,7 @@
             choices.loc[choices["alt_choice"] == alt, "choice"] = choosers.loc[
                 choices["alt_choice"] == alt, alt
             ]
+        choices["choice"] = choices["choice"].astype(veh_choice_dtype)
         choices.loc[
             choices["alt_choice"] == alts_from_spec[-1], "choice"
         ] = alts_from_spec[-1]
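On the added `astype(veh_choice_dtype)` line: newer pandas releases warn (and are slated to eventually raise) when a `.loc` assignment would silently change a column's dtype, such as writing vehicle-name strings into a column that starts out numeric. A minimal sketch of the failure mode, with `object` as a hypothetical stand-in for `veh_choice_dtype`:

```python
import pandas as pd

choices = pd.DataFrame({"alt_choice": [1, 2], "choice": [0.0, 0.0]})
# Writing strings into the float64 "choice" column via .loc triggers an
# incompatible-dtype FutureWarning under recent pandas 2.x. Casting the
# column first, as the PR does with veh_choice_dtype, sidesteps this:
choices["choice"] = choices["choice"].astype("object")  # hypothetical dtype
choices.loc[choices["alt_choice"] == 1, "choice"] = "car_1"
print(choices)
```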
17 changes: 10 additions & 7 deletions activitysim/cli/create.py

@@ -2,6 +2,7 @@

 import glob
 import hashlib
+import importlib.resources
 import logging
 import os
 import shutil
@@ -21,14 +22,15 @@

 def _example_path(resource):
     resource = os.path.join(EXAMPLES_DIR, resource)
-    path = pkg_resources.resource_filename(PACKAGE, resource)
-
-    return path
+    return importlib.resources.as_file(
+        importlib.resources.files(PACKAGE).joinpath(resource)
+    )


 def _load_manifest():
-    with open(_example_path(MANIFEST), "r") as f:
-        manifest = yaml.safe_load(f.read())
+    with _example_path(MANIFEST) as f_pth:
+        with open(f_pth, "r") as f:
+            manifest = yaml.safe_load(f.read())

     assert manifest, f"error: could not load {MANIFEST}"
     return {example["name"]: example for example in manifest}
@@ -177,8 +179,9 @@ def get_example(
         )

     else:
-        for asset_path in glob.glob(_example_path(assets)):
-            copy_asset(asset_path, target_path, dirs_exist_ok=True)
+        with _example_path(assets) as pth:
+            for asset_path in glob.glob(str(pth)):
+                copy_asset(asset_path, target_path, dirs_exist_ok=True)

     print(f"copied! new project files are in {os.path.abspath(dest_path)}")
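The pattern behind this migration: `pkg_resources.resource_filename` is deprecated along with the rest of `pkg_resources`, and the stdlib replacement is `importlib.resources.files()` plus `as_file()`. The latter yields a real filesystem path only for the duration of a `with` block, which is why context managers are now threaded through `_load_manifest` and `get_example`. A self-contained sketch, using the stdlib `json` package as a stand-in for the real `PACKAGE` constant:

```python
import importlib.resources

# "json" stands in for PACKAGE; "__init__.py" for a bundled resource.
resource = importlib.resources.files("json").joinpath("__init__.py")
with importlib.resources.as_file(resource) as path:
    # `path` is a concrete Path inside the block, even if the package
    # lived in a zip; it may be a temporary extraction that goes away
    # once the block exits, hence the context-manager discipline.
    print(path.name, path.exists())
```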
31 changes: 30 additions & 1 deletion activitysim/core/assign.py

@@ -96,7 +96,36 @@ def read_assignment_spec(
     """

     try:
-        cfg = pd.read_csv(file_name, comment="#")
+        # we use an explicit list of na_values, these are the values that
+        # Pandas version 1.5 recognized as NaN by default. Notably absent is
+        # 'None' which is used in some spec files to be the object `None` not
+        # the float value NaN.
+        cfg = pd.read_csv(
+            file_name,
+            comment="#",
+            na_values=[
+                "",
+                "#N/A",
+                "#N/A N/A",
+                "#NA",
+                "-1.#IND",
+                "-1.#QNAN",
+                "-NaN",
+                "-nan",
+                "1.#IND",
+                "1.#QNAN",
+                "<NA>",
+                "N/A",
+                "NA",
+                "NULL",
+                "NaN",
+                "n/a",
+                "nan",
+                "null",
+            ],
+            keep_default_na=False,
+        )
+
     except Exception as e:
         logger.error(f"Error reading spec file: {file_name}")
         logger.error(str(e))
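What the explicit `na_values` list accomplishes, in miniature: pandas' default NA token list includes the string `None`, but ActivitySim spec files use `None` to mean the Python object, so the PR pins the pandas 1.5 default list minus that one token. The sketch below uses a shortened token list for brevity:

```python
import io
import pandas as pd

csv = io.StringIO("target,expression\nfallback,None\nmissing,\n")

# Default behavior: the literal string "None" is read as NaN.
print(pd.read_csv(io.StringIO(csv.getvalue()))["expression"].tolist())
# -> [nan, nan]

# With an explicit token list (sans "None") and keep_default_na=False,
# the string survives while blanks still read as NaN:
df = pd.read_csv(csv, na_values=["", "NA", "NaN"], keep_default_na=False)
print(df["expression"].tolist())
# -> ['None', nan]
```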
10 changes: 9 additions & 1 deletion activitysim/core/los.py

@@ -780,7 +780,15 @@ def get_mazpairs(self, omaz, dmaz, attribute):
                 self.maz_ceiling
             ) + np.asanyarray(dmaz, dtype=np.int64)
         else:
-            i = np.asanyarray(omaz) * self.maz_ceiling + np.asanyarray(dmaz)
+            # if we have less than a 32-bit index, it will
+            # overflow so we need to upgrade to at least 32 bit
+            omaz_as_array = np.asanyarray(omaz)
+            if omaz_as_array.dtype not in (np.int32, np.int64):
+                omaz_as_array = omaz_as_array.astype(np.int32)
+            dmaz_as_array = np.asanyarray(dmaz)
+            if dmaz_as_array.dtype not in (np.int32, np.int64):
+                dmaz_as_array = dmaz_as_array.astype(np.int32)
+            i = omaz_as_array * self.maz_ceiling + dmaz_as_array
         s = util.quick_loc_df(i, self.maz_to_maz_df, attribute)

         # FIXME - no point in returning series?
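The overflow this guards against, demonstrated with toy MAZ IDs: when `omaz`/`dmaz` arrive as a narrow integer dtype, NumPy keeps that dtype through the arithmetic, and the flattened origin-destination index silently wraps around.

```python
import numpy as np

maz_ceiling = 1_000
omaz = np.array([200, 201], dtype=np.int16)  # e.g. loaded from compact data
dmaz = np.array([7, 8], dtype=np.int16)

# int16 arithmetic wraps: 200 * 1_000 = 200_000 does not fit in int16
print(omaz * maz_ceiling + dmaz)  # wrong, overflowed values

# upcasting first, as the fix does, yields the intended flat indexes
print(omaz.astype(np.int32) * maz_ceiling + dmaz.astype(np.int32))
# -> [200007 201008]
```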
4 changes: 2 additions & 2 deletions activitysim/core/test/_tools.py

@@ -165,8 +165,8 @@ def progressive_checkpoint_test(
     if ref_target.exists():
         try:
             state.checkpoint.check_against(ref_target, checkpoint_name=step_name)
-        except Exception:
-            print(f"> {name} {step_name}: ERROR")
+        except Exception as e:
+            print(f"> {name} {step_name}: ERROR {e}")
             raise
         else:
             print(f"> {name} {step_name}: ok")
2 changes: 1 addition & 1 deletion activitysim/core/util.py

@@ -289,7 +289,7 @@ def quick_loc_series(loc_list, target_series):

     left_on = "left"

-    if isinstance(loc_list, pd.Int64Index):
+    if isinstance(loc_list, pd.Index):
         left_df = pd.DataFrame({left_on: loc_list.values})
     elif isinstance(loc_list, pd.Series):
         left_df = loc_list.to_frame(name=left_on)
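Context for this one-liner: `pd.Int64Index` (along with `UInt64Index` and `Float64Index`) was removed in pandas 2.0; a plain `pd.Index` now carries the dtype, so the `isinstance` check widens accordingly.

```python
import pandas as pd

idx = pd.Index([1, 2, 3])
print(type(idx).__name__, idx.dtype)  # Index int64
assert isinstance(idx, pd.Index)      # works on both pandas 1.x and 2.x
# pd.Int64Index raises AttributeError on pandas >= 2.0
```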
13 changes: 12 additions & 1 deletion activitysim/core/workflow/state.py

@@ -714,7 +714,18 @@ def get_pyarrow(
         if t is None:
             raise KeyError(tablename)
         if isinstance(t, pd.DataFrame):
-            t = pa.Table.from_pandas(t, preserve_index=True, columns=columns)
+            df = t
+            try:
+                t = pa.Table.from_pandas(df, preserve_index=True, columns=columns)
+            except (pa.ArrowTypeError, pa.ArrowInvalid):
+                # if there are object columns, try to convert them to categories
+                df = df.copy()
+                for k, dtype in df.dtypes.items():
+                    if dtype.kind == "O":
+                        df[k] = df[k].astype("str")
+                    elif dtype == "boolean":
+                        df[k] = df[k].astype("str")
+                t = pa.Table.from_pandas(df, preserve_index=True, columns=columns)
         if isinstance(t, pa.Table):
             if columns is not None:
                 t = t.select(columns)

Review comment (Contributor), on the df.copy() line:
I saw your latest comment about significantly longer run time with this PR. I noticed you are calling copy() here. In pandas 2.0 copy() defaults to a deep copy. I wonder if this contributed to the run time?

Reply (Member Author):
I don't think this is causing the problem. This code only executes in the write_tables step at the end of the model run.
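The fallback in context, as a runnable sketch: `pa.Table.from_pandas` can raise `ArrowTypeError`/`ArrowInvalid` on object columns holding mixed Python types, and coercing those columns to strings lets the conversion proceed. The table below is made up; the logic mirrors the PR's code.

```python
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"mixed": ["a", 1, None]})  # object column, mixed types
try:
    table = pa.Table.from_pandas(df, preserve_index=True)
except (pa.ArrowTypeError, pa.ArrowInvalid):
    fixed = df.copy()
    for col, dtype in fixed.dtypes.items():
        if dtype.kind == "O":  # object columns become plain strings
            fixed[col] = fixed[col].astype("str")
    table = pa.Table.from_pandas(fixed, preserve_index=True)
print(table.schema)
```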