From 98eeb534c2a20a8800909c336f03f7fe788fa263 Mon Sep 17 00:00:00 2001 From: Natalie Weires Date: Thu, 22 Feb 2024 16:42:27 +0000 Subject: [PATCH 1/4] Add function to log a summary of simulation statuses at the end of the job. --- buildstockbatch/cloud/docker_base.py | 54 ++++++++++++++++++++++++ buildstockbatch/test/test_docker_base.py | 26 ++++++++++++ 2 files changed, 80 insertions(+) diff --git a/buildstockbatch/cloud/docker_base.py b/buildstockbatch/cloud/docker_base.py index b5bb7f4c..c325d869 100644 --- a/buildstockbatch/cloud/docker_base.py +++ b/buildstockbatch/cloud/docker_base.py @@ -20,6 +20,7 @@ import logging import math import os +import pandas as pd import pathlib import random import shutil @@ -64,6 +65,9 @@ def __init__(self, project_filename): logger.error("The docker server did not respond, make sure Docker Desktop is started then retry.") raise RuntimeError("The docker server did not respond, make sure Docker Desktop is started then retry.") + def get_fs(self): + return LocalFileSystem() + @staticmethod def validate_project(project_file): super(DockerBatchBase, DockerBatchBase).validate_project(project_file) @@ -461,3 +465,53 @@ def run_simulations(cls, cfg, job_id, jobs_d, sim_dir, fs, output_path): shutil.rmtree(item) elif os.path.isfile(item): os.remove(item) + + def log_summary(self): + """ + Log a summary of how many simulations succeeded, failed, or ended with other statuses. + Uses the `completed_status` column of the files in results_csvs/results_*.csv.gz. + """ + fs = self.get_fs() + # Summary of simulation statuses across all upgrades + status_summary = {} + total_counts = collections.defaultdict(int) + + results_csv_dir = f"{self.results_dir}/results_csvs/" + try: + results_files = fs.ls(results_csv_dir) + except FileNotFoundError: + logger.info(f"No results CSV files found at {results_csv_dir}") + return + + for result in results_files: + upgrade_id = result.split(".")[0][-2:] + with fs.open(result) as f: + with gzip.open(f) as gf: + df = pd.read_csv(gf, usecols=["completed_status"]) + # Dict mapping from status (e.g. 
"Success") to count + statuses = df.groupby("completed_status").size().to_dict() + status_summary[upgrade_id] = statuses + for status, count in statuses.items(): + total_counts[status] += count + + # Always include these statuses and show them first + always_use = ["Success", "Fail"] + all_statuses = always_use + list(total_counts.keys() - set(always_use)) + s = "Final status of all simulations:" + for upgrade, counts in status_summary.items(): + if upgrade == "00": + s += "\nBaseline " + else: + s += f"\nUpgrade {upgrade} " + for status in all_statuses: + s += f"{status}: {counts.get(status, 0):<7d} " + + s += "\n\nTotal " + for status in all_statuses: + s += f"{status}: {total_counts.get(status, 0):<7d} " + s += "\n" + + for upgrade in postprocessing.get_upgrade_list(self.cfg): + if f"{upgrade:02d}" not in status_summary: + s += f"\nNo results found for Upgrade {upgrade}" + logger.info(s) diff --git a/buildstockbatch/test/test_docker_base.py b/buildstockbatch/test/test_docker_base.py index a6829da4..79575e95 100644 --- a/buildstockbatch/test/test_docker_base.py +++ b/buildstockbatch/test/test_docker_base.py @@ -2,6 +2,7 @@ from fsspec.implementations.local import LocalFileSystem import gzip import json +import logging import os import pathlib import shutil @@ -147,3 +148,28 @@ def test_run_simulations(basic_residential_project_file): # Check that files were cleaned up correctly assert not os.listdir(sim_dir) os.chdir(old_cwd) + + +def test_log_summary(basic_residential_project_file, mocker, caplog): + """ + Test logging a summary of simulation statuses. + """ + project_filename, results_dir = basic_residential_project_file() + + mocker.patch.object(DockerBatchBase, "results_dir", results_dir) + dbb = DockerBatchBase(project_filename) + # Add results CSV files + shutil.copytree( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "test_results", + "results_csvs", + ), + os.path.join(results_dir, "results_csvs"), + ) + + with caplog.at_level(logging.INFO): + dbb.log_summary() + assert "Upgrade 01 Success: 4 Fail: 0" in caplog.text + assert "Baseline Success: 4 Fail: 0" in caplog.text + assert "Total Success: 8 Fail: 0" in caplog.text From 795bb6fce346d402073abfa55bf78056fb8298be Mon Sep 17 00:00:00 2001 From: Natalie Weires Date: Thu, 22 Feb 2024 17:28:41 +0000 Subject: [PATCH 2/4] Update changelog --- docs/changelog/changelog_dev.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/changelog/changelog_dev.rst b/docs/changelog/changelog_dev.rst index bd619d14..d8e80212 100644 --- a/docs/changelog/changelog_dev.rst +++ b/docs/changelog/changelog_dev.rst @@ -35,3 +35,9 @@ Development Changelog :pullreq: 426 A bugfix for gracefully handling empty data_point_out.json files. + + .. change:: + :tags: general + :pullreq: 435 + + Add helper to log a summary of how many simulations succeeded and failed at the end of a job. 
From 574ce9518f6138a595088a39dccf78582ad8dca7 Mon Sep 17 00:00:00 2001 From: Natalie Weires Date: Thu, 22 Feb 2024 20:31:27 +0000 Subject: [PATCH 3/4] Read from parquet instead of csv --- buildstockbatch/cloud/docker_base.py | 11 +++++------ buildstockbatch/test/test_docker_base.py | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/buildstockbatch/cloud/docker_base.py b/buildstockbatch/cloud/docker_base.py index c325d869..ab4fb761 100644 --- a/buildstockbatch/cloud/docker_base.py +++ b/buildstockbatch/cloud/docker_base.py @@ -469,25 +469,24 @@ def run_simulations(cls, cfg, job_id, jobs_d, sim_dir, fs, output_path): def log_summary(self): """ Log a summary of how many simulations succeeded, failed, or ended with other statuses. - Uses the `completed_status` column of the files in results_csvs/results_*.csv.gz. + Uses the `completed_status` column of the files in results/parquet/.../results_up*.parquet. """ fs = self.get_fs() # Summary of simulation statuses across all upgrades status_summary = {} total_counts = collections.defaultdict(int) - results_csv_dir = f"{self.results_dir}/results_csvs/" + results_glob = f"{self.results_dir}/parquet/**/results_up*.parquet" try: - results_files = fs.ls(results_csv_dir) + results_files = fs.glob(results_glob) except FileNotFoundError: - logger.info(f"No results CSV files found at {results_csv_dir}") + logger.info(f"No results parquet files found at {results_glob}") return for result in results_files: upgrade_id = result.split(".")[0][-2:] with fs.open(result) as f: - with gzip.open(f) as gf: - df = pd.read_csv(gf, usecols=["completed_status"]) + df = pd.read_parquet(f, columns=["completed_status"]) # Dict mapping from status (e.g. "Success") to count statuses = df.groupby("completed_status").size().to_dict() status_summary[upgrade_id] = statuses diff --git a/buildstockbatch/test/test_docker_base.py b/buildstockbatch/test/test_docker_base.py index 79575e95..c0e10f1a 100644 --- a/buildstockbatch/test/test_docker_base.py +++ b/buildstockbatch/test/test_docker_base.py @@ -163,9 +163,9 @@ def test_log_summary(basic_residential_project_file, mocker, caplog): os.path.join( os.path.dirname(os.path.abspath(__file__)), "test_results", - "results_csvs", + "parquet", ), - os.path.join(results_dir, "results_csvs"), + os.path.join(results_dir, "parquet"), ) with caplog.at_level(logging.INFO): From bc679d59ef95cc8f4657e61f7e894eb985d63c17 Mon Sep 17 00:00:00 2001 From: Natalie Weires Date: Mon, 18 Mar 2024 14:03:37 +0000 Subject: [PATCH 4/4] Remove extra file --- buildstockbatch/sample_one_county.py | 215 --------------------------- 1 file changed, 215 deletions(-) delete mode 100644 buildstockbatch/sample_one_county.py diff --git a/buildstockbatch/sample_one_county.py b/buildstockbatch/sample_one_county.py deleted file mode 100644 index f2550fcd..00000000 --- a/buildstockbatch/sample_one_county.py +++ /dev/null @@ -1,215 +0,0 @@ -"""Runs the residental quota sampler for a single county+PUMA. - -Usage: - python3 sample_one_county.py --help - - python3 sample_one_county.py G1900030 G19001800 100,200 path/to/resstock path/to/output_dir - - - Generates two files where every building has county=G1900030 and PUMA=G19001800: - path/to/output_dir/buildstock_G1900030_G19001800_100.csv with 100 samples - path/to/output_dir/buildstock_G1900030_G19001800_200.csv with 200 samples - -Methodology: - This modifies the conditional probability distributions from the standard ResStock national project - to create a sample limited to a single county+PUMA. 
(For example, the selected location may normally - be used for 1% of buildings in a national sample, but we update it to get 100% of buildings while - every other location gets 0%.) - - To do this, we modify two files: - - ASHRAE IECC Climate Zone 2004.tsv - - Make 100% of the samples fall into the climate zone of the selected location. - - County and PUMA.tsv - - Make 100% of samples (within the chosen climate zone) fall into the selected county + PUMA - - All other housing characteristics are downstream of these (or don't depend on them) and are unchanged. - -Assumptions: - This logic is only guaranteed to work for the current ResStock national project. Other changes - to the dependencies between the variables can break it! - - In particular, this code assumes: - - ASHRAE climate zone has no dependencies - - County and PUMA depends only on the ASHRAE climate zone - - Each County+PUMA fall entirely in one climate zone -""" -import argparse -import csv -import os -import shutil -import tempfile - -from buildstockbatch.utils import ContainerRuntime -from sampler import residential_quota - - -class SampleOnly: - CONTAINER_RUNTIME = ContainerRuntime.DOCKER - - def __init__(self, buildstock_dir, output_dir): - # Sampler uses this to find the sampling scripts - self.buildstock_dir = os.path.abspath(buildstock_dir) - - # ResStock national project. Could use a different project, but `County and PUMA.tsv` and - # `ASHRAE IECC Climate Zone 2004.tsv` must exist in the expected format. - self.project_dir = os.path.join(self.buildstock_dir, "project_national") - - # Directory containing the conditional probability distributions we plan to modify - self.housing_characteristics_dir = os.path.join(self.project_dir, "housing_characteristics") - self.output_dir = output_dir - os.makedirs(output_dir, exist_ok=True) - - @property - def docker_image(self): - return "nrel/openstudio:{}".format(self.os_version) - - @property - def os_version(self): - return "3.7.0" - - @property - def project_filename(self): - """Sampler expects this property to exist, but it can be None.""" - return None - - def get_climate_zone(self, county, PUMA): - """Given a county and PUMA, find the climate zone that contains them. - - :param county: GISJOIN ID of county (e.g. "G1900030") - :param PUMA: GISJOIN ID of PUMA (e.g. "G19001800") - - :return: Climate zone string (e.g. "3A") - """ - with open(os.path.join(self.housing_characteristics_dir, "County and PUMA.tsv")) as f: - reader = csv.reader(f, delimiter="\t") - headers = next(reader) - # Index of the column with the county and PUMA we're looking for. - try: - location_col = headers.index(f"Option={county}, {PUMA}") - except ValueError as e: - raise ValueError(f"Could not find 'Option={county}, {PUMA}' column in 'County and PUMA.tsv'") from e - - zone = None - for row in reader: - # Skip comments - if row[0].strip()[0] == "#": - continue - - # Find the zone with a non-zero chance of producing this county + PUMA - if row[location_col] != "0": - if zone: - raise ValueError(f"Found multiple climate zones for {county}, {PUMA}") - zone = row[0] - - if not zone: - raise ValueError(f"No climate zone found for {county}, {PUMA}") - return zone - - def run_sampler(self, county, PUMA, n_samples): - """ - Create the requested number of buildings, all contained in the given county and PUMA. - - This function: - - Updates the conditional probability distributions for climate zone and county + PUMA. - - Runs the ResidentialQuotaSampler. 
- - Renames and copies the resulting building.csv file into the output directory. - - :param county: GISJOIN ID of county (e.g. "G1900030") - :param PUMA: GISJOIN ID of PUMA (e.g. "G19001800") - :param n_samples: Number of building samples to produce. - """ - - climate_zone = self.get_climate_zone(county, PUMA) - # Create a new copy of the probability distribution TSV files, so we can change them without - # affecting the originals. - with tempfile.TemporaryDirectory(prefix="sampling_", dir=self.buildstock_dir) as tmpdir: - temp_housing_characteristics_dir = os.path.join(tmpdir, "housing_characteristics") - shutil.copytree(self.housing_characteristics_dir, temp_housing_characteristics_dir) - - # Update climate zone TSV - climate_zone_filename = "ASHRAE IECC Climate Zone 2004.tsv" - zone_tsv = os.path.join(self.housing_characteristics_dir, climate_zone_filename) - new_zone_tsv = os.path.join(temp_housing_characteristics_dir, climate_zone_filename) - with open(zone_tsv) as old_f: - reader = csv.reader(old_f, delimiter="\t") - with open(new_zone_tsv, "w") as new_f: - writer = csv.writer(new_f, delimiter="\t") - headers = next(reader) - writer.writerow(headers) - - # This file has a single row of probabilities, which we replace with 0s and a single 1. - zone_header = f"Option={climate_zone}" - writer.writerow(["1" if header == zone_header else "0" for header in headers]) - - # Update county + PUMA TSV - county_filename = "County and PUMA.tsv" - county_tsv = os.path.join(self.housing_characteristics_dir, county_filename) - new_county_tsv = os.path.join(temp_housing_characteristics_dir, county_filename) - with open(county_tsv) as old_f: - reader = csv.reader(old_f, delimiter="\t") - with open(new_county_tsv, "w") as new_f: - writer = csv.writer(new_f, delimiter="\t") - headers = next(reader) - writer.writerow(headers) - - # First value in headers lists the climate zone dependency - - # just use the others, which list the County+PUMA options. - assert headers[0] == "Dependency=ASHRAE IECC Climate Zone 2004" - headers = headers[1:] - for row in reader: - # Skip comments - if row[0].strip()[0] == "#": - continue - - elif row[0] == climate_zone: - # Replace probabilities with 1 for our selected location and 0s everywhere else. - county_header = f"Option={county}, {PUMA}" - writer.writerow( - [row[0]] + ["1" if headers[i] == county_header else "0" for i, v in enumerate(row[1:])] - ) - - else: - # Leave other climate zones unchanged - they won't be used anyway. - writer.writerow(row) - - self.cfg = {"project_directory": os.path.basename(tmpdir)} - self.project_dir = tmpdir - - # Note: Must create sampler after all instances vars exist, because it makes a copy of this object. 
- sampler = residential_quota.ResidentialQuotaSampler(self, n_samples) - sampler.run_sampling() - - # Copy results from temp dir to output dir - shutil.copy( - os.path.join(temp_housing_characteristics_dir, "buildstock.csv"), - os.path.join(self.output_dir, f"buildstock_{county}_{PUMA}_{n_samples}.csv"), - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("county", help="County GISJOIN ID - https://www.nhgis.org/geographic-crosswalks#geog-ids") - parser.add_argument("PUMA", help="PUMA GISJOIN ID") - parser.add_argument("n_samples", help="Comma-separated list of samples sizes to generate") - parser.add_argument("buildstock_dir", help="Path to the ResStock directory (expected to contain project_national)") - parser.add_argument( - "output_dir", - default=".", - nargs="?", - help="Optional path where output should be written. Defaults to the current directory.", - ) - args = parser.parse_args() - - assert ( - len(args.county) == 8 and args.county[0] == "G" - ), "County should be 8 chars and start with G (e.g. 'G0100010')" - assert len(args.PUMA) == 9 and args.PUMA[0] == "G", "PUMA should be 9 chars and start with G (e.g. 'G01002100')" - - sample_sizes = [int(i) for i in args.n_samples.split(",")] - s = SampleOnly(args.buildstock_dir, args.output_dir) - for i in sample_sizes: - print(f"Creating {i} samples...") - s.run_sampler(args.county, args.PUMA, i) - - -if __name__ == "__main__": - main()
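To sanity-check the parquet path introduced in PATCH 3, here is a self-contained sketch (hypothetical data and file name mirroring the patch's results_up*.parquet pattern; pandas needs a parquet engine such as pyarrow installed). It writes a stand-in file and tallies `completed_status` the same way `log_summary()` does:

    import pandas as pd

    # Hypothetical stand-in for a real results_up01.parquet results file.
    pd.DataFrame({"completed_status": ["Success", "Success", "Fail", "Success"]}).to_parquet(
        "results_up01.parquet"
    )

    # Read only the column we need, then count rows per status, mirroring
    # the groupby("completed_status").size().to_dict() call in log_summary().
    df = pd.read_parquet("results_up01.parquet", columns=["completed_status"])
    statuses = df.groupby("completed_status").size().to_dict()
    print(statuses)  # {'Fail': 1, 'Success': 3}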