From 98eeb534c2a20a8800909c336f03f7fe788fa263 Mon Sep 17 00:00:00 2001 From: Natalie Weires Date: Thu, 22 Feb 2024 16:42:27 +0000 Subject: [PATCH 1/4] Add function to log a summary of simulation statuses at the end of the job. --- buildstockbatch/cloud/docker_base.py | 54 ++++++++++++++++++++++++ buildstockbatch/test/test_docker_base.py | 26 ++++++++++++ 2 files changed, 80 insertions(+) diff --git a/buildstockbatch/cloud/docker_base.py b/buildstockbatch/cloud/docker_base.py index b5bb7f4c..c325d869 100644 --- a/buildstockbatch/cloud/docker_base.py +++ b/buildstockbatch/cloud/docker_base.py @@ -20,6 +20,7 @@ import logging import math import os +import pandas as pd import pathlib import random import shutil @@ -64,6 +65,9 @@ def __init__(self, project_filename): logger.error("The docker server did not respond, make sure Docker Desktop is started then retry.") raise RuntimeError("The docker server did not respond, make sure Docker Desktop is started then retry.") + def get_fs(self): + return LocalFileSystem() + @staticmethod def validate_project(project_file): super(DockerBatchBase, DockerBatchBase).validate_project(project_file) @@ -461,3 +465,53 @@ def run_simulations(cls, cfg, job_id, jobs_d, sim_dir, fs, output_path): shutil.rmtree(item) elif os.path.isfile(item): os.remove(item) + + def log_summary(self): + """ + Log a summary of how many simulations succeeded, failed, or ended with other statuses. + Uses the `completed_status` column of the files in results_csvs/results_*.csv.gz. + """ + fs = self.get_fs() + # Summary of simulation statuses across all upgrades + status_summary = {} + total_counts = collections.defaultdict(int) + + results_csv_dir = f"{self.results_dir}/results_csvs/" + try: + results_files = fs.ls(results_csv_dir) + except FileNotFoundError: + logger.info(f"No results CSV files found at {results_csv_dir}") + return + + for result in results_files: + upgrade_id = result.split(".")[0][-2:] + with fs.open(result) as f: + with gzip.open(f) as gf: + df = pd.read_csv(gf, usecols=["completed_status"]) + # Dict mapping from status (e.g. 
"Success") to count + statuses = df.groupby("completed_status").size().to_dict() + status_summary[upgrade_id] = statuses + for status, count in statuses.items(): + total_counts[status] += count + + # Always include these statuses and show them first + always_use = ["Success", "Fail"] + all_statuses = always_use + list(total_counts.keys() - set(always_use)) + s = "Final status of all simulations:" + for upgrade, counts in status_summary.items(): + if upgrade == "00": + s += "\nBaseline " + else: + s += f"\nUpgrade {upgrade} " + for status in all_statuses: + s += f"{status}: {counts.get(status, 0):<7d} " + + s += "\n\nTotal " + for status in all_statuses: + s += f"{status}: {total_counts.get(status, 0):<7d} " + s += "\n" + + for upgrade in postprocessing.get_upgrade_list(self.cfg): + if f"{upgrade:02d}" not in status_summary: + s += f"\nNo results found for Upgrade {upgrade}" + logger.info(s) diff --git a/buildstockbatch/test/test_docker_base.py b/buildstockbatch/test/test_docker_base.py index a6829da4..79575e95 100644 --- a/buildstockbatch/test/test_docker_base.py +++ b/buildstockbatch/test/test_docker_base.py @@ -2,6 +2,7 @@ from fsspec.implementations.local import LocalFileSystem import gzip import json +import logging import os import pathlib import shutil @@ -147,3 +148,28 @@ def test_run_simulations(basic_residential_project_file): # Check that files were cleaned up correctly assert not os.listdir(sim_dir) os.chdir(old_cwd) + + +def test_log_summary(basic_residential_project_file, mocker, caplog): + """ + Test logging a summary of simulation statuses. + """ + project_filename, results_dir = basic_residential_project_file() + + mocker.patch.object(DockerBatchBase, "results_dir", results_dir) + dbb = DockerBatchBase(project_filename) + # Add results CSV files + shutil.copytree( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "test_results", + "results_csvs", + ), + os.path.join(results_dir, "results_csvs"), + ) + + with caplog.at_level(logging.INFO): + dbb.log_summary() + assert "Upgrade 01 Success: 4 Fail: 0" in caplog.text + assert "Baseline Success: 4 Fail: 0" in caplog.text + assert "Total Success: 8 Fail: 0" in caplog.text From 795bb6fce346d402073abfa55bf78056fb8298be Mon Sep 17 00:00:00 2001 From: Natalie Weires Date: Thu, 22 Feb 2024 17:28:41 +0000 Subject: [PATCH 2/4] Update changelog --- docs/changelog/changelog_dev.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/changelog/changelog_dev.rst b/docs/changelog/changelog_dev.rst index bd619d14..d8e80212 100644 --- a/docs/changelog/changelog_dev.rst +++ b/docs/changelog/changelog_dev.rst @@ -35,3 +35,9 @@ Development Changelog :pullreq: 426 A bugfix for gracefully handling empty data_point_out.json files. + + .. change:: + :tags: general + :pullreq: 435 + + Add helper to log a summary of how many simulations succeeded and failed at the end of a job. 
From 574ce9518f6138a595088a39dccf78582ad8dca7 Mon Sep 17 00:00:00 2001 From: Natalie Weires Date: Thu, 22 Feb 2024 20:31:27 +0000 Subject: [PATCH 3/4] Read from parquet instead of csv --- buildstockbatch/cloud/docker_base.py | 11 +++++------ buildstockbatch/test/test_docker_base.py | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/buildstockbatch/cloud/docker_base.py b/buildstockbatch/cloud/docker_base.py index c325d869..ab4fb761 100644 --- a/buildstockbatch/cloud/docker_base.py +++ b/buildstockbatch/cloud/docker_base.py @@ -469,25 +469,24 @@ def run_simulations(cls, cfg, job_id, jobs_d, sim_dir, fs, output_path): def log_summary(self): """ Log a summary of how many simulations succeeded, failed, or ended with other statuses. - Uses the `completed_status` column of the files in results_csvs/results_*.csv.gz. + Uses the `completed_status` column of the files in results/parquet/.../results_up*.parquet. """ fs = self.get_fs() # Summary of simulation statuses across all upgrades status_summary = {} total_counts = collections.defaultdict(int) - results_csv_dir = f"{self.results_dir}/results_csvs/" + results_glob = f"{self.results_dir}/parquet/**/results_up*.parquet" try: - results_files = fs.ls(results_csv_dir) + results_files = fs.glob(results_glob) except FileNotFoundError: - logger.info(f"No results CSV files found at {results_csv_dir}") + logger.info(f"No results parquet files found at {results_glob}") return for result in results_files: upgrade_id = result.split(".")[0][-2:] with fs.open(result) as f: - with gzip.open(f) as gf: - df = pd.read_csv(gf, usecols=["completed_status"]) + df = pd.read_parquet(f, columns=["completed_status"]) # Dict mapping from status (e.g. "Success") to count statuses = df.groupby("completed_status").size().to_dict() status_summary[upgrade_id] = statuses diff --git a/buildstockbatch/test/test_docker_base.py b/buildstockbatch/test/test_docker_base.py index 79575e95..c0e10f1a 100644 --- a/buildstockbatch/test/test_docker_base.py +++ b/buildstockbatch/test/test_docker_base.py @@ -163,9 +163,9 @@ def test_log_summary(basic_residential_project_file, mocker, caplog): os.path.join( os.path.dirname(os.path.abspath(__file__)), "test_results", - "results_csvs", + "parquet", ), - os.path.join(results_dir, "results_csvs"), + os.path.join(results_dir, "parquet"), ) with caplog.at_level(logging.INFO): From bc679d59ef95cc8f4657e61f7e894eb985d63c17 Mon Sep 17 00:00:00 2001 From: Natalie Weires Date: Mon, 18 Mar 2024 14:03:37 +0000 Subject: [PATCH 4/4] Remove extra file --- buildstockbatch/sample_one_county.py | 215 --------------------------- 1 file changed, 215 deletions(-) delete mode 100644 buildstockbatch/sample_one_county.py diff --git a/buildstockbatch/sample_one_county.py b/buildstockbatch/sample_one_county.py deleted file mode 100644 index f2550fcd..00000000 --- a/buildstockbatch/sample_one_county.py +++ /dev/null @@ -1,215 +0,0 @@ -"""Runs the residental quota sampler for a single county+PUMA. - -Usage: - python3 sample_one_county.py --help - - python3 sample_one_county.py G1900030 G19001800 100,200 path/to/resstock path/to/output_dir - - - Generates two files where every building has county=G1900030 and PUMA=G19001800: - path/to/output_dir/buildstock_G1900030_G19001800_100.csv with 100 samples - path/to/output_dir/buildstock_G1900030_G19001800_200.csv with 200 samples - -Methodology: - This modifies the conditional probability distributions from the standard ResStock national project - to create a sample limited to a single county+PUMA. 
(For example, the selected location may normally - be used for 1% of buildings in a national sample, but we update it to get 100% of buildings while - every other location gets 0%.) - - To do this, we modify two files: - - ASHRAE IECC Climate Zone 2004.tsv - - Make 100% of the samples fall into the climate zone of the selected location. - - County and PUMA.tsv - - Make 100% of samples (within the chosen climate zone) fall into the selected county + PUMA - - All other housing characteristics are downstream of these (or don't depend on them) and are unchanged. - -Assumptions: - This logic is only guaranteed to work for the current ResStock national project. Other changes - to the dependencies between the variables can break it! - - In particular, this code assumes: - - ASHRAE climate zone has no dependencies - - County and PUMA depends only on the ASHRAE climate zone - - Each County+PUMA fall entirely in one climate zone -""" -import argparse -import csv -import os -import shutil -import tempfile - -from buildstockbatch.utils import ContainerRuntime -from sampler import residential_quota - - -class SampleOnly: - CONTAINER_RUNTIME = ContainerRuntime.DOCKER - - def __init__(self, buildstock_dir, output_dir): - # Sampler uses this to find the sampling scripts - self.buildstock_dir = os.path.abspath(buildstock_dir) - - # ResStock national project. Could use a different project, but `County and PUMA.tsv` and - # `ASHRAE IECC Climate Zone 2004.tsv` must exist in the expected format. - self.project_dir = os.path.join(self.buildstock_dir, "project_national") - - # Directory containing the conditional probability distributions we plan to modify - self.housing_characteristics_dir = os.path.join(self.project_dir, "housing_characteristics") - self.output_dir = output_dir - os.makedirs(output_dir, exist_ok=True) - - @property - def docker_image(self): - return "nrel/openstudio:{}".format(self.os_version) - - @property - def os_version(self): - return "3.7.0" - - @property - def project_filename(self): - """Sampler expects this property to exist, but it can be None.""" - return None - - def get_climate_zone(self, county, PUMA): - """Given a county and PUMA, find the climate zone that contains them. - - :param county: GISJOIN ID of county (e.g. "G1900030") - :param PUMA: GISJOIN ID of PUMA (e.g. "G19001800") - - :return: Climate zone string (e.g. "3A") - """ - with open(os.path.join(self.housing_characteristics_dir, "County and PUMA.tsv")) as f: - reader = csv.reader(f, delimiter="\t") - headers = next(reader) - # Index of the column with the county and PUMA we're looking for. - try: - location_col = headers.index(f"Option={county}, {PUMA}") - except ValueError as e: - raise ValueError(f"Could not find 'Option={county}, {PUMA}' column in 'County and PUMA.tsv'") from e - - zone = None - for row in reader: - # Skip comments - if row[0].strip()[0] == "#": - continue - - # Find the zone with a non-zero chance of producing this county + PUMA - if row[location_col] != "0": - if zone: - raise ValueError(f"Found multiple climate zones for {county}, {PUMA}") - zone = row[0] - - if not zone: - raise ValueError(f"No climate zone found for {county}, {PUMA}") - return zone - - def run_sampler(self, county, PUMA, n_samples): - """ - Create the requested number of buildings, all contained in the given county and PUMA. - - This function: - - Updates the conditional probability distributions for climate zone and county + PUMA. - - Runs the ResidentialQuotaSampler. 
- - Renames and copies the resulting building.csv file into the output directory. - - :param county: GISJOIN ID of county (e.g. "G1900030") - :param PUMA: GISJOIN ID of PUMA (e.g. "G19001800") - :param n_samples: Number of building samples to produce. - """ - - climate_zone = self.get_climate_zone(county, PUMA) - # Create a new copy of the probability distribution TSV files, so we can change them without - # affecting the originals. - with tempfile.TemporaryDirectory(prefix="sampling_", dir=self.buildstock_dir) as tmpdir: - temp_housing_characteristics_dir = os.path.join(tmpdir, "housing_characteristics") - shutil.copytree(self.housing_characteristics_dir, temp_housing_characteristics_dir) - - # Update climate zone TSV - climate_zone_filename = "ASHRAE IECC Climate Zone 2004.tsv" - zone_tsv = os.path.join(self.housing_characteristics_dir, climate_zone_filename) - new_zone_tsv = os.path.join(temp_housing_characteristics_dir, climate_zone_filename) - with open(zone_tsv) as old_f: - reader = csv.reader(old_f, delimiter="\t") - with open(new_zone_tsv, "w") as new_f: - writer = csv.writer(new_f, delimiter="\t") - headers = next(reader) - writer.writerow(headers) - - # This file has a single row of probabilities, which we replace with 0s and a single 1. - zone_header = f"Option={climate_zone}" - writer.writerow(["1" if header == zone_header else "0" for header in headers]) - - # Update county + PUMA TSV - county_filename = "County and PUMA.tsv" - county_tsv = os.path.join(self.housing_characteristics_dir, county_filename) - new_county_tsv = os.path.join(temp_housing_characteristics_dir, county_filename) - with open(county_tsv) as old_f: - reader = csv.reader(old_f, delimiter="\t") - with open(new_county_tsv, "w") as new_f: - writer = csv.writer(new_f, delimiter="\t") - headers = next(reader) - writer.writerow(headers) - - # First value in headers lists the climate zone dependency - - # just use the others, which list the County+PUMA options. - assert headers[0] == "Dependency=ASHRAE IECC Climate Zone 2004" - headers = headers[1:] - for row in reader: - # Skip comments - if row[0].strip()[0] == "#": - continue - - elif row[0] == climate_zone: - # Replace probabilities with 1 for our selected location and 0s everywhere else. - county_header = f"Option={county}, {PUMA}" - writer.writerow( - [row[0]] + ["1" if headers[i] == county_header else "0" for i, v in enumerate(row[1:])] - ) - - else: - # Leave other climate zones unchanged - they won't be used anyway. - writer.writerow(row) - - self.cfg = {"project_directory": os.path.basename(tmpdir)} - self.project_dir = tmpdir - - # Note: Must create sampler after all instances vars exist, because it makes a copy of this object. 
- sampler = residential_quota.ResidentialQuotaSampler(self, n_samples) - sampler.run_sampling() - - # Copy results from temp dir to output dir - shutil.copy( - os.path.join(temp_housing_characteristics_dir, "buildstock.csv"), - os.path.join(self.output_dir, f"buildstock_{county}_{PUMA}_{n_samples}.csv"), - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("county", help="County GISJOIN ID - https://www.nhgis.org/geographic-crosswalks#geog-ids") - parser.add_argument("PUMA", help="PUMA GISJOIN ID") - parser.add_argument("n_samples", help="Comma-separated list of samples sizes to generate") - parser.add_argument("buildstock_dir", help="Path to the ResStock directory (expected to contain project_national)") - parser.add_argument( - "output_dir", - default=".", - nargs="?", - help="Optional path where output should be written. Defaults to the current directory.", - ) - args = parser.parse_args() - - assert ( - len(args.county) == 8 and args.county[0] == "G" - ), "County should be 8 chars and start with G (e.g. 'G0100010')" - assert len(args.PUMA) == 9 and args.PUMA[0] == "G", "PUMA should be 9 chars and start with G (e.g. 'G01002100')" - - sample_sizes = [int(i) for i in args.n_samples.split(",")] - s = SampleOnly(args.buildstock_dir, args.output_dir) - for i in sample_sizes: - print(f"Creating {i} samples...") - s.run_sampler(args.county, args.PUMA, i) - - -if __name__ == "__main__": - main()
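To sanity-check the parquet path introduced in PATCH 3, here is a self-contained sketch (hypothetical data and file name mirroring the patch's results_up*.parquet pattern; pandas needs a parquet engine such as pyarrow installed). It writes a stand-in file and tallies `completed_status` the same way `log_summary()` does:

    import pandas as pd

    # Hypothetical stand-in for a real results_up01.parquet results file.
    pd.DataFrame({"completed_status": ["Success", "Success", "Fail", "Success"]}).to_parquet(
        "results_up01.parquet"
    )

    # Read only the column we need, then count rows per status, mirroring
    # the groupby("completed_status").size().to_dict() call in log_summary().
    df = pd.read_parquet("results_up01.parquet", columns=["completed_status"])
    statuses = df.groupby("completed_status").size().to_dict()
    print(statuses)  # {'Fail': 1, 'Success': 3}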