forked from NREL/buildstockbatch
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add script to sample a single location (#70)
- Loading branch information
Showing
1 changed file
with
215 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
"""Runs the residental quota sampler for a single county+PUMA. | ||
Usage: | ||
python3 sample_one_county.py --help | ||
python3 sample_one_county.py G1900030 G19001800 100,200 path/to/resstock path/to/output_dir | ||
- Generates two files where every building has county=G1900030 and PUMA=G19001800: | ||
path/to/output_dir/buildstock_G1900030_G19001800_100.csv with 100 samples | ||
path/to/output_dir/buildstock_G1900030_G19001800_200.csv with 200 samples | ||
Methodology: | ||
This modifies the conditional probability distributions from the standard ResStock national project | ||
to create a sample limited to a single county+PUMA. (For example, the selected location may normally | ||
be used for 1% of buildings in a national sample, but we update it to get 100% of buildings while | ||
every other location gets 0%.) | ||
To do this, we modify two files: | ||
- ASHRAE IECC Climate Zone 2004.tsv | ||
- Make 100% of the samples fall into the climate zone of the selected location. | ||
- County and PUMA.tsv | ||
- Make 100% of samples (within the chosen climate zone) fall into the selected county + PUMA | ||
All other housing characteristics are downstream of these (or don't depend on them) and are unchanged. | ||
Assumptions: | ||
This logic is only guaranteed to work for the current ResStock national project. Other changes | ||
to the dependencies between the variables can break it! | ||
In particular, this code assumes: | ||
- ASHRAE climate zone has no dependencies | ||
- County and PUMA depends only on the ASHRAE climate zone | ||
- Each County+PUMA fall entirely in one climate zone | ||
""" | ||
import argparse | ||
import csv | ||
import os | ||
import shutil | ||
import tempfile | ||
|
||
from buildstockbatch.utils import ContainerRuntime | ||
from sampler import residential_quota | ||
|
||
|
||
class SampleOnly: | ||
CONTAINER_RUNTIME = ContainerRuntime.DOCKER | ||
|
||
def __init__(self, buildstock_dir, output_dir): | ||
# Sampler uses this to find the sampling scripts | ||
self.buildstock_dir = os.path.abspath(buildstock_dir) | ||
|
||
# ResStock national project. Could use a different project, but `County and PUMA.tsv` and | ||
# `ASHRAE IECC Climate Zone 2004.tsv` must exist in the expected format. | ||
self.project_dir = os.path.join(self.buildstock_dir, "project_national") | ||
|
||
# Directory containing the conditional probability distributions we plan to modify | ||
self.housing_characteristics_dir = os.path.join(self.project_dir, "housing_characteristics") | ||
self.output_dir = output_dir | ||
os.makedirs(output_dir, exist_ok=True) | ||
|
||
@property | ||
def docker_image(self): | ||
return "nrel/openstudio:{}".format(self.os_version) | ||
|
||
@property | ||
def os_version(self): | ||
return "3.7.0" | ||
|
||
@property | ||
def project_filename(self): | ||
"""Sampler expects this property to exist, but it can be None.""" | ||
return None | ||
|
||
def get_climate_zone(self, county, PUMA): | ||
"""Given a county and PUMA, find the climate zone that contains them. | ||
:param county: GISJOIN ID of county (e.g. "G1900030") | ||
:param PUMA: GISJOIN ID of PUMA (e.g. "G19001800") | ||
:return: Climate zone string (e.g. "3A") | ||
""" | ||
with open(os.path.join(self.housing_characteristics_dir, "County and PUMA.tsv")) as f: | ||
reader = csv.reader(f, delimiter="\t") | ||
headers = next(reader) | ||
# Index of the column with the county and PUMA we're looking for. | ||
try: | ||
location_col = headers.index(f"Option={county}, {PUMA}") | ||
except ValueError as e: | ||
raise ValueError(f"Could not find 'Option={county}, {PUMA}' column in 'County and PUMA.tsv'") from e | ||
|
||
zone = None | ||
for row in reader: | ||
# Skip comments | ||
if row[0].strip()[0] == "#": | ||
continue | ||
|
||
# Find the zone with a non-zero chance of producing this county + PUMA | ||
if row[location_col] != "0": | ||
if zone: | ||
raise ValueError(f"Found multiple climate zones for {county}, {PUMA}") | ||
zone = row[0] | ||
|
||
if not zone: | ||
raise ValueError(f"No climate zone found for {county}, {PUMA}") | ||
return zone | ||
|
||
def run_sampler(self, county, PUMA, n_samples): | ||
""" | ||
Create the requested number of buildings, all contained in the given county and PUMA. | ||
This function: | ||
- Updates the conditional probability distributions for climate zone and county + PUMA. | ||
- Runs the ResidentialQuotaSampler. | ||
- Renames and copies the resulting building.csv file into the output directory. | ||
:param county: GISJOIN ID of county (e.g. "G1900030") | ||
:param PUMA: GISJOIN ID of PUMA (e.g. "G19001800") | ||
:param n_samples: Number of building samples to produce. | ||
""" | ||
|
||
climate_zone = self.get_climate_zone(county, PUMA) | ||
# Create a new copy of the probability distribution TSV files, so we can change them without | ||
# affecting the originals. | ||
with tempfile.TemporaryDirectory(prefix="sampling_", dir=self.buildstock_dir) as tmpdir: | ||
temp_housing_characteristics_dir = os.path.join(tmpdir, "housing_characteristics") | ||
shutil.copytree(self.housing_characteristics_dir, temp_housing_characteristics_dir) | ||
|
||
# Update climate zone TSV | ||
climate_zone_filename = "ASHRAE IECC Climate Zone 2004.tsv" | ||
zone_tsv = os.path.join(self.housing_characteristics_dir, climate_zone_filename) | ||
new_zone_tsv = os.path.join(temp_housing_characteristics_dir, climate_zone_filename) | ||
with open(zone_tsv) as old_f: | ||
reader = csv.reader(old_f, delimiter="\t") | ||
with open(new_zone_tsv, "w") as new_f: | ||
writer = csv.writer(new_f, delimiter="\t") | ||
headers = next(reader) | ||
writer.writerow(headers) | ||
|
||
# This file has a single row of probabilities, which we replace with 0s and a single 1. | ||
zone_header = f"Option={climate_zone}" | ||
writer.writerow(["1" if header == zone_header else "0" for header in headers]) | ||
|
||
# Update county + PUMA TSV | ||
county_filename = "County and PUMA.tsv" | ||
county_tsv = os.path.join(self.housing_characteristics_dir, county_filename) | ||
new_county_tsv = os.path.join(temp_housing_characteristics_dir, county_filename) | ||
with open(county_tsv) as old_f: | ||
reader = csv.reader(old_f, delimiter="\t") | ||
with open(new_county_tsv, "w") as new_f: | ||
writer = csv.writer(new_f, delimiter="\t") | ||
headers = next(reader) | ||
writer.writerow(headers) | ||
|
||
# First value in headers lists the climate zone dependency - | ||
# just use the others, which list the County+PUMA options. | ||
assert headers[0] == "Dependency=ASHRAE IECC Climate Zone 2004" | ||
headers = headers[1:] | ||
for row in reader: | ||
# Skip comments | ||
if row[0].strip()[0] == "#": | ||
continue | ||
|
||
elif row[0] == climate_zone: | ||
# Replace probabilities with 1 for our selected location and 0s everywhere else. | ||
county_header = f"Option={county}, {PUMA}" | ||
writer.writerow( | ||
[row[0]] + ["1" if headers[i] == county_header else "0" for i, v in enumerate(row[1:])] | ||
) | ||
|
||
else: | ||
# Leave other climate zones unchanged - they won't be used anyway. | ||
writer.writerow(row) | ||
|
||
self.cfg = {"project_directory": os.path.basename(tmpdir)} | ||
self.project_dir = tmpdir | ||
|
||
# Note: Must create sampler after all instances vars exist, because it makes a copy of this object. | ||
sampler = residential_quota.ResidentialQuotaSampler(self, n_samples) | ||
sampler.run_sampling() | ||
|
||
# Copy results from temp dir to output dir | ||
shutil.copy( | ||
os.path.join(temp_housing_characteristics_dir, "buildstock.csv"), | ||
os.path.join(self.output_dir, f"buildstock_{county}_{PUMA}_{n_samples}.csv"), | ||
) | ||
|
||
|
||
def main(): | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument("county", help="County GISJOIN ID - https://www.nhgis.org/geographic-crosswalks#geog-ids") | ||
parser.add_argument("PUMA", help="PUMA GISJOIN ID") | ||
parser.add_argument("n_samples", help="Comma-separated list of samples sizes to generate") | ||
parser.add_argument("buildstock_dir", help="Path to the ResStock directory (expected to contain project_national)") | ||
parser.add_argument( | ||
"output_dir", | ||
default=".", | ||
nargs="?", | ||
help="Optional path where output should be written. Defaults to the current directory.", | ||
) | ||
args = parser.parse_args() | ||
|
||
assert ( | ||
len(args.county) == 8 and args.county[0] == "G" | ||
), "County should be 8 chars and start with G (e.g. 'G0100010')" | ||
assert len(args.PUMA) == 9 and args.PUMA[0] == "G", "PUMA should be 9 chars and start with G (e.g. 'G01002100')" | ||
|
||
sample_sizes = [int(i) for i in args.n_samples.split(",")] | ||
s = SampleOnly(args.buildstock_dir, args.output_dir) | ||
for i in sample_sizes: | ||
print(f"Creating {i} samples...") | ||
s.run_sampler(args.county, args.PUMA, i) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |