From 92fa2ac71137e30629dc443157bdc1da66c086d8 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Fri, 18 Nov 2022 22:47:09 +0000 Subject: [PATCH 01/53] adding user defined tags to aws objects --- buildstockbatch/aws/aws.py | 90 +++++++++++-------------------- buildstockbatch/schemas/v0.3.yaml | 1 + 2 files changed, 33 insertions(+), 58 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index fe1999bf..4e90daa3 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -132,6 +132,11 @@ def __repr__(self): return super().__repr__() + def get_tags(self, **kwargs): + tags = kwargs.copy() + tags.update(self.aws_config.get('tags', {})) + return [{'Key': k, 'Value': v} for k, v in tags.items()] + def create_emr_lambda_roles(self): """ Create supporting IAM roles for Lambda support. @@ -224,12 +229,7 @@ def create_vpc(self): Resources=[ self.vpc_id ], - Tags=[ - { - 'Key': 'Name', - 'Value': self.job_identifier - } - ] + Tags=self.get_tags(Name=self.job_identifier) ) break except Exception as e: @@ -300,24 +300,14 @@ def create_vpc(self): Resources=[ self.priv_vpc_subnet_id_1 ], - Tags=[ - { - 'Key': 'Name', - 'Value': self.job_identifier - } - ] + Tags=self.get_tags(Name=self.job_identifier) ) self.ec2.create_tags( Resources=[ self.priv_vpc_subnet_id_2 ], - Tags=[ - { - 'Key': 'Name', - 'Value': self.job_identifier - } - ] + Tags=self.get_tags(Name=self.job_identifier) ) ig_response = self.ec2.create_internet_gateway() @@ -328,12 +318,7 @@ def create_vpc(self): Resources=[ self.internet_gateway_id ], - Tags=[ - { - 'Key': 'Name', - 'Value': self.job_identifier - } - ] + Tags=self.get_tags(Name=self.job_identifier) ) logger.info(f'Internet gateway {self.internet_gateway_id} created.') @@ -353,12 +338,7 @@ def create_vpc(self): Resources=[ self.pub_vpc_subnet_id ], - Tags=[ - { - 'Key': 'Name', - 'Value': self.job_identifier - } - ] + Tags=self.get_tags(Name=self.job_identifier) ) # Create and elastic IP for the NAT Gateway @@ -377,12 +357,7 @@ def create_vpc(self): Resources=[ self.nat_ip_allocation ], - Tags=[ - { - 'Key': 'Name', - 'Value': self.job_identifier - } - ] + Tags=self.get_tags(Name=self.job_identifier) ) except Exception as e: @@ -457,12 +432,7 @@ def create_vpc(self): Resources=[ self.priv_route_table_id ], - Tags=[ - { - 'Key': 'Name', - 'Value': self.job_identifier - } - ] + Tags=self.get_tags(Name=self.job_identifier) ) # Associate the private route to the private subnet @@ -687,12 +657,18 @@ def create_compute_environment(self, maxCPUs=10000): else: compute_resources['type'] = 'EC2' + compute_resources['tags'] = self.aws_config.get('tags', {}).copy() + compute_resources['tags']['Name'] = f"{self.job_identifier} batch instance" + compute_env_tags = compute_resources['tags'].copy() + del compute_env_tags['Name'] + self.batch.create_compute_environment( computeEnvironmentName=self.batch_compute_environment_name, type='MANAGED', state='ENABLED', computeResources=compute_resources, - serviceRole=self.service_role_arn + serviceRole=self.service_role_arn, + tags=compute_env_tags ) logger.info(f'Compute environment {self.batch_compute_environment_name} created.') @@ -720,7 +696,8 @@ def create_job_queue(self): 'order': 1, 'computeEnvironment': self.batch_compute_environment_name }, - ] + ], + tags=self.aws_config.get('tags', {}).copy() ) # print("JOB QUEUE") @@ -743,7 +720,7 @@ def create_job_queue(self): elif 'is not valid' in str(e): # Need to wait a second for the compute environment to complete registration logger.warning( - '5 second 
sleep initiated to wait for compute environment creation due to error: ' + str(e))
+                    'waiting a few seconds for compute environment creation: ' + str(e))
                 time.sleep(5)
 
             else:
@@ -774,7 +751,8 @@ def create_job_definition(self, docker_image, vcpus, memory, command, env_vars):
             },
             retryStrategy={
                 'attempts': 2
-            }
+            },
+            tags=self.aws_config.get('tags', {}).copy()
         )
 
         self.job_definition_arn = response['jobDefinitionArn']
@@ -792,7 +770,8 @@ def submit_job(self, array_size=4):
             arrayProperties={
                 'size': array_size
             },
-            jobDefinition=self.job_definition_arn
+            jobDefinition=self.job_definition_arn,
+            tags=self.aws_config.get('tags', {}).copy()
         )
 
         logger.info(f"Job {self.job_identifier} submitted.")
@@ -960,7 +939,8 @@ def create_state_machine(self):
             response = self.step_functions.create_state_machine(
                 name=self.state_machine_name,
                 definition=job_definition,
-                roleArn=self.state_machine_role_arn
+                roleArn=self.state_machine_role_arn,
+                tags=self.get_tags()
             )
 
             # print(response)
@@ -1550,12 +1530,7 @@ def create_emr_cluster_function(self):
             VisibleToAllUsers=True,
             JobFlowRole=self.emr_instance_profile_name,
             ServiceRole=self.emr_service_role_name,
-            Tags=[
-                {
-                    'Key': 'org',
-                    'Value': 'ops'
-                },
-            ],
+            Tags=self.get_tags(),
             AutoScalingRole='EMR_AutoScaling_DefaultRole',
             ScaleDownBehavior='TERMINATE_AT_TASK_COMPLETION',
             EbsRootVolumeSize=100
@@ -1600,9 +1575,7 @@ def create_emr_cluster_function(self):
                     'EMR_CONFIG_JSON_KEY': self.s3_lambda_emr_config_key
                 }
             },
-            Tags={
-                'job': self.job_identifier
-            }
+            Tags=self.get_tags(job=self.job_identifier)
         )
 
         logger.info(f"Lambda function {self.lambda_emr_job_step_function_name} created.")
@@ -1632,7 +1605,8 @@ def __init__(self, job_name, aws_config, boto3_session):
 
     def create_topic(self):
         response = self.sns.create_topic(
-            Name=self.sns_state_machine_topic
+            Name=self.sns_state_machine_topic,
+            Tags=self.get_tags()
         )
 
         logger.info(f"Simple notifications topic {self.sns_state_machine_topic} created.")
diff --git a/buildstockbatch/schemas/v0.3.yaml b/buildstockbatch/schemas/v0.3.yaml
index 726248e8..df80debc 100644
--- a/buildstockbatch/schemas/v0.3.yaml
+++ b/buildstockbatch/schemas/v0.3.yaml
@@ -25,6 +25,7 @@ aws-spec:
   notifications_email: regex('^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$', name='email', required=True)
   emr: include('aws-emr-spec', required=False)
   job_environment: include('aws-job-environment', required=False)
+  tags: map(str(), str(), required=False)
 
 aws-job-environment:
   vcpus: int(min=1, max=36, required=False)

From 3840e2f4db0f90ace4ad6f305f1281c2e19a8431 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Mon, 21 Nov 2022 19:07:47 +0000
Subject: [PATCH 02/53] update and simplify dockerfile

---
 Dockerfile | 25 ++++---------------------
 setup.py   |  1 -
 2 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index e7938c65..9c8b1c80 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,24 +1,7 @@
-FROM nrel/openstudio:2.9.1
+ARG OS_VER=3.5.0
+FROM nrel/openstudio:$OS_VER
 
-ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
-
-RUN sudo apt update && \
-    sudo apt install -y wget build-essential checkinstall libreadline-gplv2-dev libncursesw5-dev libssl-dev \
-    libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev libffi-dev zlib1g-dev
-
-RUN wget https://www.python.org/ftp/python/3.8.8/Python-3.8.8.tgz && \
-    tar xzf Python-3.8.8.tgz && \
-    cd Python-3.8.8 && \
-    ./configure --enable-optimizations && \
-    make altinstall && \
-    rm -rf Python-3.8.8 && \
-    rm -rf Python-3.8.8.tgz
-
-RUN sudo apt install -y -V ca-certificates lsb-release && \
-    wget 
https://apache.bintray.com/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-archive-keyring-latest-$(lsb_release --codename --short).deb && \ - sudo apt install -y -V ./apache-arrow-archive-keyring-latest-$(lsb_release --codename --short).deb && \ - sudo apt update && \ - sudo apt install -y -V libarrow-dev libarrow-glib-dev libarrow-dataset-dev libparquet-dev libparquet-glib-dev +RUN sudo apt update && sudo apt install -y python3-pip COPY . /buildstock-batch/ -RUN python3.8 -m pip install /buildstock-batch +RUN python3 -m pip install /buildstock-batch diff --git a/setup.py b/setup.py index 361eba79..7acc59f7 100644 --- a/setup.py +++ b/setup.py @@ -101,7 +101,6 @@ def run_tests(self): 'License :: OSI Approved :: BSD License', 'Natural Language :: English', 'Programming Language :: Python', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', From c4fe7eb590b0dd95b66ec981c6351dfb038e9a4a Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Mon, 21 Nov 2022 22:03:00 +0000 Subject: [PATCH 03/53] fixing some tagging things --- buildstockbatch/aws/aws.py | 38 ++++++++++++++-------------------- buildstockbatch/aws/awsbase.py | 13 ++++++++++++ 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 4e90daa3..9321fc22 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -132,11 +132,6 @@ def __repr__(self): return super().__repr__() - def get_tags(self, **kwargs): - tags = kwargs.copy() - tags.update(self.aws_config.get('tags', {})) - return [{'Key': k, 'Value': v} for k, v in tags.items()] - def create_emr_lambda_roles(self): """ Create supporting IAM roles for Lambda support. 
@@ -229,7 +224,7 @@ def create_vpc(self): Resources=[ self.vpc_id ], - Tags=self.get_tags(Name=self.job_identifier) + Tags=self.get_tags_uppercase(Name=self.job_identifier) ) break except Exception as e: @@ -300,14 +295,14 @@ def create_vpc(self): Resources=[ self.priv_vpc_subnet_id_1 ], - Tags=self.get_tags(Name=self.job_identifier) + Tags=self.get_tags_uppercase(Name=self.job_identifier) ) self.ec2.create_tags( Resources=[ self.priv_vpc_subnet_id_2 ], - Tags=self.get_tags(Name=self.job_identifier) + Tags=self.get_tags_uppercase(Name=self.job_identifier) ) ig_response = self.ec2.create_internet_gateway() @@ -318,7 +313,7 @@ def create_vpc(self): Resources=[ self.internet_gateway_id ], - Tags=self.get_tags(Name=self.job_identifier) + Tags=self.get_tags_uppercase(Name=self.job_identifier) ) logger.info(f'Internet gateway {self.internet_gateway_id} created.') @@ -338,7 +333,7 @@ def create_vpc(self): Resources=[ self.pub_vpc_subnet_id ], - Tags=self.get_tags(Name=self.job_identifier) + Tags=self.get_tags_uppercase(Name=self.job_identifier) ) # Create and elastic IP for the NAT Gateway @@ -357,7 +352,7 @@ def create_vpc(self): Resources=[ self.nat_ip_allocation ], - Tags=self.get_tags(Name=self.job_identifier) + Tags=self.get_tags_uppercase(Name=self.job_identifier) ) except Exception as e: @@ -432,7 +427,7 @@ def create_vpc(self): Resources=[ self.priv_route_table_id ], - Tags=self.get_tags(Name=self.job_identifier) + Tags=self.get_tags_uppercase(Name=self.job_identifier) ) # Associate the private route to the private subnet @@ -657,10 +652,7 @@ def create_compute_environment(self, maxCPUs=10000): else: compute_resources['type'] = 'EC2' - compute_resources['tags'] = self.aws_config.get('tags', {}).copy() - compute_resources['tags']['Name'] = f"{self.job_identifier} batch instance" - compute_env_tags = compute_resources['tags'].copy() - del compute_env_tags['Name'] + compute_resources['tags'] = self.get_tags(Name=f"{self.job_identifier} batch instance") self.batch.create_compute_environment( computeEnvironmentName=self.batch_compute_environment_name, @@ -668,7 +660,7 @@ def create_compute_environment(self, maxCPUs=10000): state='ENABLED', computeResources=compute_resources, serviceRole=self.service_role_arn, - tags=compute_env_tags + tags=self.get_tags() ) logger.info(f'Compute environment {self.batch_compute_environment_name} created.') @@ -697,7 +689,7 @@ def create_job_queue(self): 'computeEnvironment': self.batch_compute_environment_name }, ], - tags=self.aws_config.get('tags', {}).copy() + tags=self.get_tags() ) # print("JOB QUEUE") @@ -752,7 +744,7 @@ def create_job_definition(self, docker_image, vcpus, memory, command, env_vars): retryStrategy={ 'attempts': 2 }, - tags=self.aws_config.get('tags', {}).copy() + tags=self.get_tags() ) self.job_definition_arn = response['jobDefinitionArn'] @@ -771,7 +763,7 @@ def submit_job(self, array_size=4): 'size': array_size }, jobDefinition=self.job_definition_arn, - tags=self.aws_config.get('tags', {}).copy() + tags=self.get_tags() ) logger.info(f"Job {self.job_identifier} submitted.") @@ -940,7 +932,7 @@ def create_state_machine(self): name=self.state_machine_name, definition=job_definition, roleArn=self.state_machine_role_arn, - tags=self.get_tags() + tags=self.get_tags_lowercase() ) # print(response) @@ -1530,7 +1522,7 @@ def create_emr_cluster_function(self): VisibleToAllUsers=True, JobFlowRole=self.emr_instance_profile_name, ServiceRole=self.emr_service_role_name, - Tags=self.get_tags(), + Tags=self.get_tags_uppercase(), 
AutoScalingRole='EMR_AutoScaling_DefaultRole', ScaleDownBehavior='TERMINATE_AT_TASK_COMPLETION', EbsRootVolumeSize=100 @@ -1606,7 +1598,7 @@ def __init__(self, job_name, aws_config, boto3_session): def create_topic(self): response = self.sns.create_topic( Name=self.sns_state_machine_topic, - Tags=self.get_tags() + Tags=self.get_tags_uppercase() ) logger.info(f"Simple notifications topic {self.sns_state_machine_topic} created.") diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index 8884e761..d242e220 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -222,6 +222,19 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.priv_vpc_subnet_id_1 = 'REPL' # will be available after VPC creation self.priv_vpc_subnet_id_2 = 'REPL' # will be available after VPC creation + def get_tags(self, **kwargs): + tags = kwargs.copy() + tags.update(self.aws_config.get('tags', {})) + return tags + + def get_tags_uppercase(self, **kwargs): + tags = self.get_tags(**kwargs) + return [{'Key': k, 'Value': v} for k, v in tags.items()] + + def get_tags_lowercase(self, _caps=True, **kwargs): + tags = self.get_tags(**kwargs) + return [{'key': k, 'value': v} for k, v in tags.items()] + def __repr__(self): return f""" From 42e9938b0406a5b341c289e0a8c3e3cfd69edabe Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Mon, 28 Nov 2022 15:36:08 -0700 Subject: [PATCH 04/53] setting platform in Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9c8b1c80..ec22b92a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ ARG OS_VER=3.5.0 -FROM nrel/openstudio:$OS_VER +FROM --platform=linux/amd64 nrel/openstudio:$OS_VER RUN sudo apt update && sudo apt install -y python3-pip From c31b78d6db3e3a5ef6fe6d49db59d681584d0e95 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Wed, 30 Nov 2022 16:52:48 -0700 Subject: [PATCH 05/53] fixes in run_batch() --- buildstockbatch/aws/aws.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 9321fc22..923f0d67 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -1699,7 +1699,8 @@ def build_image(self): self.docker_client.images.build( path=str(root_path), tag=self.docker_image, - rm=True + rm=True, + platform="linux/amd64" ) def push_image(self): @@ -1898,7 +1899,7 @@ def run_batch(self): job_env_cfg = self.cfg['aws'].get('job_environment', {}) batch_env.create_job_definition( image_url, - command=['python3.8', '-m', 'buildstockbatch.aws.aws'], + command=['python3', '-m', 'buildstockbatch.aws.aws'], vcpus=job_env_cfg.get('vcpus', 1), memory=job_env_cfg.get('memory', 1024), env_vars=env_vars @@ -1974,7 +1975,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): if sum(row_has_epw): if row[0] != param_name and param_name is not None: raise RuntimeError(f'The epw files are specified in options_lookup.tsv under more than one parameter type: {param_name}, {row[0]}') # noqa: E501 - epw_filename = row[row_has_epw.index(True) + 2].split('=')[1] + epw_filename = row[row_has_epw.index(True) + 2].split('=')[1].split('/')[-1] param_name = row[0] option_name = row[1] epws_by_option[option_name] = epw_filename From 494aa5946bd94a1c645d107c74f538db96b78f9d Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Tue, 20 Dec 2022 11:36:24 -0700 Subject: [PATCH 06/53] adding a launch template --- buildstockbatch/aws/aws.py | 52 +++++++++++++++++++++++++++++++++- 
buildstockbatch/aws/awsbase.py | 2 +- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 923f0d67..4cbde4ca 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -629,6 +629,43 @@ def create_compute_environment(self, maxCPUs=10000): """ + logger.debug(f"Creating launch template {self.launch_template_name}") + self.ec2.create_launch_template( + LaunchTemplateName=self.launch_template_name, + LaunchTemplateData={ + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/xvda", + "Ebs": { + "VolumeSize": 100, + "VolumeType": "gp2" + } + } + ] + } + ) + + while True: + lt_resp = self.ec2.describe_launch_templates( + LaunchTemplateNames=[self.launch_template_name] + ) + launch_templates = lt_resp["LaunchTemplates"] + next_token = lt_resp.get("NextToken") + while next_token: + lt_resp = self.ec2.describe_launch_templates( + LaunchTemplateNames=[self.launch_template_name], + NextToken=next_token + ) + launch_templates.extend(lt_resp["LaunchTemplates"]) + next_token = lt_resp.get("NextToken") + n_launch_templates = len(launch_templates) + assert n_launch_templates <= 1, f"There are {n_launch_templates} launch templates, this shouldn't happen." + if n_launch_templates == 0: + logger.debug(f"Waiting for the launch template {self.launch_template_name} to be created") + time.sleep(5) + if n_launch_templates == 1: + break + try: compute_resources = { 'minvCpus': 0, @@ -637,7 +674,9 @@ def create_compute_environment(self, maxCPUs=10000): 'instanceTypes': [ 'optimal', ], - 'imageId': self.batch_compute_environment_ami, + 'LaunchTemplate': { + 'LaunchTemplateName': self.launch_template_name, + }, 'subnets': [self.priv_vpc_subnet_id_1, self.priv_vpc_subnet_id_2], 'securityGroupIds': [self.batch_security_group], 'instanceRole': self.instance_profile_arn @@ -1147,6 +1186,17 @@ def clean(self): else: raise + # Delete Launch Template + try: + self.ec2.delete_launch_template( + LaunchTemplateName=self.launch_template_name + ) + except Exception as e: + if 'does not exist' in str(e): + logger.info(f"Launch template {self.launch_template_name} does not exist, skipping...") + else: + raise + self.iam_helper.delete_role(self.batch_service_role_name) self.iam_helper.delete_role(self.batch_spot_service_role_name) self.iam_helper.delete_role(self.batch_ecs_task_role_name) diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index d242e220..88274c4a 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -197,7 +197,7 @@ def __init__(self, job_identifier, aws_config, boto3_session): # Batch self.batch_compute_environment_name = f"computeenvionment_{self.job_identifier}" - self.batch_compute_environment_ami = 'ami-0184013939261b626' + self.launch_template_name = f"launch_templ_{self.job_identifier}" self.batch_job_queue_name = f"job_queue_{self.job_identifier}" self.batch_service_role_name = f"batch_service_role_{self.job_identifier}" self.batch_instance_role_name = f"batch_instance_role_{self.job_identifier}" From 74e7d200ba4b611fa0207b620421a0e4b2bfe3f2 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Tue, 20 Dec 2022 23:49:02 +0000 Subject: [PATCH 07/53] getting right case on launch templates --- buildstockbatch/aws/aws.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 4cbde4ca..9ef487e4 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -674,8 +674,8 @@ def 
create_compute_environment(self, maxCPUs=10000): 'instanceTypes': [ 'optimal', ], - 'LaunchTemplate': { - 'LaunchTemplateName': self.launch_template_name, + 'launchTemplate': { + 'launchTemplateName': self.launch_template_name, }, 'subnets': [self.priv_vpc_subnet_id_1, self.priv_vpc_subnet_id_2], 'securityGroupIds': [self.batch_security_group], @@ -2049,7 +2049,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): f_out.write(gzip.decompress(f_gz.getvalue())) asset_dirs = os.listdir(sim_dir) - fs = S3FileSystem() + fs = S3FileSystem(key="AKIAV3NSSDJSQVOX224C", secret="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") local_fs = LocalFileSystem() reporting_measures = cls.get_reporting_measures(cfg) dpouts = [] From 9ad55a408d0216393bb4a6b827099ab078e6deb9 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Wed, 21 Dec 2022 16:52:02 +0000 Subject: [PATCH 08/53] removing keys --- buildstockbatch/aws/aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 9ef487e4..628a8972 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -2049,7 +2049,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): f_out.write(gzip.decompress(f_gz.getvalue())) asset_dirs = os.listdir(sim_dir) - fs = S3FileSystem(key="AKIAV3NSSDJSQVOX224C", secret="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") + fs = S3FileSystem() local_fs = LocalFileSystem() reporting_measures = cls.get_reporting_measures(cfg) dpouts = [] From 4f5cf59b1341108895dae0c554a9376105a80d4e Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Thu, 22 Dec 2022 18:13:18 +0000 Subject: [PATCH 09/53] removing awsretry dependency --- Dockerfile | 2 +- buildstockbatch/aws/aws.py | 79 ++++++++++++++++++++++------------ buildstockbatch/aws/awsbase.py | 14 ++++-- setup.py | 1 - 4 files changed, 64 insertions(+), 32 deletions(-) diff --git a/Dockerfile b/Dockerfile index ec22b92a..3e2b448f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,6 @@ ARG OS_VER=3.5.0 FROM --platform=linux/amd64 nrel/openstudio:$OS_VER RUN sudo apt update && sudo apt install -y python3-pip - +RUN sudo -H pip install --upgrade pip COPY . 
/buildstock-batch/ RUN python3 -m pip install /buildstock-batch diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 628a8972..f9755cad 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -10,7 +10,6 @@ :license: BSD-3 """ import argparse -from awsretry import AWSRetry import base64 import boto3 from botocore.exceptions import ClientError @@ -40,15 +39,42 @@ from buildstockbatch.localdocker import DockerBatchBase from buildstockbatch.base import ValidationError -from buildstockbatch.aws.awsbase import AwsJobBase +from buildstockbatch.aws.awsbase import AwsJobBase, boto_client_config from buildstockbatch import postprocessing from ..utils import log_error_details, get_project_configuration logger = logging.getLogger(__name__) +def backoff(thefunc, *args, **kwargs): + backoff_mult = 1.1 + delay = 3 + tries = 5 + error_patterns = [ + r"\w+.NotFound" + ] + while tries > 0: + try: + result = thefunc(*args, **kwargs) + except ClientError as error: + error_code = error.response["Error"]["Code"] + caught_error = False + for pat in error_patterns: + if re.search(pat, error_code): + logger.debug(f"{error_code}: Waiting and retrying in {delay} seconds") + caught_error = True + time.sleep(delay) + delay *= backoff_mult + tries -= 1 + break + if not caught_error: + raise error + else: + return result + + def upload_file_to_s3(*args, **kwargs): - s3 = boto3.client('s3') + s3 = boto3.client('s3', config=boto_client_config) s3.upload_file(*args, **kwargs) @@ -88,7 +114,7 @@ def calc_hash_for_file(filename): def copy_s3_file(src_bucket, src_key, dest_bucket, dest_key): - s3 = boto3.client('s3') + s3 = boto3.client('s3', config=boto_client_config) s3.copy( {'Bucket': src_bucket, 'Key': src_key}, dest_bucket, @@ -109,14 +135,14 @@ def __init__(self, job_name, aws_config, boto3_session): """ super().__init__(job_name, aws_config, boto3_session) - self.batch = self.session.client('batch') - self.ec2 = self.session.client('ec2') - self.ec2r = self.session.resource('ec2') - self.emr = self.session.client('emr') - self.step_functions = self.session.client('stepfunctions') - self.aws_lambda = self.session.client('lambda') - self.s3 = self.session.client('s3') - self.s3_res = self.session.resource('s3') + self.batch = self.session.client('batch', config=boto_client_config) + self.ec2 = self.session.client('ec2', config=boto_client_config) + self.ec2r = self.session.resource('ec2', config=boto_client_config) + self.emr = self.session.client('emr', config=boto_client_config) + self.step_functions = self.session.client('stepfunctions', config=boto_client_config) + self.aws_lambda = self.session.client('lambda', config=boto_client_config) + self.s3 = self.session.client('s3', config=boto_client_config) + self.s3_res = self.session.resource('s3', config=boto_client_config) self.task_role_arn = None self.job_definition_arn = None @@ -193,7 +219,7 @@ def create_emr_lambda_roles(self): def create_vpc(self): cidrs_in_use = set() - vpc_response = AWSRetry.backoff()(self.ec2.describe_vpcs)() + vpc_response = self.ec2.describe_vpcs() for vpc in vpc_response['Vpcs']: cidrs_in_use.add(vpc['CidrBlock']) for cidr_assoc in vpc['CidrBlockAssociationSet']: @@ -309,7 +335,7 @@ def create_vpc(self): self.internet_gateway_id = ig_response['InternetGateway']['InternetGatewayId'] - AWSRetry.backoff()(self.ec2.create_tags)( + backoff(self.ec2.create_tags, Resources=[ self.internet_gateway_id ], @@ -423,7 +449,7 @@ def create_vpc(self): logger.info("Route table created.") - 
AWSRetry.backoff()(self.ec2.create_tags)( + backoff(self.ec2.create_tags, Resources=[ self.priv_route_table_id ], @@ -1062,7 +1088,7 @@ def clean(self): if len(dsg.ip_permissions_egress): response = dsg.revoke_egress(IpPermissions=dsg.ip_permissions_egress) - sg_response = AWSRetry.backoff()(self.ec2.describe_security_groups)( + sg_response = self.ec2.describe_security_groups( Filters=[ { 'Name': 'group-name', @@ -1207,7 +1233,7 @@ def clean(self): # Find Nat Gateways and VPCs - response = AWSRetry.backoff()(self.ec2.describe_vpcs)( + response = self.ec2.describe_vpcs( Filters=[ { 'Name': 'tag:Name', @@ -1216,13 +1242,12 @@ def clean(self): ] }, ], - ) for vpc in response['Vpcs']: this_vpc = vpc['VpcId'] - ng_response = AWSRetry.backoff()(self.ec2.describe_nat_gateways)( + ng_response = self.ec2.describe_nat_gateways( Filters=[ { 'Name': 'vpc-id', @@ -1241,7 +1266,7 @@ def clean(self): NatGatewayId=this_natgw ) - rtas_response = AWSRetry.backoff()(self.ec2.describe_route_tables)( + rtas_response = self.ec2.describe_route_tables( Filters=[ { 'Name': 'vpc-id', @@ -1276,7 +1301,7 @@ def clean(self): else: raise - igw_response = AWSRetry.backoff()(self.ec2.describe_internet_gateways)( + igw_response = self.ec2.describe_internet_gateways( Filters=[ { 'Name': 'tag:Name', @@ -1339,7 +1364,7 @@ def clean(self): else: raise - AWSRetry.backoff()(self.ec2.delete_vpc)( + self.ec2.delete_vpc( VpcId=this_vpc ) @@ -1642,7 +1667,7 @@ class AwsSNS(AwsJobBase): def __init__(self, job_name, aws_config, boto3_session): super().__init__(job_name, aws_config, boto3_session) - self.sns = self.session.client("sns") + self.sns = self.session.client("sns", config=boto_client_config) self.sns_state_machine_topic_arn = None def create_topic(self): @@ -1683,8 +1708,8 @@ def __init__(self, project_filename): self.project_filename = project_filename self.region = self.cfg['aws']['region'] - self.ecr = boto3.client('ecr', region_name=self.region) - self.s3 = boto3.client('s3', region_name=self.region) + self.ecr = boto3.client('ecr', region_name=self.region, config=boto_client_config) + self.s3 = boto3.client('s3', region_name=self.region, config=boto_client_config) self.s3_bucket = self.cfg['aws']['s3']['bucket'] self.s3_bucket_prefix = self.cfg['aws']['s3']['prefix'].rstrip('/') self.batch_env_use_spot = self.cfg['aws']['use_spot'] @@ -1696,7 +1721,7 @@ def validate_instance_types(project_file): cfg = get_project_configuration(project_file) aws_config = cfg['aws'] boto3_session = boto3.Session(region_name=aws_config['region']) - ec2 = boto3_session.client('ec2') + ec2 = boto3_session.client('ec2', config=boto_client_config) job_base = AwsJobBase('genericjobid', aws_config, boto3_session) instance_types_requested = set() instance_types_requested.add(job_base.emr_manager_instance_type) @@ -1987,7 +2012,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): """ logger.debug(f"region: {region}") - s3 = boto3.client('s3') + s3 = boto3.client('s3', config=boto_client_config) sim_dir = pathlib.Path('/var/simdata/openstudio') diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index 88274c4a..569182e5 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -1,8 +1,16 @@ import logging +from botocore.config import Config logger = logging.getLogger(__name__) +boto_client_config = Config( + retries={ + "max_attempts": 5, + "mode": "standard" + } +) + class AWSIAMHelper(): @@ -14,7 +22,7 @@ def __init__(self, session): :param session: boto3 Session from 'parent' job 
base class
        '''
        self.session = session
-        self.iam = self.session.client('iam')
+        self.iam = self.session.client('iam', config=boto_client_config)
 
     def role_stitcher(self, role_name, trust_service, description, policies_list=[], managed_policie_arns=[]):
         '''
@@ -161,9 +169,9 @@ def __init__(self, job_identifier, aws_config, boto3_session):
         self.session = boto3_session
         self.iam_helper = AWSIAMHelper(self.session)
         self.iam = self.iam_helper.iam
-        self.s3 = self.session.client('s3')
+        self.s3 = self.session.client('s3', config=boto_client_config)
         self.job_identifier = job_identifier
-        self.account = self.session.client('sts').get_caller_identity().get('Account')
+        self.account = self.session.client('sts', config=boto_client_config).get_caller_identity().get('Account')
         self.region = aws_config['region']
         self.operator_email = aws_config['notifications_email']
diff --git a/setup.py b/setup.py
index 86ddcb88..c7c2716b 100644
--- a/setup.py
+++ b/setup.py
@@ -67,7 +67,6 @@ def run_tests(self):
         'fsspec',
         'yamale',
         'ruamel.yaml',
-        'awsretry',
         'lxml'
     ],
     extras_require={

From d1c56e9eeaabcfbdde7c9246261398fb52040236 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Fri, 6 Jan 2023 23:20:46 +0000
Subject: [PATCH 10/53] some unsuccessful changes to get emr working

---
 buildstockbatch/aws/aws.py                   | 66 ++++++++++++++++++-
 buildstockbatch/aws/awsbase.py               |  1 +
 .../aws/s3_assets/bootstrap-dask-custom      | 22 +++----
 .../aws/s3_assets/setup_postprocessing.py    | 15 ++---
 4 files changed, 83 insertions(+), 21 deletions(-)

diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py
index f9755cad..f6c9c16d 100644
--- a/buildstockbatch/aws/aws.py
+++ b/buildstockbatch/aws/aws.py
@@ -1432,6 +1432,68 @@ def create_emr_security_groups(self):
             else:
                 raise
 
+        try:
+            response = self.ec2.create_security_group(
+                Description='EMR Service Access Security Group',
+                GroupName=self.emr_service_access_security_group_name,
+                VpcId=self.vpc_id
+            )
+        except Exception as e:
+            if 'already exists for VPC' in str(e):
+                logger.info("Security group for EMR service access already exists, skipping ...")
+                response = self.ec2.describe_security_groups(
+                    Filters=[
+                        {
+                            'Name': 'group-name',
+                            'Values': [
+                                self.emr_service_access_security_group_name,
+                            ]
+                        },
+                    ]
+                )
+                self.emr_service_access_security_group_id = response['SecurityGroups'][0]['GroupId']
+        else:
+            self.emr_service_access_security_group_id = response['GroupId']
+
+        # See https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html#emr-sg-elasticmapreduce-sa-private
+        try:
+            response = self.ec2.authorize_security_group_ingress(
+                GroupId=self.emr_service_access_security_group_id,
+                IpPermissions=[dict(
+                    FromPort=9443,
+                    ToPort=9443,
+                    IpProtocol='tcp',
+                    UserIdGroupPairs=[dict(
+                        GroupId=self.emr_cluster_security_group_id,
+                        UserId=self.account
+                    )]
+                )]
+            )
+        except Exception as e:
+            if 'already exists' in str(e):
+                logger.info("Security group ingress rule for EMR already exists, skipping ...")
+            else:
+                raise
+
+        try:
+            response = self.ec2.authorize_security_group_egress(
+                GroupId=self.emr_service_access_security_group_id,
+                IpPermissions=[dict(
+                    FromPort=8443,
+                    ToPort=8443,
+                    IpProtocol='tcp',
+                    UserIdGroupPairs=[dict(
+                        GroupId=self.emr_cluster_security_group_id,
+                        UserId=self.account
+                    )]
+                )]
+            )
+        except Exception as e:
+            if 'already exists' in str(e):
+                logger.info("Security group egress rule for EMR already exists, skipping ...")
+            else:
+                raise
+
     def create_emr_iam_roles(self):
 
         self.emr_service_role_arn = self.iam_helper.role_stitcher(
@@ 
-1543,7 +1605,7 @@ def create_emr_cluster_function(self): Name=self.emr_cluster_name, LogUri=self.emr_log_uri, - ReleaseLabel='emr-5.23.0', + ReleaseLabel='emr-5.36.0', Instances={ 'InstanceGroups': [ { @@ -1563,7 +1625,7 @@ def create_emr_cluster_function(self): 'KeepJobFlowAliveWhenNoSteps': False, 'EmrManagedMasterSecurityGroup': self.emr_cluster_security_group_id, 'EmrManagedSlaveSecurityGroup': self.emr_cluster_security_group_id, - 'ServiceAccessSecurityGroup': self.batch_security_group + 'ServiceAccessSecurityGroup': self.emr_service_access_security_group_id }, Applications=[ diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index 569182e5..61df7488 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -189,6 +189,7 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.emr_worker_instance_type = emr_config.get('worker_instance_type', 'r5.4xlarge') self.emr_worker_instance_count = emr_config.get('worker_instance_count', 4) self.emr_cluster_security_group_name = f'{self.job_identifier}_emr_security_group' + self.emr_service_access_security_group_name = f'{self.job_identifier}_emr_service_access' self.emr_cluster_name = f'{self.job_identifier}_emr_dask_cluster' self.emr_job_flow_role_name = f'{self.job_identifier}_emr_job_flow_role' self.emr_job_flow_role_arn = '' diff --git a/buildstockbatch/aws/s3_assets/bootstrap-dask-custom b/buildstockbatch/aws/s3_assets/bootstrap-dask-custom index 32f82a2d..0e2daa7c 100644 --- a/buildstockbatch/aws/s3_assets/bootstrap-dask-custom +++ b/buildstockbatch/aws/s3_assets/bootstrap-dask-custom @@ -18,7 +18,7 @@ bash /tmp/miniconda.sh -b -p $HOME/miniconda rm /tmp/miniconda.sh echo -e '\nexport PATH=$HOME/miniconda/bin:$PATH' >> $HOME/.bashrc source $HOME/.bashrc -conda update conda -y +conda install mamba -n base -c conda-forge # ----------------------------------------------------------------------------- @@ -34,20 +34,20 @@ conda update conda -y # - conda-pack for packaging the environment for distribution # ----------------------------------------------------------------------------- echo "Installing base packages" -conda install \ +mamba install \ -c conda-forge \ -y \ -q \ -python=3.7 \ -"dask>=2021.5" \ -"distributed>=2021.5" \ +"python=3.10" \ +"dask>=2022.10.0" \ +"distributed>=2022.10.0" \ "dask-yarn>=0.9.0" \ -"pandas>=1.0.0,!=1.0.4" \ -"pyarrow>=3.0.0" \ -"s3fs>=0.4.2,<0.5.0" \ -"numpy>=1.20.0" \ -conda-pack \ -tornado=5 +"pandas" \ +"pyarrow" \ +"numpy" \ +"s3fs" \ +"boto3" \ +"conda-pack" aws s3 cp "$1" $HOME/postprocessing.tar.gz pip install $HOME/postprocessing.tar.gz diff --git a/buildstockbatch/aws/s3_assets/setup_postprocessing.py b/buildstockbatch/aws/s3_assets/setup_postprocessing.py index 9fbbc703..6ff9d50a 100644 --- a/buildstockbatch/aws/s3_assets/setup_postprocessing.py +++ b/buildstockbatch/aws/s3_assets/setup_postprocessing.py @@ -5,12 +5,11 @@ version='0.1', description='Just the stand alone postprocessing functions from Buildstock-Batch', py_modules=['postprocessing'], - install_requires=[ - 'dask[complete]>=2022.10.0', - 's3fs>=0.4.2,<0.5.0', - 'boto3', - 'pandas>=1.0.0,!=1.0.4', - 'pyarrow>=3.0.0', - 'numpy>=1.20.0' - ] + # install_requires=[ + # 'dask[complete]>=2022.10.0', + # 's3fs[boto3]', + # 'pandas', + # 'pyarrow', + # 'numpy' + # ] ) From 6995f1e05d97b9c6de283a650af4da736374abf6 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Wed, 25 Jan 2023 18:39:20 +0000 Subject: [PATCH 11/53] removing emr stuff --- buildstockbatch/aws/aws.py | 541 
+-------------------------------- buildstockbatch/aws/awsbase.py | 21 -- 2 files changed, 3 insertions(+), 559 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index f6c9c16d..34f4424a 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -138,7 +138,6 @@ def __init__(self, job_name, aws_config, boto3_session): self.batch = self.session.client('batch', config=boto_client_config) self.ec2 = self.session.client('ec2', config=boto_client_config) self.ec2r = self.session.resource('ec2', config=boto_client_config) - self.emr = self.session.client('emr', config=boto_client_config) self.step_functions = self.session.client('stepfunctions', config=boto_client_config) self.aws_lambda = self.session.client('lambda', config=boto_client_config) self.s3 = self.session.client('s3', config=boto_client_config) @@ -158,65 +157,6 @@ def __repr__(self): return super().__repr__() - def create_emr_lambda_roles(self): - """ - Create supporting IAM roles for Lambda support. - """ - - # EMR - - lambda_policy = { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": "logs:CreateLogGroup", - "Resource": f"arn:aws:logs:{self.region}:{self.account}:*" - }, - { - "Effect": "Allow", - "Action": [ - "logs:CreateLogStream", - "logs:PutLogEvents" - ], - "Resource": [ - f"arn:aws:logs:{self.region}:{self.account}:log-group:/aws/lambda/launchemr:*" - ] - }, - { - "Effect": "Allow", - "Action": "elasticmapreduce:RunJobFlow", - "Resource": "*" - }, - { - - "Effect": "Allow", - "Action": "iam:PassRole", - "Resource": [ - f"arn:aws:iam::{self.account}:role/EMR_DefaultRole", - f"arn:aws:iam::{self.account}:role/EMR_EC2_DefaultRole", - f"arn:aws:iam::{self.account}:role/EMR_AutoScaling_DefaultRole", - self.emr_job_flow_role_arn, - self.emr_service_role_arn - ] - }, - { - "Effect": "Allow", - "Action": "s3:GetObject", - "Resource": [ - f"arn:aws:s3:::{self.s3_bucket}/*" - ] - } - ] - } - - self.lambda_emr_job_step_execution_role_arn = self.iam_helper.role_stitcher( - self.lambda_emr_job_step_execution_role, - 'lambda', - f'Lambda execution role for {self.lambda_emr_job_step_function_name}', - policies_list=[json.dumps(lambda_policy, indent=4)] - ) - def create_vpc(self): cidrs_in_use = set() vpc_response = self.ec2.describe_vpcs() @@ -845,23 +785,6 @@ def submit_job(self, array_size=4): def create_state_machine_roles(self): - lambda_policy = f'''{{ - "Version": "2012-10-17", - "Statement": [ - {{ - "Effect": "Allow", - "Action": [ - "lambda:InvokeFunction" - ], - "Resource": [ - "arn:aws:lambda:*:*:function:{self.lambda_emr_job_step_function_name}" - ] - }} - ] -}} - - ''' - batch_policy = '''{ "Version": "2012-10-17", "Statement": [ @@ -904,7 +827,7 @@ def create_state_machine_roles(self): }} ''' - policies_list = [lambda_policy, batch_policy, sns_policy] + policies_list = [batch_policy, sns_policy] self.state_machine_role_arn = self.iam_helper.role_stitcher(self.state_machine_role_name, 'states', 'Permissions for statemachine to run jobs', @@ -942,7 +865,7 @@ def create_state_machine(self): "Message": "Batch job submitted through Step Functions succeeded", "TopicArn": "arn:aws:sns:{self.region}:{self.account}:{self.sns_state_machine_topic}" }}, - "Next": "Run EMR Job" + "End": true }}, "Notify Batch Failure": {{ "Type": "Task", @@ -953,36 +876,7 @@ def create_state_machine(self): }}, "Next": "Job Failure" }}, - "Run EMR Job": {{ - "Type": "Task", - "Resource": 
"arn:aws:lambda:{self.region}:{self.account}:function:{self.lambda_emr_job_step_function_name}", - "Next": "Notify EMR Job Success", - "Catch": [ - {{ - "ErrorEquals": [ "States.ALL" ], - "Next": "Notify EMR Job Failure" - }} - ] - }}, - "Notify EMR Job Success": {{ - "Type": "Task", - "Resource": "arn:aws:states:::sns:publish", - "Parameters": {{ - "Message": "EMR Job succeeded", - "TopicArn": "arn:aws:sns:{self.region}:{self.account}:{self.sns_state_machine_topic}" - }}, - "End": true - }}, - "Notify EMR Job Failure": {{ - "Type": "Task", - "Resource": "arn:aws:states:::sns:publish", - "Parameters": {{ - "Message": "EMR job failed", - "TopicArn": "arn:aws:sns:{self.region}:{self.account}:{self.sns_state_machine_topic}" - }}, - "Next": "Job Failure" - }}, - "Job Failure": {{ + "Job Failure": {{ "Type": "Fail" }} }} @@ -1045,30 +939,6 @@ def clean(self): except (KeyError, IndexError): self.vpc_id = None - logger.info("Cleaning up EMR.") - - try: - self.emr.terminate_job_flows( - JobFlowIds=[ - self.emr_cluster_name - ] - ) - logger.info(f"EMR cluster {self.emr_cluster_name} deleted.") - - except Exception as e: - if 'ResourceNotFoundException' in str(e): - logger.info(f"EMR cluster {self.emr_cluster_name} already MIA - skipping...") - - self.iam_helper.remove_role_from_instance_profile(self.emr_instance_profile_name) - self.iam_helper.delete_instance_profile(self.emr_instance_profile_name) - self.iam_helper.delete_role(self.emr_job_flow_role_name) - self.iam_helper.delete_role(self.emr_service_role_name) - - logger.info( - f"EMR clean complete. Results bucket and data {self.s3_bucket} have not been deleted." - ) - - logger.info(f'Deleting Security group {self.emr_cluster_security_group_name}.') default_sg_response = self.ec2.describe_security_groups( Filters=[ { @@ -1088,65 +958,6 @@ def clean(self): if len(dsg.ip_permissions_egress): response = dsg.revoke_egress(IpPermissions=dsg.ip_permissions_egress) - sg_response = self.ec2.describe_security_groups( - Filters=[ - { - 'Name': 'group-name', - 'Values': [ - self.emr_cluster_security_group_name, - ] - }, - ] - ) - - try: - group_id = sg_response['SecurityGroups'][0]['GroupId'] - sg = self.ec2r.SecurityGroup(group_id) - if len(sg.ip_permissions): - sg.revoke_ingress(IpPermissions=sg.ip_permissions) - - while True: - try: - self.ec2.delete_security_group( - GroupId=group_id - ) - break - except ClientError: - logger.info("Waiting for security group ingress rules to be removed ...") - time.sleep(5) - - logger.info(f"Deleted security group {self.emr_cluster_security_group_name}.") - except Exception as e: - if 'does not exist' in str(e) or 'list index out of range' in str(e): - logger.info(f'Security group {self.emr_cluster_security_group_name} does not exist - skipping...') - else: - raise - - try: - self.aws_lambda.delete_function( - FunctionName=self.lambda_emr_job_step_function_name - ) - except Exception as e: - if 'Function not found' in str(e): - logger.info(f"Function {self.lambda_emr_job_step_function_name} not found, skipping...") - else: - raise - - try: - self.s3.delete_object(Bucket=self.s3_bucket, Key=self.s3_lambda_code_emr_cluster_key) - logger.info( - f"S3 object {self.s3_lambda_code_emr_cluster_key} for bucket {self.s3_bucket} deleted." # noqa E501 - ) - except Exception as e: - if 'NoSuchBucket' in str(e): - logger.info( - f"S3 object {self.s3_lambda_code_emr_cluster_key} for bucket {self.s3_bucket} missing - not deleted." 
# noqa E501 - ) - else: - raise - - self.iam_helper.delete_role(self.lambda_emr_job_step_execution_role) - state_machines = self.step_functions.list_state_machines() for sm in state_machines['stateMachines']: @@ -1387,343 +1198,6 @@ def clean(self): AllocationId=this_address ) - def create_emr_security_groups(self): - - try: - response = self.ec2.create_security_group( - Description='EMR Job Flow Security Group (full cluster access)', - GroupName=self.emr_cluster_security_group_name, - VpcId=self.vpc_id - ) - self.emr_cluster_security_group_id = response['GroupId'] - - except Exception as e: - if 'already exists for VPC' in str(e): - logger.info("Security group for EMR already exists, skipping ...") - response = self.ec2.describe_security_groups( - Filters=[ - { - 'Name': 'group-name', - 'Values': [ - self.emr_cluster_security_group_name, - ] - }, - ] - ) - - self.emr_cluster_security_group_id = response['SecurityGroups'][0]['GroupId'] - else: - raise - - try: - response = self.ec2.authorize_security_group_ingress( - GroupId=self.emr_cluster_security_group_id, - IpPermissions=[dict( - IpProtocol='-1', - UserIdGroupPairs=[dict( - GroupId=self.emr_cluster_security_group_id, - UserId=self.account - )] - )] - ) - except Exception as e: - if 'already exists' in str(e): - logger.info("Security group egress rule for EMR already exists, skipping ...") - else: - raise - - try: - response = self.ec2.create_security_group( - Description='EMR Service Access Security Group', - GroupName=self.emr_service_access_security_group_name, - VpcId=self.vpc_id - ) - except Exception as e: - if 'already exists for VPC' in str(e): - logger.info("Security group for EMR service access already exists, skipping ...") - response = self.ec2.describe_security_groups( - Filters=[ - { - 'Name': 'group-name', - 'Values': [ - self.emr_service_access_security_group_name, - ] - }, - ] - ) - self.emr_service_access_security_group_id = response['SecurityGroups'][0]['GroupId'] - else: - self.emr_service_access_security_group_id = response['GroupId'] - - # See https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html#emr-sg-elasticmapreduce-sa-private - try: - response = self.ec2.authorize_security_group_ingress( - GroupId=self.emr_service_access_security_group_id, - IpPermissions=[dict( - FromPort=9443, - ToPort=9443, - IpProtocol='tcp', - UserIdGroupPairs=[dict( - GroupId=self.emr_cluster_security_group_id, - UserId=self.account - )] - )] - ) - except Exception as e: - if 'already exists' in str(e): - logger.info("Security group ingress rule for EMR already exists, skipping ...") - else: - raise - - try: - response = self.ec2.authorize_security_group_egress( - GroupId=self.emr_service_access_security_group_id, - IpPermissions=[dict( - FromPort=8443, - ToPort=8443, - IpProtocol='tcp', - UserIdGroupPairs=[dict( - GroupId=self.emr_cluster_security_group_id, - UserId=self.account - )] - )] - ) - except Exception as e: - if 'already exists' in str(e): - logger.info("Security group egress rule for EMR already exists, skipping ...") - else: - raise - - def create_emr_iam_roles(self): - - self.emr_service_role_arn = self.iam_helper.role_stitcher( - self.emr_service_role_name, - "elasticmapreduce", - f"EMR Service Role {self.job_identifier}", - managed_policie_arns=['arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceRole'] - ) - - emr_policy = '''{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "VisualEditor0", - "Effect": "Allow", - "Action": [ - "glue:GetCrawler", - "glue:CreateTable", - 
"glue:DeleteCrawler", - "glue:StartCrawler", - "glue:StopCrawler", - "glue:DeleteTable", - "glue:ListCrawlers", - "glue:UpdateCrawler", - "glue:CreateCrawler", - "glue:GetCrawlerMetrics", - "glue:BatchDeleteTable" - ], - "Resource": "*" - }, - { - "Sid": "VisualEditor1", - "Effect": "Allow", - "Action": [ - "iam:PassRole" - ], - "Resource": "arn:aws:iam::*:role/service-role/AWSGlueServiceRole-default" - } - ] -}''' - - self.emr_job_flow_role_arn = self.iam_helper.role_stitcher( - self.emr_job_flow_role_name, - "ec2", - f"EMR Job Flow Role {self.job_identifier}", - managed_policie_arns=['arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role'], - policies_list=[emr_policy] - ) - - try: - response = self.iam.create_instance_profile( - InstanceProfileName=self.emr_instance_profile_name - ) - - self.emr_instance_profile_arn = response['InstanceProfile']['Arn'] - - logger.info("EMR Instance Profile created") - - response = self.iam.add_role_to_instance_profile( - InstanceProfileName=self.emr_instance_profile_name, - RoleName=self.emr_job_flow_role_name - ) - - except Exception as e: - if 'EntityAlreadyExists' in str(e): - logger.info('EMR Instance Profile not created - already exists') - response = self.iam.get_instance_profile( - InstanceProfileName=self.emr_instance_profile_name - ) - self.emr_instance_profile_arn = response['InstanceProfile']['Arn'] - - def upload_assets(self): - - logger.info('Uploading EMR support assets...') - fs = S3FileSystem() - here = os.path.dirname(os.path.abspath(__file__)) - emr_folder = f"{self.s3_bucket}/{self.s3_bucket_prefix}/{self.s3_emr_folder_name}" - fs.makedirs(emr_folder) - - # bsb_post.sh - bsb_post_bash = f'''#!/bin/bash - -aws s3 cp "s3://{self.s3_bucket}/{self.s3_bucket_prefix}/emr/bsb_post.py" bsb_post.py -/home/hadoop/miniconda/bin/python bsb_post.py "{self.s3_bucket}" "{self.s3_bucket_prefix}" - - ''' - with fs.open(f'{emr_folder}/bsb_post.sh', 'w', encoding='utf-8') as f: - f.write(bsb_post_bash) - - # bsb_post.py - fs.put(os.path.join(here, 's3_assets', 'bsb_post.py'), f'{emr_folder}/bsb_post.py') - - # bootstrap-dask-custom - fs.put(os.path.join(here, 's3_assets', 'bootstrap-dask-custom'), f'{emr_folder}/bootstrap-dask-custom') - - # postprocessing.py - with fs.open(f'{emr_folder}/postprocessing.tar.gz', 'wb') as f: - with tarfile.open(fileobj=f, mode='w:gz') as tarf: - tarf.add(os.path.join(here, '..', 'postprocessing.py'), arcname='postprocessing.py') - tarf.add(os.path.join(here, 's3_assets', 'setup_postprocessing.py'), arcname='setup.py') - - logger.info('EMR support assets uploaded.') - - def create_emr_cluster_function(self): - script_name = f"s3://{self.s3_bucket}/{self.s3_bucket_prefix}/{self.s3_emr_folder_name}/bsb_post.sh" - bootstrap_action = f's3://{self.s3_bucket}/{self.s3_bucket_prefix}/{self.s3_emr_folder_name}/bootstrap-dask-custom' # noqa E501 - - run_job_flow_args = dict( - Name=self.emr_cluster_name, - LogUri=self.emr_log_uri, - - ReleaseLabel='emr-5.36.0', - Instances={ - 'InstanceGroups': [ - { - 'Market': 'SPOT' if self.batch_use_spot else 'ON_DEMAND', - 'InstanceRole': 'MASTER', - 'InstanceType': self.emr_manager_instance_type, - 'InstanceCount': 1 - }, - { - 'Market': 'SPOT' if self.batch_use_spot else 'ON_DEMAND', - 'InstanceRole': 'CORE', - 'InstanceType': self.emr_worker_instance_type, - 'InstanceCount': self.emr_worker_instance_count - }, - ], - 'Ec2SubnetId': self.priv_vpc_subnet_id_1, - 'KeepJobFlowAliveWhenNoSteps': False, - 'EmrManagedMasterSecurityGroup': self.emr_cluster_security_group_id, - 
'EmrManagedSlaveSecurityGroup': self.emr_cluster_security_group_id, - 'ServiceAccessSecurityGroup': self.emr_service_access_security_group_id - }, - - Applications=[ - { - 'Name': 'Hadoop' - }, - ], - - BootstrapActions=[ - { - 'Name': 'launchFromS3', - 'ScriptBootstrapAction': { - 'Path': bootstrap_action, - 'Args': [f's3://{self.s3_bucket}/{self.s3_bucket_prefix}/emr/postprocessing.tar.gz'] - } - }, - ], - - Steps=[ - { - 'Name': 'Dask', - 'ActionOnFailure': 'TERMINATE_CLUSTER', - - 'HadoopJarStep': { - 'Jar': 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar', - 'Args': [script_name] - } - }, - ], - - VisibleToAllUsers=True, - JobFlowRole=self.emr_instance_profile_name, - ServiceRole=self.emr_service_role_name, - Tags=self.get_tags_uppercase(), - AutoScalingRole='EMR_AutoScaling_DefaultRole', - ScaleDownBehavior='TERMINATE_AT_TASK_COMPLETION', - EbsRootVolumeSize=100 - ) - - with io.BytesIO() as f: - f.write(json.dumps(run_job_flow_args).encode()) - f.seek(0) - self.s3.upload_fileobj(f, self.s3_bucket, self.s3_lambda_emr_config_key) - - lambda_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3_assets', 'lambda_function.py') - with open(lambda_filename, 'r') as f: - function_script = f.read() - with io.BytesIO() as f: - with zipfile.ZipFile(f, mode='w', compression=zipfile.ZIP_STORED) as zf: - zi = zipfile.ZipInfo('emr_function.py') - zi.date_time = time.localtime() - zi.external_attr = 0o100755 << 16 - zf.writestr(zi, function_script, zipfile.ZIP_DEFLATED) - f.seek(0) - self.s3.upload_fileobj(f, self.s3_bucket, self.s3_lambda_code_emr_cluster_key) - - while True: - try: - self.aws_lambda.create_function( - FunctionName=self.lambda_emr_job_step_function_name, - Runtime='python3.7', - Role=self.lambda_emr_job_step_execution_role_arn, - Handler='emr_function.lambda_handler', - Code={ - 'S3Bucket': self.s3_bucket, - 'S3Key': self.s3_lambda_code_emr_cluster_key - }, - Description=f'Lambda for emr cluster execution on job {self.job_identifier}', - Timeout=900, - MemorySize=128, - Publish=True, - Environment={ - 'Variables': { - 'REGION': self.region, - 'BUCKET': self.s3_bucket, - 'EMR_CONFIG_JSON_KEY': self.s3_lambda_emr_config_key - } - }, - Tags=self.get_tags(job=self.job_identifier) - ) - - logger.info(f"Lambda function {self.lambda_emr_job_step_function_name} created.") - break - - except Exception as e: - if 'role defined for the function cannot be assumed' in str(e): - logger.info( - f"Lambda role not registered for {self.lambda_emr_job_step_function_name} - sleeping ...") - time.sleep(5) - elif 'Function already exist' in str(e): - logger.info(f'Lambda function {self.lambda_emr_job_step_function_name} exists, skipping...') - break - elif 'ARN does not refer to a valid principal' in str(e): - logger.info('Waiting for roles/permissions to propagate to allow Lambda function creation ...') - time.sleep(5) - else: - raise - class AwsSNS(AwsJobBase): @@ -1786,8 +1260,6 @@ def validate_instance_types(project_file): ec2 = boto3_session.client('ec2', config=boto_client_config) job_base = AwsJobBase('genericjobid', aws_config, boto3_session) instance_types_requested = set() - instance_types_requested.add(job_base.emr_manager_instance_type) - instance_types_requested.add(job_base.emr_worker_instance_type) inst_type_resp = ec2.describe_instance_type_offerings(Filters=[{ 'Name': 'instance-type', 'Values': list(instance_types_requested) @@ -2051,13 +1523,6 @@ def run_batch(self): batch_env.create_state_machine_roles() batch_env.create_state_machine() - # EMR 
Function - batch_env.upload_assets() - batch_env.create_emr_iam_roles() - batch_env.create_emr_security_groups() - batch_env.create_emr_lambda_roles() - batch_env.create_emr_cluster_function() - # start job batch_env.start_state_machine_execution(array_size) diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index 61df7488..3a15f1a5 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -183,27 +183,6 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.s3_lambda_emr_config_key = f'{self.s3_bucket_prefix}/lambda_functions/emr_config.json' self.s3_emr_folder_name = 'emr' - # EMR - emr_config = aws_config.get('emr', {}) - self.emr_manager_instance_type = emr_config.get('manager_instance_type', 'm5.4xlarge') - self.emr_worker_instance_type = emr_config.get('worker_instance_type', 'r5.4xlarge') - self.emr_worker_instance_count = emr_config.get('worker_instance_count', 4) - self.emr_cluster_security_group_name = f'{self.job_identifier}_emr_security_group' - self.emr_service_access_security_group_name = f'{self.job_identifier}_emr_service_access' - self.emr_cluster_name = f'{self.job_identifier}_emr_dask_cluster' - self.emr_job_flow_role_name = f'{self.job_identifier}_emr_job_flow_role' - self.emr_job_flow_role_arn = '' - self.emr_service_role_name = f'{self.job_identifier}_emr_service_role' - self.emr_service_role_arn = '' - self.emr_cluster_security_group_id = '' - self.emr_log_uri = f's3://{self.s3_bucket}/{self.s3_bucket_prefix}/emrlogs/' - self.emr_instance_profile_name = f'{self.job_identifier}_emr_instance_profile' - - # Lambda - self.lambda_emr_job_step_execution_role = f'{self.job_identifier}_emr_job_step_execution_role' - self.lambda_emr_job_step_function_name = f'{self.job_identifier}_emr_job_step_submission' - self.lambda_emr_job_step_execution_role_arn = '' - # Batch self.batch_compute_environment_name = f"computeenvionment_{self.job_identifier}" self.launch_template_name = f"launch_templ_{self.job_identifier}" From 52c94845a2adef3b90259efba94bf9a2718e28a2 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Thu, 26 Jan 2023 17:11:01 +0000 Subject: [PATCH 12/53] removing state machine and sns --- buildstockbatch/aws/aws.py | 213 +----------------------------- buildstockbatch/aws/awsbase.py | 9 +- buildstockbatch/schemas/v0.3.yaml | 7 - 3 files changed, 8 insertions(+), 221 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 34f4424a..c5f8d65d 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -783,143 +783,6 @@ def submit_job(self, array_size=4): else: raise - def create_state_machine_roles(self): - - batch_policy = '''{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "batch:SubmitJob", - "batch:DescribeJobs", - "batch:TerminateJob" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "events:PutTargets", - "events:PutRule", - "events:DescribeRule" - ], - "Resource": [ - "arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule" - ] - } - ] -} - - ''' - - sns_policy = f'''{{ - "Version": "2012-10-17", - "Statement": [ - {{ - "Effect": "Allow", - "Action": [ - "sns:Publish" - ], - "Resource": "arn:aws:sns:*:*:{self.sns_state_machine_topic}" - }} - ] - }} - ''' - - policies_list = [batch_policy, sns_policy] - - self.state_machine_role_arn = self.iam_helper.role_stitcher(self.state_machine_role_name, 'states', - 'Permissions for statemachine to run jobs', - policies_list=policies_list) - - def 
create_state_machine(self): - - job_definition = f'''{{ - "Comment": "An example of the Amazon States Language for notification on an AWS Batch job completion", - "StartAt": "Submit Batch Job", - "States": {{ - "Submit Batch Job": {{ - "Type": "Task", - "Resource": "arn:aws:states:::batch:submitJob.sync", - "Parameters": {{ - "JobDefinition": "{self.job_definition_arn}", - "JobName": "{self.job_identifier}", - "JobQueue": "{self.job_queue_arn}", - "ArrayProperties": {{ - "Size.$": "$.array_size" - }} - }}, - "Next": "Notify Batch Success", - "Catch": [ - {{ - "ErrorEquals": [ "States.ALL" ], - "Next": "Notify Batch Failure" - }} - ] - }}, - "Notify Batch Success": {{ - "Type": "Task", - "Resource": "arn:aws:states:::sns:publish", - "Parameters": {{ - "Message": "Batch job submitted through Step Functions succeeded", - "TopicArn": "arn:aws:sns:{self.region}:{self.account}:{self.sns_state_machine_topic}" - }}, - "End": true - }}, - "Notify Batch Failure": {{ - "Type": "Task", - "Resource": "arn:aws:states:::sns:publish", - "Parameters": {{ - "Message": "Batch job submitted through Step Functions failed", - "TopicArn": "arn:aws:sns:{self.region}:{self.account}:{self.sns_state_machine_topic}" - }}, - "Next": "Job Failure" - }}, - "Job Failure": {{ - "Type": "Fail" - }} - }} -}} - - ''' - - while True: - - try: - response = self.step_functions.create_state_machine( - name=self.state_machine_name, - definition=job_definition, - roleArn=self.state_machine_role_arn, - tags=self.get_tags_lowercase() - ) - - # print(response) - self.state_machine_arn = response['stateMachineArn'] - logger.info(f"State machine {self.state_machine_name} created.") - break - except Exception as e: - if "AccessDeniedException" in str(e): - logger.info("State machine role not yet registered, sleeping...") - time.sleep(5) - elif "StateMachineAlreadyExists" in str(e): - logger.info("State machine already exists, skipping...") - self.state_machine_arn = f"arn:aws:states:{self.region}:{self.account}:stateMachine:{self.state_machine_name}" # noqa E501 - - break - else: - raise - - def start_state_machine_execution(self, array_size): - - self.step_functions.start_execution( - stateMachineArn=self.state_machine_arn, - name=f'{self.state_machine_name}_execution_{int(time.time())}', - input=f'{{"array_size": {array_size}}}' - ) - - logger.info(f"Starting state machine {self.state_machine_name}.") - def clean(self): # Get our vpc: @@ -958,19 +821,6 @@ def clean(self): if len(dsg.ip_permissions_egress): response = dsg.revoke_egress(IpPermissions=dsg.ip_permissions_egress) - state_machines = self.step_functions.list_state_machines() - - for sm in state_machines['stateMachines']: - if sm['name'] == self.state_machine_name: - self.state_machine_arn = sm['stateMachineArn'] - self.step_functions.delete_state_machine( - stateMachineArn=self.state_machine_arn - ) - logger.info(f"Deleted state machine {self.state_machine_name}.") - break - - self.iam_helper.delete_role(self.state_machine_role_name) - try: self.batch.update_job_queue( @@ -1199,42 +1049,6 @@ def clean(self): ) -class AwsSNS(AwsJobBase): - - def __init__(self, job_name, aws_config, boto3_session): - super().__init__(job_name, aws_config, boto3_session) - self.sns = self.session.client("sns", config=boto_client_config) - self.sns_state_machine_topic_arn = None - - def create_topic(self): - response = self.sns.create_topic( - Name=self.sns_state_machine_topic, - Tags=self.get_tags_uppercase() - ) - - logger.info(f"Simple notifications topic {self.sns_state_machine_topic} 
created.") - - self.sns_state_machine_topic_arn = response['TopicArn'] - - def subscribe_to_topic(self): - self.sns.subscribe( - TopicArn=self.sns_state_machine_topic_arn, - Protocol='email', - Endpoint=self.operator_email - ) - - logger.info( - f"Operator {self.operator_email} subscribed to topic - please confirm via email to recieve state machine progress messages." # noqa 501 - ) - - def clean(self): - self.sns.delete_topic( - TopicArn=f"arn:aws:sns:{self.region}:{self.account}:{self.sns_state_machine_topic}" - ) - - logger.info(f"Simple notifications topic {self.sns_state_machine_topic} deleted.") - - class AwsBatch(DockerBatchBase): def __init__(self, project_filename): @@ -1350,9 +1164,6 @@ def clean(self): batch_env = AwsBatchEnv(self.job_identifier, self.cfg['aws'], self.boto3_session) batch_env.clean() - sns_env = AwsSNS(self.job_identifier, self.cfg['aws'], self.boto3_session) - sns_env.clean() - def run_batch(self): """ Run a batch of simulations using AWS Batch @@ -1417,12 +1228,7 @@ def run_batch(self): n_sims = n_datapoints * (len(self.cfg.get('upgrades', [])) + 1) logger.debug('Total number of simulations = {}'.format(n_sims)) - # This is the maximum number of jobs that can be in an array - if self.batch_array_size <= 10000: - max_array_size = self.batch_array_size - else: - max_array_size = 10000 - n_sims_per_job = math.ceil(n_sims / max_array_size) + n_sims_per_job = math.ceil(n_sims / self.batch_array_size) n_sims_per_job = max(n_sims_per_job, 2) logger.debug('Number of simulations per array job = {}'.format(n_sims_per_job)) @@ -1514,18 +1320,8 @@ def run_batch(self): env_vars=env_vars ) - # SNS Topic - sns_env = AwsSNS(self.job_identifier, self.cfg['aws'], self.boto3_session) - sns_env.create_topic() - sns_env.subscribe_to_topic() - - # State machine - batch_env.create_state_machine_roles() - batch_env.create_state_machine() - # start job - batch_env.start_state_machine_execution(array_size) - + batch_env.submit_job(array_size=self.batch_array_size) logger.info('Batch job submitted. Check your email to subscribe to notifications.') @classmethod @@ -1713,6 +1509,11 @@ def main(): 'level': 'DEBUG', 'propagate': True, 'handlers': ['console'] + }, + 'buildstockbatch.aws': { + 'level': 'DEBUG', + 'propagate': True, + 'handlers': ['console'] } }, }) diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index 3a15f1a5..b5155033 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -196,13 +196,6 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.batch_use_spot = aws_config.get('use_spot', True) self.batch_spot_bid_percent = aws_config.get('spot_bid_percent', 100) - # Step Functions - self.state_machine_name = f"{self.job_identifier}_state_machine" - self.state_machine_role_name = f"{self.job_identifier}_state_machine_role" - - # SNS - self.sns_state_machine_topic = f"{self.job_identifier}_state_machine_notifications" - # VPC self.vpc_name = self.job_identifier self.vpc_id = '' # will be available after VPC creation @@ -230,7 +223,7 @@ def __repr__(self): S3 Bucket for Source Data: {self.s3_bucket} S3 Prefix for Source Data: {self.s3_bucket_prefix} -A state machine {self.state_machine_name} will execute an AWS Batch job {self.job_identifier} against the source data. +This will execute an AWS Batch job {self.job_identifier} against the source data. Notifications of execution progress will be sent to {self.operator_email} once the email subscription is confirmed. 
Once processing is complete the state machine will then launch an EMR cluster with a job to combine the results and create an AWS Glue table. diff --git a/buildstockbatch/schemas/v0.3.yaml b/buildstockbatch/schemas/v0.3.yaml index df80debc..b2bee7e3 100644 --- a/buildstockbatch/schemas/v0.3.yaml +++ b/buildstockbatch/schemas/v0.3.yaml @@ -23,7 +23,6 @@ aws-spec: spot_bid_percent: num(min=1, max=100, required=False) batch_array_size: num(min=1, max=10000, required=True) notifications_email: regex('^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$', name='email', required=True) - emr: include('aws-emr-spec', required=False) job_environment: include('aws-job-environment', required=False) tags: map(str(), str(), required=False) @@ -31,12 +30,6 @@ aws-job-environment: vcpus: int(min=1, max=36, required=False) memory: int(min=1024, required=False) -aws-emr-spec: - manager_instance_type: str(required=False) - worker_instance_type: str(required=False) - worker_instance_count: int(min=1, required=False) - dask_worker_vcores: int(min=1, required=False) - hpc-spec: account: str(required=True) minutes_per_sim: int(max=120, required=False) From 474537fb0921ef0bad280b27709540bd0275b860 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Fri, 27 Jan 2023 21:28:46 +0000 Subject: [PATCH 13/53] more cleanup, waiting for batch run to complete with progress bar --- Dockerfile | 2 +- buildstockbatch/aws/aws.py | 34 ++++-- .../aws/s3_assets/bootstrap-dask-custom | 107 ------------------ .../aws/s3_assets/lambda_function.py | 24 ---- .../aws/s3_assets/setup_postprocessing.py | 15 --- setup.py | 4 + 6 files changed, 30 insertions(+), 156 deletions(-) delete mode 100644 buildstockbatch/aws/s3_assets/bootstrap-dask-custom delete mode 100644 buildstockbatch/aws/s3_assets/lambda_function.py delete mode 100644 buildstockbatch/aws/s3_assets/setup_postprocessing.py diff --git a/Dockerfile b/Dockerfile index 3e2b448f..b8b248db 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,4 +4,4 @@ FROM --platform=linux/amd64 nrel/openstudio:$OS_VER RUN sudo apt update && sudo apt install -y python3-pip RUN sudo -H pip install --upgrade pip COPY . /buildstock-batch/ -RUN python3 -m pip install /buildstock-batch +RUN python3 -m pip install "/buildstock-batch[aws]" diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index c5f8d65d..9145bc11 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -34,6 +34,7 @@ import tempfile import re import time +import tqdm import io import zipfile @@ -761,7 +762,7 @@ def submit_job(self, array_size=4): while True: try: - self.batch.submit_job( + resp = self.batch.submit_job( jobName=self.job_identifier, jobQueue=self.batch_job_queue_name, arrayProperties={ @@ -772,7 +773,7 @@ def submit_job(self, array_size=4): ) logger.info(f"Job {self.job_identifier} submitted.") - break + return resp except Exception as e: @@ -1321,8 +1322,28 @@ def run_batch(self): ) # start job - batch_env.submit_job(array_size=self.batch_array_size) - logger.info('Batch job submitted. 
Check your email to subscribe to notifications.') + job_info = batch_env.submit_job(array_size=self.batch_array_size) + + # Monitor job status + with tqdm.tqdm(desc="Running Simulations", total=self.batch_array_size) as progress_bar: + job_status = None + while job_status not in ('SUCCEEDED', 'FAILED'): + time.sleep(10) + job_desc_resp = batch_env.batch.describe_jobs(jobs=[job_info['jobId']]) + job_status = job_desc_resp['jobs'][0]['status'] + + jobs_resp = batch_env.batch.list_jobs(arrayJobId=job_info['jobId'], jobStatus='SUCCEEDED') + n_succeeded = len(jobs_resp["jobSummaryList"]) + next_token = jobs_resp.get("nextToken") + while next_token is not None: + jobs_resp = batch_env.batch.list_jobs(arrayJobId=job_info['jobId'], jobStatus='SUCCEEDED', nextToken=next_token) + n_succeeded += len(jobs_resp["jobSummaryList"]) + next_token = jobs_resp.get("nextToken") + progress_bar.update(n_succeeded) + + logger.info(f"Batch job status: {job_status}") + if job_status == "FAILED": + raise RuntimeError("Batch Job Failed. Go look at the CloudWatch logs.") @classmethod def run_job(cls, job_id, bucket, prefix, job_name, region): @@ -1510,11 +1531,6 @@ def main(): 'propagate': True, 'handlers': ['console'] }, - 'buildstockbatch.aws': { - 'level': 'DEBUG', - 'propagate': True, - 'handlers': ['console'] - } }, }) print(AwsBatch.LOGO) diff --git a/buildstockbatch/aws/s3_assets/bootstrap-dask-custom b/buildstockbatch/aws/s3_assets/bootstrap-dask-custom deleted file mode 100644 index 0e2daa7c..00000000 --- a/buildstockbatch/aws/s3_assets/bootstrap-dask-custom +++ /dev/null @@ -1,107 +0,0 @@ -#!/bin/bash - -set -e - -# ----------------------------------------------------------------------------- -# 1. Check if running on the master node. If not, there's nothing do. -# ----------------------------------------------------------------------------- -grep -q '"isMaster": true' /mnt/var/lib/info/instance.json \ -|| { echo "Not running on master node, nothing to do" && exit 0; } - - -# ----------------------------------------------------------------------------- -# 2. Install Miniconda -# ----------------------------------------------------------------------------- -echo "Installing Miniconda" -curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh -bash /tmp/miniconda.sh -b -p $HOME/miniconda -rm /tmp/miniconda.sh -echo -e '\nexport PATH=$HOME/miniconda/bin:$PATH' >> $HOME/.bashrc -source $HOME/.bashrc -conda install mamba -n base -c conda-forge - - -# ----------------------------------------------------------------------------- -# 3. Install packages to use in packaged environment -# -# We install a few packages by default, and allow users to extend this list -# with a CLI flag: -# -# - dask-yarn >= 0.4.1, for deploying Dask on YARN. -# - pyarrow for working with hdfs, parquet, ORC, etc... -# - s3fs for access to s3 -# - nomkl to minimize environment size -# - conda-pack for packaging the environment for distribution -# ----------------------------------------------------------------------------- -echo "Installing base packages" -mamba install \ --c conda-forge \ --y \ --q \ -"python=3.10" \ -"dask>=2022.10.0" \ -"distributed>=2022.10.0" \ -"dask-yarn>=0.9.0" \ -"pandas" \ -"pyarrow" \ -"numpy" \ -"s3fs" \ -"boto3" \ -"conda-pack" - -aws s3 cp "$1" $HOME/postprocessing.tar.gz -pip install $HOME/postprocessing.tar.gz - -# ----------------------------------------------------------------------------- -# 4. 
Package the environment to be distributed to worker nodes -# ----------------------------------------------------------------------------- -echo "Packaging environment" -conda pack -q -o $HOME/environment.tar.gz - - -# ----------------------------------------------------------------------------- -# 5. List all packages in the worker environment -# ----------------------------------------------------------------------------- -echo "Packages installed in the worker environment:" -conda list - - -# ----------------------------------------------------------------------------- -# 6. Configure Dask -# -# This isn't necessary, but for this particular bootstrap script it will make a -# few things easier: -# -# - Configure the cluster's dashboard link to show the proxied version through -# jupyter-server-proxy. This allows access to the dashboard with only an ssh -# tunnel to the notebook. -# -# - Specify the pre-packaged python environment, so users don't have to -# -# - Set the default deploy-mode to local, so the dashboard proxying works -# -# - Specify the location of the native libhdfs library so pyarrow can find it -# on the workers and the client (if submitting applications). -# ------------------------------------------------------------------------------ -echo "Configuring Dask" -mkdir -p $HOME/.config/dask -cat <> $HOME/.config/dask/config.yaml -distributed: - dashboard: - link: "/proxy/{port}/status" - -yarn: - environment: /home/hadoop/environment.tar.gz - deploy-mode: local - - worker: - env: - ARROW_LIBHDFS_DIR: /usr/lib/hadoop/lib/native/ - - client: - env: - ARROW_LIBHDFS_DIR: /usr/lib/hadoop/lib/native/ -EOT -# Also set ARROW_LIBHDFS_DIR in ~/.bashrc so it's set for the local user -echo -e '\nexport ARROW_LIBHDFS_DIR=/usr/lib/hadoop/lib/native' >> $HOME/.bashrc - diff --git a/buildstockbatch/aws/s3_assets/lambda_function.py b/buildstockbatch/aws/s3_assets/lambda_function.py deleted file mode 100644 index d8c9c3d5..00000000 --- a/buildstockbatch/aws/s3_assets/lambda_function.py +++ /dev/null @@ -1,24 +0,0 @@ -import os -import io -import json -import boto3 -from pprint import pprint - - -def lambda_handler(event, context): - # some prep work needed for this - check your security groups - there may default groups if any EMR cluster - # was launched from the console - also prepare a bucket for logs - - # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/emr.html - - session = boto3.Session(region_name=os.environ['REGION']) - - s3 = session.client('s3') - with io.BytesIO() as f: - s3.download_fileobj(os.environ['BUCKET'], os.environ['EMR_CONFIG_JSON_KEY'], f) - args = json.loads(f.getvalue()) - - emr = session.client("emr") - - response = emr.run_job_flow(**args) - pprint(response) diff --git a/buildstockbatch/aws/s3_assets/setup_postprocessing.py b/buildstockbatch/aws/s3_assets/setup_postprocessing.py deleted file mode 100644 index 6ff9d50a..00000000 --- a/buildstockbatch/aws/s3_assets/setup_postprocessing.py +++ /dev/null @@ -1,15 +0,0 @@ -from setuptools import setup - -setup( - name='buildstockbatch-postprocessing', - version='0.1', - description='Just the stand alone postprocessing functions from Buildstock-Batch', - py_modules=['postprocessing'], - # install_requires=[ - # 'dask[complete]>=2022.10.0', - # 's3fs[boto3]', - # 'pandas', - # 'pyarrow', - # 'numpy' - # ] -) diff --git a/setup.py b/setup.py index c7c2716b..b5778e79 100644 --- a/setup.py +++ b/setup.py @@ -84,6 +84,10 @@ def run_tests(self): 'flake8', 'rope', 'doc8' + 'tqdm', + ], + 'aws': [ + 
'dask-cloudprovider[aws]', ] }, entry_points={ From ed5040719e987112dbfd4cb0390a7d3189b375ba Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Fri, 27 Jan 2023 23:49:37 +0000 Subject: [PATCH 14/53] adding dask cloudprovider to the mix, not tested --- buildstockbatch/aws/aws.py | 51 ++++++++++++++++++++++++++++++++------ buildstockbatch/base.py | 42 ++++++++++++++++++++----------- 2 files changed, 72 insertions(+), 21 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 9145bc11..7c133dde 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -15,6 +15,8 @@ from botocore.exceptions import ClientError import collections import csv +from dask.distributed import Client +from dask_cloudprovider.aws import FargateCluster from fsspec.implementations.local import LocalFileSystem import gzip import hashlib @@ -125,7 +127,7 @@ def copy_s3_file(src_bucket, src_key, dest_bucket, dest_key): class AwsBatchEnv(AwsJobBase): """ - Class to manage the AWS Batch environment and Step Function controller. + Class to manage the AWS Batch environment. """ def __init__(self, job_name, aws_config, boto3_session): @@ -784,6 +786,13 @@ def submit_job(self, array_size=4): else: raise + def create_dask_assets(self): + self.ecs.create_cluster( + clusterName=self.dask_ecs_cluster_name, + tags=self.get_tags_lowercase(), + capa + ) + def clean(self): # Get our vpc: @@ -1099,6 +1108,10 @@ def docker_image(self): def weather_dir(self): return self._weather_dir + @property + def results_dir(self): + return f'{self.s3_bucket}/{self.s3_bucket_prefix}/results' + @property def container_repo(self): repo_name = self.docker_image @@ -1112,6 +1125,9 @@ def container_repo(self): repo = resp['repository'] return repo + def image_url(self): + return f"{self.container_repo['repositoryUri']}:{self.job_identifier}" + def build_image(self): """ Build the docker image to use in the batch simulation @@ -1307,14 +1323,9 @@ def run_batch(self): env_vars = dict(S3_BUCKET=self.s3_bucket, S3_PREFIX=self.s3_bucket_prefix, JOB_NAME=self.job_identifier, REGION=self.region) - image_url = '{}:{}'.format( - self.container_repo['repositoryUri'], - self.job_identifier - ) - job_env_cfg = self.cfg['aws'].get('job_environment', {}) batch_env.create_job_definition( - image_url, + self.image_url, command=['python3', '-m', 'buildstockbatch.aws.aws'], vcpus=job_env_cfg.get('vcpus', 1), memory=job_env_cfg.get('memory', 1024), @@ -1500,6 +1511,32 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): elif os.path.isfile(item): os.remove(item) + def get_fs(self): + return S3FileSystem() + + def get_dask_client(self): + # TODO: Add (some of) these to the config and have sensible defaults. 
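+        # Values here are in ECS units: CPU in CPU units (1024 units = 1
+        # vCPU) and memory in MiB. Fargate only accepts certain CPU/memory
+        # pairings, e.g. 2048 CPU units allows 4096-16384 MiB of memory.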
+ batch_env = AwsBatchEnv(self.job_identifier, self.cfg['aws'], self.boto3_session) + m = 1024 + cluster = FargateCluster( + fargate_spot=True, + image=self.image_url, + cluster_name_template=f"dask-{self.job_identifier}", + scheduler_cpu=2 * m, + scheduler_mem=8 * m, + worker_cpu=2 * m, + worker_mem=8 * m, + n_workers=4, + task_role_policies=['arn:aws:iam::aws:policy/AmazonS3FullAccess'], + tags=batch_env.get_tags() + ) + client = Client(cluster) + return client + + def upload_results(self, *args, **kwargs): + """Do nothing because the results are already on S3""" + return self.s3_bucket, self.results_dir + @log_error_details() def main(): diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index 99904f27..c37e699d 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -792,25 +792,39 @@ def validate_openstudio_version(project_file): def get_dask_client(self): return Client() + def get_fs(self): + return LocalFileSystem() + + def upload_results(self, *args, **kwargs): + return postprocessing.upload_results(*args, **kwargs) + def process_results(self, skip_combine=False, force_upload=False): - self.get_dask_client() # noqa: F841 - - if self.cfg['workflow_generator']['type'] == 'residential_hpxml': - if 'simulation_output_report' in self.cfg['workflow_generator']['args'].keys(): - if 'timeseries_frequency' in self.cfg['workflow_generator']['args']['simulation_output_report'].keys(): - do_timeseries = \ - (self.cfg['workflow_generator']['args']['simulation_output_report']['timeseries_frequency'] != - 'none') - else: - do_timeseries = 'timeseries_csv_export' in self.cfg['workflow_generator']['args'].keys() + dask_client = self.get_dask_client() - fs = LocalFileSystem() - if not skip_combine: - postprocessing.combine_results(fs, self.results_dir, self.cfg, do_timeseries=do_timeseries) + try: + if self.cfg['workflow_generator']['type'] == 'residential_hpxml': + if 'simulation_output_report' in self.cfg['workflow_generator']['args'].keys(): + if 'timeseries_frequency' in self.cfg['workflow_generator']['args']['simulation_output_report'].keys(): + do_timeseries = \ + (self.cfg['workflow_generator']['args']['simulation_output_report']['timeseries_frequency'] != + 'none') + else: + do_timeseries = 'timeseries_csv_export' in self.cfg['workflow_generator']['args'].keys() + + fs = self.get_fs() + if not skip_combine: + postprocessing.combine_results(fs, self.results_dir, self.cfg, do_timeseries=do_timeseries) + finally: + cluster = getattr(dask_client, 'cluster', None) + dask_client.close() + try: + cluster.close() + except: + pass aws_conf = self.cfg.get('postprocessing', {}).get('aws', {}) if 's3' in aws_conf or force_upload: - s3_bucket, s3_prefix = postprocessing.upload_results(aws_conf, self.output_dir, self.results_dir) + s3_bucket, s3_prefix = self.upload_results(aws_conf, self.output_dir, self.results_dir) if 'athena' in aws_conf: postprocessing.create_athena_tables(aws_conf, os.path.basename(self.output_dir), s3_bucket, s3_prefix) From cf9b1fbbc03155affb36a8068756fb5107f3beb9 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Wed, 1 Feb 2023 22:45:35 +0000 Subject: [PATCH 15/53] first working version --- buildstockbatch/aws/aws.py | 68 ++++++++++++++++++++----------- buildstockbatch/base.py | 5 --- buildstockbatch/postprocessing.py | 8 ++-- setup.py | 4 +- 4 files changed, 51 insertions(+), 34 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 7c133dde..47a31b3d 100644 --- a/buildstockbatch/aws/aws.py +++ 
b/buildstockbatch/aws/aws.py @@ -599,20 +599,26 @@ def create_compute_environment(self, maxCPUs=10000): """ logger.debug(f"Creating launch template {self.launch_template_name}") - self.ec2.create_launch_template( - LaunchTemplateName=self.launch_template_name, - LaunchTemplateData={ - "BlockDeviceMappings": [ - { - "DeviceName": "/dev/xvda", - "Ebs": { - "VolumeSize": 100, - "VolumeType": "gp2" + try: + self.ec2.create_launch_template( + LaunchTemplateName=self.launch_template_name, + LaunchTemplateData={ + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/xvda", + "Ebs": { + "VolumeSize": 100, + "VolumeType": "gp2" + } } - } - ] - } - ) + ] + } + ) + except ClientError as error: + if error.response['Error']['Code'] == 'InvalidLaunchTemplateName.AlreadyExistsException': + logger.debug('Launch template exists, skipping creation') + else: + raise error while True: lt_resp = self.ec2.describe_launch_templates( @@ -786,13 +792,6 @@ def submit_job(self, array_size=4): else: raise - def create_dask_assets(self): - self.ecs.create_cluster( - clusterName=self.dask_ecs_cluster_name, - tags=self.get_tags_lowercase(), - capa - ) - def clean(self): # Get our vpc: @@ -1058,6 +1057,16 @@ def clean(self): AllocationId=this_address ) + try: + self.ec2.delete_security_group( + GroupName=f"dask-{self.job_identifier}" + ) + except ClientError as error: + if error.response['Error']['Code'] == 'InvalidGroup.NotFound': + pass + else: + raise error + class AwsBatch(DockerBatchBase): @@ -1125,6 +1134,7 @@ def container_repo(self): repo = resp['repository'] return repo + @property def image_url(self): return f"{self.container_repo['repositoryUri']}:{self.job_identifier}" @@ -1526,7 +1536,7 @@ def get_dask_client(self): scheduler_mem=8 * m, worker_cpu=2 * m, worker_mem=8 * m, - n_workers=4, + n_workers=12, task_role_policies=['arn:aws:iam::aws:policy/AmazonS3FullAccess'], tags=batch_env.get_tags() ) @@ -1581,16 +1591,22 @@ def main(): else: parser = argparse.ArgumentParser() parser.add_argument('project_filename') - parser.add_argument( + group = parser.add_mutually_exclusive_group() + group.add_argument( '-c', '--clean', action='store_true', help='After the simulation is done, run with --clean to clean up AWS environment' ) - parser.add_argument( + group.add_argument( '--validateonly', help='Only validate the project YAML file and references. 
Nothing is executed', action='store_true' ) + group.add_argument( + '--postprocessonly', + help='Only do postprocessing, useful for when the simulations are already done', + action='store_true' + ) args = parser.parse_args() # validate the project, and in case of the --validateonly flag return True if validation passes @@ -1601,10 +1617,16 @@ def main(): batch = AwsBatch(args.project_filename) if args.clean: batch.clean() + elif args.postprocessonly: + batch.build_image() + batch.push_image() + batch.process_results() else: batch.build_image() batch.push_image() batch.run_batch() + batch.process_results() + batch.clean() if __name__ == '__main__': diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index c37e699d..661a2bc3 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -815,12 +815,7 @@ def process_results(self, skip_combine=False, force_upload=False): if not skip_combine: postprocessing.combine_results(fs, self.results_dir, self.cfg, do_timeseries=do_timeseries) finally: - cluster = getattr(dask_client, 'cluster', None) dask_client.close() - try: - cluster.close() - except: - pass aws_conf = self.cfg.get('postprocessing', {}).get('aws', {}) if 's3' in aws_conf or force_upload: diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py index 5dff645c..054df70a 100644 --- a/buildstockbatch/postprocessing.py +++ b/buildstockbatch/postprocessing.py @@ -369,9 +369,9 @@ def get_upgrade_list(cfg): def write_metadata_files(fs, parquet_root_dir, partition_columns): - df = dd.read_parquet(parquet_root_dir) + df = dd.read_parquet(parquet_root_dir, filesystem=fs) sch = pa.Schema.from_pandas(df._meta_nonempty) - parquet.write_metadata(sch, f"{parquet_root_dir}/_common_metadata") + parquet.write_metadata(sch, f"{parquet_root_dir}/_common_metadata", filesystem=fs) logger.info(f"Written _common_metadata to {parquet_root_dir}") if partition_columns: @@ -555,12 +555,12 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ts_out_loc = f"{ts_dir}/upgrade={upgrade_id}/" else: assert isinstance(fs, S3FileSystem) - ts_out_loc = f"s3://{ts_dir}/upgrade={upgrade_id}/" + ts_out_loc = f"s3://{ts_dir}/upgrade={upgrade_id}" fs.makedirs(ts_out_loc) logger.info(f'Created directory {ts_out_loc} for writing. 
Now concatenating ...') - src_path = f'{ts_in_dir}/up{upgrade_id:02d}/' + src_path = f'{ts_in_dir}/up{upgrade_id:02d}' concat_partial = dask.delayed(partial(concat_and_normalize, fs, all_ts_cols_sorted, src_path, ts_out_loc, partition_columns)) partition_vals_list = [list(partition_df.loc[bldg_id_list[0]].values) if partition_columns else [] diff --git a/setup.py b/setup.py index b5778e79..773c466f 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,8 @@ def run_tests(self): 'fsspec', 'yamale', 'ruamel.yaml', - 'lxml' + 'lxml', + 'tqdm', ], extras_require={ 'dev': [ @@ -84,7 +85,6 @@ def run_tests(self): 'flake8', 'rope', 'doc8' - 'tqdm', ], 'aws': [ 'dask-cloudprovider[aws]', From 55c91a0276f0165d5fc089e53ea4bd41ad00c92c Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Thu, 2 Feb 2023 22:17:43 +0000 Subject: [PATCH 16/53] adding config options for dask --- buildstockbatch/aws/aws.py | 65 +++++++++++++-------- buildstockbatch/aws/s3_assets/bsb_post.py | 69 ----------------------- buildstockbatch/schemas/v0.3.yaml | 15 +++++ buildstockbatch/test/test_validation.py | 39 +++++++++++++ 4 files changed, 96 insertions(+), 92 deletions(-) delete mode 100644 buildstockbatch/aws/s3_assets/bsb_post.py diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 47a31b3d..613c9b63 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -1086,28 +1086,42 @@ def __init__(self, project_filename): self.boto3_session = boto3.Session(region_name=self.region) @staticmethod - def validate_instance_types(project_file): + def validate_dask_settings(project_file): cfg = get_project_configuration(project_file) - aws_config = cfg['aws'] - boto3_session = boto3.Session(region_name=aws_config['region']) - ec2 = boto3_session.client('ec2', config=boto_client_config) - job_base = AwsJobBase('genericjobid', aws_config, boto3_session) - instance_types_requested = set() - inst_type_resp = ec2.describe_instance_type_offerings(Filters=[{ - 'Name': 'instance-type', - 'Values': list(instance_types_requested) - }]) - instance_types_available = set([x['InstanceType'] for x in inst_type_resp['InstanceTypeOfferings']]) - if not instance_types_requested == instance_types_available: - instance_types_not_available = instance_types_requested - instance_types_available - raise ValidationError( - f"The instance type(s) {', '.join(instance_types_not_available)} are not available in region {aws_config['region']}." # noqa E501 - ) + if "emr" in cfg["aws"]: + logger.warning("The `aws.emr` configuration is no longer used and is ignored. Recommend removing.") + dask_cfg = cfg["aws"]["dask"] + errors = [] + mem_rules = { + 1024: (2, 8, 1), + 2048: (4, 16, 1), + 4096: (8, 30, 1), + 8192: (16, 60, 4), + 16384: (32, 120, 8), + } + for node_type in ("scheduler", "worker"): + mem = dask_cfg[f"{node_type}_memory"] + if mem % 1024 != 0: + errors.append( + f"`aws.dask.{node_type}_memory` = {mem}, needs to be a multiple of 1024." + ) + mem_gb = dask_cfg[f'{node_type}_memory'] // 1024 + min_gb, max_gb, incr_gb = mem_rules[dask_cfg[f"{node_type}_cpu"]] + if not (min_gb <= mem_gb <= max_gb and (mem_gb - min_gb) % incr_gb == 0): + errors.append( + f"`aws.dask.{node_type}_memory` = {mem}, " + f"should be between {min_gb * 1024} and {max_gb * 1024} in a multiple of {incr_gb * 1024}." 
+ ) + if errors: + errors.append("See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html") + raise ValidationError("\n".join(errors)) + + return True @staticmethod def validate_project(project_file): super(AwsBatch, AwsBatch).validate_project(project_file) - AwsBatch.validate_instance_types(project_file) + AwsBatch.validate_dask_settings(project_file) @property def docker_image(self): @@ -1121,6 +1135,10 @@ def weather_dir(self): def results_dir(self): return f'{self.s3_bucket}/{self.s3_bucket_prefix}/results' + @property + def output_dir(self): + return f'{self.s3_bucket}/{self.s3_bucket_prefix}' + @property def container_repo(self): repo_name = self.docker_image @@ -1525,18 +1543,19 @@ def get_fs(self): return S3FileSystem() def get_dask_client(self): - # TODO: Add (some of) these to the config and have sensible defaults. + dask_cfg = self.cfg["aws"]["dask"] + batch_env = AwsBatchEnv(self.job_identifier, self.cfg['aws'], self.boto3_session) m = 1024 cluster = FargateCluster( fargate_spot=True, image=self.image_url, cluster_name_template=f"dask-{self.job_identifier}", - scheduler_cpu=2 * m, - scheduler_mem=8 * m, - worker_cpu=2 * m, - worker_mem=8 * m, - n_workers=12, + scheduler_cpu=dask_cfg.get("scheduler_cpu", 2 * m), + scheduler_mem=dask_cfg.get("scheduler_memory", 8 * m), + worker_cpu=dask_cfg.get("worker_cpu", 2 * m), + worker_mem=dask_cfg.get("worker_memory", 8 * m), + n_workers=dask_cfg["n_workers"], task_role_policies=['arn:aws:iam::aws:policy/AmazonS3FullAccess'], tags=batch_env.get_tags() ) diff --git a/buildstockbatch/aws/s3_assets/bsb_post.py b/buildstockbatch/aws/s3_assets/bsb_post.py deleted file mode 100644 index 9d2c8b3d..00000000 --- a/buildstockbatch/aws/s3_assets/bsb_post.py +++ /dev/null @@ -1,69 +0,0 @@ -import argparse -import boto3 -from dask_yarn import YarnCluster -from dask.distributed import Client -import json -from s3fs import S3FileSystem - -from postprocessing import combine_results, create_athena_tables, remove_intermediate_files - - -def do_postprocessing(s3_bucket, s3_bucket_prefix): - - fs = S3FileSystem() - with fs.open(f'{s3_bucket}/{s3_bucket_prefix}/config.json', 'r') as f: - cfg = json.load(f) - - ec2 = boto3.client('ec2') - - with open('/mnt/var/lib/info/job-flow.json', 'r') as f: - job_flow_info = json.load(f) - - for instance_group in job_flow_info['instanceGroups']: - if instance_group['instanceRole'].lower() == 'core': - instance_type = instance_group['instanceType'] - instance_count = instance_group['requestedInstanceCount'] - - instance_info = ec2.describe_instance_types(InstanceTypes=[instance_type]) - - dask_worker_vcores = cfg['aws'].get('emr', {}).get('dask_worker_vcores', 2) - instance_memory = instance_info['InstanceTypes'][0]['MemoryInfo']['SizeInMiB'] - instance_ncpus = instance_info['InstanceTypes'][0]['VCpuInfo']['DefaultVCpus'] - n_dask_workers = instance_count * instance_ncpus // dask_worker_vcores - worker_memory = round(instance_memory / instance_ncpus * dask_worker_vcores * 0.95) - - cluster = YarnCluster( - deploy_mode='local', - worker_vcores=dask_worker_vcores, - worker_memory='{} MiB'.format(worker_memory), - n_workers=n_dask_workers - ) - - client = Client(cluster) # noqa E841 - - results_s3_loc = f'{s3_bucket}/{s3_bucket_prefix}/results' - - combine_results(fs, results_s3_loc, cfg) - - aws_conf = cfg.get('postprocessing', {}).get('aws', {}) - if 'athena' in aws_conf: - tbl_prefix = s3_bucket_prefix.split('/')[-1] - if not tbl_prefix: - tbl_prefix = cfg['aws']['job_identifier'] - 
create_athena_tables( - aws_conf, - tbl_prefix, - s3_bucket, - f'{s3_bucket_prefix}/results/parquet' - ) - - keep_individual_timeseries = cfg.get('postprocessing', {}).get('keep_individual_timeseries', False) - remove_intermediate_files(fs, results_s3_loc, keep_individual_timeseries) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('s3_bucket') - parser.add_argument('s3_bucket_prefix') - args = parser.parse_args() - do_postprocessing(args.s3_bucket, args.s3_bucket_prefix) diff --git a/buildstockbatch/schemas/v0.3.yaml b/buildstockbatch/schemas/v0.3.yaml index b2bee7e3..cdfa1716 100644 --- a/buildstockbatch/schemas/v0.3.yaml +++ b/buildstockbatch/schemas/v0.3.yaml @@ -23,6 +23,8 @@ aws-spec: spot_bid_percent: num(min=1, max=100, required=False) batch_array_size: num(min=1, max=10000, required=True) notifications_email: regex('^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$', name='email', required=True) + emr: include('aws-emr-spec', required=False) + dask: include('aws-dask-spec', required=True) job_environment: include('aws-job-environment', required=False) tags: map(str(), str(), required=False) @@ -30,6 +32,19 @@ aws-job-environment: vcpus: int(min=1, max=36, required=False) memory: int(min=1024, required=False) +aws-emr-spec: + manager_instance_type: str(required=False) + worker_instance_type: str(required=False) + worker_instance_count: int(min=1, required=False) + dask_worker_vcores: int(min=1, required=False) + +aws-dask-spec: + scheduler_cpu: enum(1024, 2048, 4096, 8192, 16384) + scheduler_memory: int(min=1024) + worker_cpu: enum(1024, 2048, 4096, 8192, 16384) + worker_memory: int(min=1024) + n_workers: int(min=1) + hpc-spec: account: str(required=True) minutes_per_sim: int(max=120, required=False) diff --git a/buildstockbatch/test/test_validation.py b/buildstockbatch/test/test_validation.py index 73ce0334..2c645187 100644 --- a/buildstockbatch/test/test_validation.py +++ b/buildstockbatch/test/test_validation.py @@ -18,6 +18,7 @@ import json import pathlib from buildstockbatch.eagle import EagleBatch +from buildstockbatch.aws.aws import AwsBatch from buildstockbatch.localdocker import LocalDockerBatch from buildstockbatch.base import BuildStockBatchBase, ValidationError from buildstockbatch.test.shared_testing_stuff import resstock_directory, resstock_required @@ -25,6 +26,7 @@ from unittest.mock import patch from testfixtures import LogCapture import logging +import shutil here = os.path.dirname(os.path.abspath(__file__)) example_yml_dir = os.path.join(here, 'test_inputs') @@ -54,6 +56,11 @@ def test_local_docker_validation_is_static(): assert isinstance(LocalDockerBatch.validate_project, types.FunctionType) +def test_aws_batch_validation_is_static(): + assert isinstance(AwsBatch.validate_project, types.FunctionType) + assert isinstance(AwsBatch.validate_dask_settings, types.FunctionType) + + def test_complete_schema_passes_validation(): assert BuildStockBatchBase.validate_project_schema(os.path.join(example_yml_dir, 'complete-schema.yml')) @@ -292,3 +299,35 @@ def test_validate_resstock_or_comstock_version(mocker): proj_filename = resstock_directory / "project_national" / "national_upgrades.yml" with pytest.raises(ValidationError): BuildStockBatchBase.validate_resstock_or_comstock_version(str(proj_filename)) + + +def test_dask_config(): + orig_filename = os.path.join(example_yml_dir, 'minimal-schema.yml') + cfg = get_project_configuration(orig_filename) + with tempfile.TemporaryDirectory() as tmpdir: + cfg["aws"] = { + "dask": { + 
"scheduler_cpu": 1024, + "scheduler_memory": 2048, + "worker_cpu": 1024, + "worker_memory": 2048, + "n_workers": 1 + } + } + test1_filename = os.path.join(tmpdir, "test1.yml") + with open(test1_filename, "w") as f: + json.dump(cfg, f) + AwsBatch.validate_dask_settings(test1_filename) + cfg["aws"]["dask"]["scheduler_memory"] = 9 * 1024 + test2_filename = os.path.join(tmpdir, "test2.yml") + with open(test2_filename, "w") as f: + json.dump(cfg, f) + with pytest.raises(ValidationError, match=r"between 2048 and 8192"): + AwsBatch.validate_dask_settings(test2_filename) + cfg["aws"]["dask"]["scheduler_memory"] = 8 * 1024 + cfg["aws"]["dask"]["worker_memory"] = 1025 + test3_filename = os.path.join(tmpdir, "test3.yml") + with open(test3_filename, "w") as f: + json.dump(cfg, f) + with pytest.raises(ValidationError, match=r"needs to be a multiple of 1024"): + AwsBatch.validate_dask_settings(test3_filename) From 3fe655e289926d148dc9e9302f4f6ff1183e4f18 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Thu, 9 Feb 2023 12:17:44 -0700 Subject: [PATCH 17/53] tag the nat gateway --- buildstockbatch/aws/aws.py | 1 + 1 file changed, 1 insertion(+) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 7c133dde..e3100899 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -394,6 +394,7 @@ def create_vpc(self): backoff(self.ec2.create_tags, Resources=[ + self.nat_gateway_id, self.priv_route_table_id ], Tags=self.get_tags_uppercase(Name=self.job_identifier) From b132b3e2c5ec766bd69e55b3f35bd87bcdff69b9 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Thu, 9 Feb 2023 20:20:55 +0000 Subject: [PATCH 18/53] fixes for glue crawling --- buildstockbatch/aws/aws.py | 38 +++++++++++++++++++++---------- buildstockbatch/base.py | 38 +++++++++++++++++-------------- buildstockbatch/eagle.py | 2 +- buildstockbatch/localdocker.py | 2 +- buildstockbatch/schemas/v0.3.yaml | 10 ++++---- 5 files changed, 54 insertions(+), 36 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 613c9b63..9cbddb03 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -38,7 +38,6 @@ import time import tqdm import io -import zipfile from buildstockbatch.localdocker import DockerBatchBase from buildstockbatch.base import ValidationError @@ -1100,13 +1099,13 @@ def validate_dask_settings(project_file): 16384: (32, 120, 8), } for node_type in ("scheduler", "worker"): - mem = dask_cfg[f"{node_type}_memory"] + mem = dask_cfg.get(f"{node_type}_memory", 8 * 1024) if mem % 1024 != 0: errors.append( f"`aws.dask.{node_type}_memory` = {mem}, needs to be a multiple of 1024." 
) - mem_gb = dask_cfg[f'{node_type}_memory'] // 1024 - min_gb, max_gb, incr_gb = mem_rules[dask_cfg[f"{node_type}_cpu"]] + mem_gb = mem // 1024 + min_gb, max_gb, incr_gb = mem_rules[dask_cfg.get(f"{node_type}_cpu", 2 * 1024)] if not (min_gb <= mem_gb <= max_gb and (mem_gb - min_gb) % incr_gb == 0): errors.append( f"`aws.dask.{node_type}_memory` = {mem}, " @@ -1361,10 +1360,11 @@ def run_batch(self): ) # start job - job_info = batch_env.submit_job(array_size=self.batch_array_size) + job_info = batch_env.submit_job(array_size=array_size) # Monitor job status - with tqdm.tqdm(desc="Running Simulations", total=self.batch_array_size) as progress_bar: + n_succeeded_last_time = 0 + with tqdm.tqdm(desc="Running Simulations", total=array_size) as progress_bar: job_status = None while job_status not in ('SUCCEEDED', 'FAILED'): time.sleep(10) @@ -1375,10 +1375,13 @@ def run_batch(self): n_succeeded = len(jobs_resp["jobSummaryList"]) next_token = jobs_resp.get("nextToken") while next_token is not None: - jobs_resp = batch_env.batch.list_jobs(arrayJobId=job_info['jobId'], jobStatus='SUCCEEDED', nextToken=next_token) + jobs_resp = batch_env.batch.list_jobs( + arrayJobId=job_info['jobId'], jobStatus='SUCCEEDED', nextToken=next_token + ) n_succeeded += len(jobs_resp["jobSummaryList"]) next_token = jobs_resp.get("nextToken") - progress_bar.update(n_succeeded) + progress_bar.update(n_succeeded - n_succeeded_last_time) + n_succeeded_last_time = n_succeeded logger.info(f"Batch job status: {job_status}") if job_status == "FAILED": @@ -1547,7 +1550,7 @@ def get_dask_client(self): batch_env = AwsBatchEnv(self.job_identifier, self.cfg['aws'], self.boto3_session) m = 1024 - cluster = FargateCluster( + self.dask_cluster = FargateCluster( fargate_spot=True, image=self.image_url, cluster_name_template=f"dask-{self.job_identifier}", @@ -1559,12 +1562,16 @@ def get_dask_client(self): task_role_policies=['arn:aws:iam::aws:policy/AmazonS3FullAccess'], tags=batch_env.get_tags() ) - client = Client(cluster) - return client + self.dask_client = Client(self.dask_cluster) + return self.dask_client + + def cleanup_dask(self): + self.dask_client.close() + self.dask_cluster.close() def upload_results(self, *args, **kwargs): """Do nothing because the results are already on S3""" - return self.s3_bucket, self.results_dir + return self.s3_bucket, self.s3_bucket_prefix + "/results/parquet" @log_error_details() @@ -1626,6 +1633,11 @@ def main(): help='Only do postprocessing, useful for when the simulations are already done', action='store_true' ) + group.add_argument( + '--crawl', + help='Only do the crawling in Athena. 
When simulations and postprocessing are done.', + action='store_true' + ) args = parser.parse_args() # validate the project, and in case of the --validateonly flag return True if validation passes @@ -1640,6 +1652,8 @@ def main(): batch.build_image() batch.push_image() batch.process_results() + elif args.crawl: + batch.process_results(skip_combine=True, use_dask_cluster=False) else: batch.build_image() batch.push_image() diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index 169e8ca7..0f71a6e9 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -792,36 +792,40 @@ def validate_openstudio_version(project_file): def get_dask_client(self): return Client() + def cleanup_dask(self): + pass + def get_fs(self): return LocalFileSystem() def upload_results(self, *args, **kwargs): return postprocessing.upload_results(*args, **kwargs) - def process_results(self, skip_combine=False, force_upload=False): - dask_client = self.get_dask_client() + def process_results(self, skip_combine=False, use_dask_cluster=True): + if use_dask_cluster: + self.get_dask_client() # noqa F841 try: + wfg_args = self.cfg['workflow_generator'].get('args', {}) if self.cfg['workflow_generator']['type'] == 'residential_hpxml': - if 'simulation_output_report' in self.cfg['workflow_generator']['args'].keys(): - if 'timeseries_frequency' in self.cfg['workflow_generator']['args']['simulation_output_report'].keys(): - do_timeseries = \ - (self.cfg['workflow_generator']['args']['simulation_output_report']['timeseries_frequency'] != - 'none') + if 'simulation_output_report' in wfg_args.keys(): + if 'timeseries_frequency' in wfg_args['simulation_output_report'].keys(): + do_timeseries = wfg_args['simulation_output_report']['timeseries_frequency'] != 'none' else: - do_timeseries = 'timeseries_csv_export' in self.cfg['workflow_generator']['args'].keys() + do_timeseries = 'timeseries_csv_export' in wfg_args.keys() fs = self.get_fs() if not skip_combine: postprocessing.combine_results(fs, self.results_dir, self.cfg, do_timeseries=do_timeseries) - finally: - dask_client.close() - aws_conf = self.cfg.get('postprocessing', {}).get('aws', {}) - if 's3' in aws_conf or force_upload: - s3_bucket, s3_prefix = self.upload_results(aws_conf, self.output_dir, self.results_dir) - if 'athena' in aws_conf: - postprocessing.create_athena_tables(aws_conf, os.path.basename(self.output_dir), s3_bucket, s3_prefix) + aws_conf = self.cfg.get('postprocessing', {}).get('aws', {}) + if 's3' in aws_conf or 'aws' in self.cfg: + s3_bucket, s3_prefix = self.upload_results(aws_conf, self.output_dir, self.results_dir) + if 'athena' in aws_conf: + postprocessing.create_athena_tables(aws_conf, os.path.basename(self.output_dir), s3_bucket, s3_prefix) + finally: + if use_dask_cluster: + self.cleanup_dask() - keep_individual_timeseries = self.cfg.get('postprocessing', {}).get('keep_individual_timeseries', False) - postprocessing.remove_intermediate_files(fs, self.results_dir, keep_individual_timeseries) + # keep_individual_timeseries = self.cfg.get('postprocessing', {}).get('keep_individual_timeseries', False) + # postprocessing.remove_intermediate_files(fs, self.results_dir, keep_individual_timeseries) diff --git a/buildstockbatch/eagle.py b/buildstockbatch/eagle.py index 131055e2..3ea34468 100644 --- a/buildstockbatch/eagle.py +++ b/buildstockbatch/eagle.py @@ -840,7 +840,7 @@ def main(): assert not measures_only assert not sampling_only if upload_only: - batch.process_results(skip_combine=True, force_upload=True) + 
batch.process_results(skip_combine=True) else: batch.process_results() else: diff --git a/buildstockbatch/localdocker.py b/buildstockbatch/localdocker.py index 7f037643..24b32f05 100644 --- a/buildstockbatch/localdocker.py +++ b/buildstockbatch/localdocker.py @@ -384,7 +384,7 @@ def main(): if args.measures_only or args.samplingonly: return if args.uploadonly: - batch.process_results(skip_combine=True, force_upload=True) + batch.process_results(skip_combine=True) else: batch.process_results() diff --git a/buildstockbatch/schemas/v0.3.yaml b/buildstockbatch/schemas/v0.3.yaml index cdfa1716..f1986725 100644 --- a/buildstockbatch/schemas/v0.3.yaml +++ b/buildstockbatch/schemas/v0.3.yaml @@ -39,11 +39,11 @@ aws-emr-spec: dask_worker_vcores: int(min=1, required=False) aws-dask-spec: - scheduler_cpu: enum(1024, 2048, 4096, 8192, 16384) - scheduler_memory: int(min=1024) - worker_cpu: enum(1024, 2048, 4096, 8192, 16384) - worker_memory: int(min=1024) - n_workers: int(min=1) + scheduler_cpu: enum(1024, 2048, 4096, 8192, 16384, required=False) + scheduler_memory: int(min=1024, required=False) + worker_cpu: enum(1024, 2048, 4096, 8192, 16384, required=False) + worker_memory: int(min=1024, required=False) + n_workers: int(min=1, required=True) hpc-spec: account: str(required=True) From ef7d92ccd220e30e0b95e7f3ee54fd160b8602a5 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Thu, 9 Feb 2023 20:26:09 +0000 Subject: [PATCH 19/53] fixing linting --- buildstockbatch/aws/aws.py | 6 ++++-- buildstockbatch/base.py | 7 ++++++- buildstockbatch/test/test_validation.py | 1 - buildstockbatch/workflow_generator/residential_hpxml.py | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 060656ca..20d6af9f 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -277,7 +277,8 @@ def create_vpc(self): self.internet_gateway_id = ig_response['InternetGateway']['InternetGatewayId'] - backoff(self.ec2.create_tags, + backoff( + self.ec2.create_tags, Resources=[ self.internet_gateway_id ], @@ -391,7 +392,8 @@ def create_vpc(self): logger.info("Route table created.") - backoff(self.ec2.create_tags, + backoff( + self.ec2.create_tags, Resources=[ self.nat_gateway_id, self.priv_route_table_id diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index 0f71a6e9..011a0634 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -822,7 +822,12 @@ def process_results(self, skip_combine=False, use_dask_cluster=True): if 's3' in aws_conf or 'aws' in self.cfg: s3_bucket, s3_prefix = self.upload_results(aws_conf, self.output_dir, self.results_dir) if 'athena' in aws_conf: - postprocessing.create_athena_tables(aws_conf, os.path.basename(self.output_dir), s3_bucket, s3_prefix) + postprocessing.create_athena_tables( + aws_conf, + os.path.basename(self.output_dir), + s3_bucket, + s3_prefix + ) finally: if use_dask_cluster: self.cleanup_dask() diff --git a/buildstockbatch/test/test_validation.py b/buildstockbatch/test/test_validation.py index 2c645187..987e3260 100644 --- a/buildstockbatch/test/test_validation.py +++ b/buildstockbatch/test/test_validation.py @@ -26,7 +26,6 @@ from unittest.mock import patch from testfixtures import LogCapture import logging -import shutil here = os.path.dirname(os.path.abspath(__file__)) example_yml_dir = os.path.join(here, 'test_inputs') diff --git a/buildstockbatch/workflow_generator/residential_hpxml.py b/buildstockbatch/workflow_generator/residential_hpxml.py index 5946026f..ac6373c9 
100644 --- a/buildstockbatch/workflow_generator/residential_hpxml.py +++ b/buildstockbatch/workflow_generator/residential_hpxml.py @@ -139,7 +139,7 @@ def validate(cls, cfg): retain_stdout_expandobject: bool(required=False) retain_schedules_csv: bool(required=False) debug: bool(required=False) - """ + """ # noqa E501 workflow_generator_args = cfg['workflow_generator']['args'] schema_yml = re.sub(r'^ {8}', '', schema_yml, flags=re.MULTILINE) schema = yamale.make_schema(content=schema_yml, parser='ruamel') From 543f72b685c9bd13458b2aea36d61968c960cb00 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Thu, 9 Feb 2023 20:34:44 +0000 Subject: [PATCH 20/53] updating ci to install aws extras --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 38a2d110..d6faab49 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,7 +36,7 @@ jobs: run: | cd buildstockbatch python -m pip install --progress-bar off --upgrade pip - pip install .[dev] --progress-bar off + pip install .[dev,aws] --progress-bar off - name: Linting run: | cd buildstockbatch From d7498f586e299a8b51ec2647cc1c2459fda78aee Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Mon, 6 Mar 2023 10:02:37 -0700 Subject: [PATCH 21/53] tagging nat gateway --- buildstockbatch/aws/aws.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index ad23a99e..d665452a 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -380,6 +380,12 @@ def create_vpc(self): self.nat_gateway_id = nat_response['NatGateway']['NatGatewayId'] + backoff( + self.ec2.create_tags, + Resources=[self.nat_gateway_id], + Tags=self.get_tags_uppercase(Name=self.job_identifier) + ) + logger.info("NAT Gateway created.") # Create a new private route table From 818893f78530a0222581aae08ec63aab5dd9a24d Mon Sep 17 00:00:00 2001 From: David Rager Date: Mon, 20 Mar 2023 15:56:52 -0600 Subject: [PATCH 22/53] Adding (and cleaning) an S3 VPC gateway. --- buildstockbatch/aws/aws.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index d665452a..46d1b1ca 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -152,6 +152,7 @@ def __init__(self, job_name, aws_config, boto3_session): self.service_role_arn = None self.instance_profile_arn = None self.job_queue_arn = None + self.s3_gateway_endpoint = None logger.propagate = False @@ -438,7 +439,20 @@ def create_vpc(self): logger.info("Nat Gateway not yet created. Sleeping...") else: raise + + gateway_response = self.ec2.create_vpc_endpoint( + VpcId=self.vpc_id, + ServiceName='com.amazonaws.us-west-2.s3', + RouteTableIds=[self.priv_route_table_id], + SubnetId=[self.priv_vpc_subnet_id_1, self.priv_vpc_subnet_id_2], + SecurityGroupIds=[self.batch_security_group], + PrivateDnsEnabled=True, + VpcEndpointType='Gateway', + PolicyDocument='{"Statement": [{"Action": "*", "Effect": "Allow", "Resource": "*", "Principal": "*"}]}' + ) + self.s3_gateway_endpoint = gateway_response['VpcEndpoint']['VpcEndpointId'] + def generate_name_value_inputs(self, var_dictionary): """ Helper to properly format more easily used dictionaries. 
@@ -925,6 +939,25 @@ def clean(self):
         for vpc in response['Vpcs']:
             this_vpc = vpc['VpcId']
 
+            s3gw_response = self.ec2.describe_vpc_endpoints(
+                Filters=[
+                    {
+                        'Name': 'vpc-id',
+                        'Values': [
+                            this_vpc
+                        ]
+                    }
+                ]
+            )
+
+            for s3gw in s3gw_response['VpcEndpoints']:
+                this_s3gw = s3gw['VpcEndpointId']
+
+                if s3gw['State'] != 'deleted':
+                    self.ec2.delete_nat_gateway(
+                        VpcEndpointIds=this_s3gw
+                    )
+
             ng_response = self.ec2.describe_nat_gateways(
                 Filters=[
                     {

From 8401004d94875a5984cc00d2f05d197b2e298c3d Mon Sep 17 00:00:00 2001
From: David Rager
Date: Wed, 22 Mar 2023 09:06:55 -0600
Subject: [PATCH 23/53] Cleaned up the cleanup of the S3 VPC gateway

---
 buildstockbatch/aws/aws.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py
index 46d1b1ca..2d302918 100644
--- a/buildstockbatch/aws/aws.py
+++ b/buildstockbatch/aws/aws.py
@@ -444,9 +444,6 @@ def create_vpc(self):
             VpcId=self.vpc_id,
             ServiceName='com.amazonaws.us-west-2.s3',
             RouteTableIds=[self.priv_route_table_id],
-            SubnetId=[self.priv_vpc_subnet_id_1, self.priv_vpc_subnet_id_2],
-            SecurityGroupIds=[self.batch_security_group],
-            PrivateDnsEnabled=True,
             VpcEndpointType='Gateway',
             PolicyDocument='{"Statement": [{"Action": "*", "Effect": "Allow", "Resource": "*", "Principal": "*"}]}'
         )
@@ -954,8 +951,8 @@ def clean(self):
             this_s3gw = s3gw['VpcEndpointId']
 
             if s3gw['State'] != 'deleted':
-                self.ec2.delete_nat_gateway(
-                    VpcEndpointIds=this_s3gw
+                self.ec2.delete_vpc_endpoints(
+                    VpcEndpointIds=[this_s3gw]
                 )

From 379a6db2a777351cd822cf51cb86dfc26bf0cc47 Mon Sep 17 00:00:00 2001
From: David Rager
Date: Mon, 1 May 2023 09:10:18 -0600
Subject: [PATCH 24/53] Latest modifications per tests.
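
Use the region from the AWS config instead of the hard-coded us-west-2 when
building the S3 gateway endpoint service name, attach the endpoint to the
public route table in addition to the private one, tag it like the other VPC
resources, and rename s3_gateway_endpoint to s3_gateway_endpoint_id. The
resulting call (a sketch; the full change is in the diff below) looks like:

    gateway_response = self.ec2.create_vpc_endpoint(
        VpcId=self.vpc_id,
        ServiceName=f'com.amazonaws.{self.region}.s3',
        RouteTableIds=[self.priv_route_table_id, self.pub_route_table_id],
        VpcEndpointType='Gateway',
        PolicyDocument='{"Statement": [{"Action": "*", "Effect": "Allow", "Resource": "*", "Principal": "*"}]}',
    )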
--- buildstockbatch/aws/aws.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 2d302918..ab209d6d 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -152,7 +152,8 @@ def __init__(self, job_name, aws_config, boto3_session): self.service_role_arn = None self.instance_profile_arn = None self.job_queue_arn = None - self.s3_gateway_endpoint = None + self.s3_gateway_endpoint_id = None + self.prefix_list_id = None logger.propagate = False @@ -442,13 +443,25 @@ def create_vpc(self): gateway_response = self.ec2.create_vpc_endpoint( VpcId=self.vpc_id, - ServiceName='com.amazonaws.us-west-2.s3', - RouteTableIds=[self.priv_route_table_id], + ServiceName=f'com.amazonaws.{self.region}.s3', + RouteTableIds=[self.priv_route_table_id, self.pub_route_table_id], VpcEndpointType='Gateway', - PolicyDocument='{"Statement": [{"Action": "*", "Effect": "Allow", "Resource": "*", "Principal": "*"}]}' + PolicyDocument='{"Statement": [{"Action": "*", "Effect": "Allow", "Resource": "*", "Principal": "*"}]}', ) - self.s3_gateway_endpoint = gateway_response['VpcEndpoint']['VpcEndpointId'] + logger.info("S3 gateway created for VPC.") + + self.s3_gateway_endpoint_id = gateway_response['VpcEndpoint']['VpcEndpointId'] + + backoff( + self.ec2.create_tags, + Resources=[ + self.s3_gateway_endpoint_id + ], + Tags=self.get_tags_uppercase(Name=self.job_identifier) + ) + + def generate_name_value_inputs(self, var_dictionary): """ From 712c233a603a4fa43625f563efcfcc0c2b3ac6cf Mon Sep 17 00:00:00 2001 From: Andrew Parker Date: Fri, 9 Jun 2023 09:20:54 -0600 Subject: [PATCH 25/53] Adds custom gems option to AWS docker image build --- Dockerfile | 16 +++- buildstockbatch/aws/aws.py | 58 +++++++++++- buildstockbatch/test/test_aws.py | 89 +++++++++++++++++++ .../resources/Gemfile | 2 +- 4 files changed, 161 insertions(+), 4 deletions(-) create mode 100644 buildstockbatch/test/test_aws.py diff --git a/Dockerfile b/Dockerfile index b8b248db..9a374264 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,21 @@ ARG OS_VER=3.5.0 -FROM --platform=linux/amd64 nrel/openstudio:$OS_VER +FROM --platform=linux/amd64 nrel/openstudio:$OS_VER as buildstockbatch RUN sudo apt update && sudo apt install -y python3-pip RUN sudo -H pip install --upgrade pip COPY . /buildstock-batch/ RUN python3 -m pip install "/buildstock-batch[aws]" + +# Base plus custom gems +FROM buildstockbatch as buildstockbatch-custom-gems +RUN sudo cp /buildstock-batch/Gemfile /var/oscli/ +# OpenStudio's docker image sets ENV BUNDLE_WITHOUT=native_ext +# https://github.com/NREL/docker-openstudio/blob/3.2.1/Dockerfile#L12 +# which overrides anything set via bundle config commands. +# Unset this so that bundle config commands work properly. 
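+# Bundler gives BUNDLE_* environment variables precedence over the global
+# config file that `bundle config set` writes, so the variable must be
+# cleared for the settings below to take effect.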
---
 Dockerfile                                    | 16 +++-
 buildstockbatch/aws/aws.py                    | 58 +++++++++++-
 buildstockbatch/test/test_aws.py              | 89 +++++++++++++++++++
 .../resources/Gemfile                         |  2 +-
 4 files changed, 161 insertions(+), 4 deletions(-)
 create mode 100644 buildstockbatch/test/test_aws.py

diff --git a/Dockerfile b/Dockerfile
index b8b248db..9a374264 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,21 @@
 ARG OS_VER=3.5.0
-FROM --platform=linux/amd64 nrel/openstudio:$OS_VER
+FROM --platform=linux/amd64 nrel/openstudio:$OS_VER as buildstockbatch
 
 RUN sudo apt update && sudo apt install -y python3-pip
 RUN sudo -H pip install --upgrade pip
 COPY . /buildstock-batch/
 RUN python3 -m pip install "/buildstock-batch[aws]"
+
+# Base plus custom gems
+FROM buildstockbatch as buildstockbatch-custom-gems
+RUN sudo cp /buildstock-batch/Gemfile /var/oscli/
+# OpenStudio's docker image sets ENV BUNDLE_WITHOUT=native_ext
+# https://github.com/NREL/docker-openstudio/blob/3.2.1/Dockerfile#L12
+# which overrides anything set via bundle config commands.
+# Unset this so that bundle config commands work properly.
+RUN unset BUNDLE_WITHOUT
+# Note the addition of 'set' in bundle config commands
+RUN bundle config set git.allow_insecure true
+RUN bundle config set path /var/oscli/gems/
+RUN bundle config set without 'test development native_ext'
+RUN bundle install --gemfile /var/oscli/Gemfile
diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py
index d665452a..927a8419 100644
--- a/buildstockbatch/aws/aws.py
+++ b/buildstockbatch/aws/aws.py
@@ -1194,13 +1194,67 @@ def build_image(self):
         root_path = pathlib.Path(os.path.abspath(__file__)).parent.parent.parent
         if not (root_path / 'Dockerfile').exists():
             raise RuntimeError(f'The needs to be run from the root of the repo, found {root_path}')
-        logger.debug('Building docker image')
-        self.docker_client.images.build(
+
+        # Make the buildstock/resources/.aws_docker_image dir to store logs
+        local_log_dir = os.path.join(self.buildstock_dir, 'resources', '.aws_docker_image')
+        if not os.path.exists(local_log_dir):
+            os.makedirs(local_log_dir)
+
+        # Determine whether or not to build the image with custom gems bundled in
+        if self.cfg.get('baseline', dict()).get('custom_gems', False):
+            # Ensure the custom Gemfile exists in the buildstock dir
+            local_gemfile_path = os.path.join(self.buildstock_dir, 'resources', 'Gemfile')
+            if not os.path.exists(local_gemfile_path):
+                raise AttributeError(f'baseline:custom_gems = True, but did not find Gemfile at {local_gemfile_path}')
+
+            # Copy the custom Gemfile into the buildstockbatch repo
+            bsb_root = os.path.join(os.path.abspath(__file__), os.pardir, os.pardir, os.pardir)
+            new_gemfile_path = os.path.join(bsb_root, 'Gemfile')
+            shutil.copyfile(local_gemfile_path, new_gemfile_path)
+            logger.info(f'Copying custom Gemfile from {local_gemfile_path}')
+
+            # Choose the custom-gems stage in the Dockerfile,
+            # which runs bundle install to build custom gems into the image
+            stage = 'buildstockbatch-custom-gems'
+        else:
+            # Choose the base stage in the Dockerfile,
+            # which stops before bundling custom gems into the image
+            stage = 'buildstockbatch'
+
+        logger.debug(f'Building docker image stage: {stage}')
+        img, build_logs = self.docker_client.images.build(
             path=str(root_path),
             tag=self.docker_image,
             rm=True,
+            target=stage,
             platform="linux/amd64"
         )
+        build_image_log = os.path.join(local_log_dir, 'build_image.log')
+        with open(build_image_log, 'w') as f_out:
+            f_out.write('Built image')
+            # for line in build_logs:
+            #     for k, v in line.items():
+            #         f_out.write(f'{k}: {v}')
+        logger.debug(f'Review docker image build log: {build_image_log}')
+
+        # Report gems included in the docker image.
+        # The OpenStudio Docker image installs the default gems
+        # to /var/oscli/gems, and the custom docker image
+        # overwrites these with the custom gems.
+ list_gems_cmd = 'openstudio --bundle /var/oscli/Gemfile --bundle_path /var/oscli/gems ' \ + '--bundle_without native_ext gem_list' + container_output = self.docker_client.containers.run( + self.docker_image, + list_gems_cmd, + remove=True, + name='list_gems' + ) + gem_list_log = os.path.join(local_log_dir, 'openstudio_gem_list_output.log') + with open(gem_list_log, 'wb') as f_out: + f_out.write(container_output) + for line in container_output.decode().split('\n'): + logger.debug(line) + logger.debug(f'Review custom gems list at: {gem_list_log}') def push_image(self): """ diff --git a/buildstockbatch/test/test_aws.py b/buildstockbatch/test/test_aws.py new file mode 100644 index 00000000..e7f4c24d --- /dev/null +++ b/buildstockbatch/test/test_aws.py @@ -0,0 +1,89 @@ +import os +import yaml +import logging + +from buildstockbatch.aws.aws import AwsBatch + +here = os.path.dirname(os.path.abspath(__file__)) +logging.basicConfig(level='DEBUG') # Use DEBUG, INFO, or WARNING +logger = logging.getLogger(__name__) + + +def test_custom_gem_install(basic_residential_project_file): + project_filename, results_dir = basic_residential_project_file() + + # Add aws and custom_gems to the project file + with open(project_filename, 'r') as f: + cfg = yaml.safe_load(f) + # custom_gems + cfg['baseline']['custom_gems'] = True + # AWS + cfg['aws'] = {} + cfg['aws']['job_identifier'] = 'testaws' + cfg['aws']['s3'] = {} + cfg['aws']['s3']['bucket'] = 'resbldg-datasets' + cfg['aws']['s3']['prefix'] = 'testing/external_demo_project' + cfg['aws']['emr'] = {} + cfg['aws']['emr']['manager_instance_type'] = 'm5.xlarge' + cfg['aws']['emr']['worker_instance_type'] = 'r5.4xlarge' + cfg['aws']['emr']['worker_instance_count'] = 1 + cfg['aws']['region'] = 'us-west-2' + cfg['aws']['use_spot'] = True + cfg['aws']['batch_array_size'] = 100 + cfg['aws']['notifications_email'] = 'user@example.com' + with open(project_filename, 'w') as f: + yaml.dump(cfg, f) + + buildstock_directory = cfg['buildstock_directory'] + + batch = AwsBatch(project_filename) + batch.build_image() + + gem_list_log_log_path = os.path.join(buildstock_directory, + 'resources', + '.aws_docker_image', + 'openstudio_gem_list_output.log') + assert os.path.exists(gem_list_log_log_path) + with open(gem_list_log_log_path, 'r') as gem_list: + contents = gem_list.read() + custom_gem = '/var/oscli/gems/ruby/2.7.0/gems/openstudio-standards-0.2.0' + assert custom_gem in contents + + +def test_no_custom_gem_install(basic_residential_project_file): + project_filename, results_dir = basic_residential_project_file() + + # Add aws to the project file + with open(project_filename, 'r') as f: + cfg = yaml.safe_load(f) + # AWS + cfg['aws'] = {} + cfg['aws']['job_identifier'] = 'testaws' + cfg['aws']['s3'] = {} + cfg['aws']['s3']['bucket'] = 'resbldg-datasets' + cfg['aws']['s3']['prefix'] = 'testing/external_demo_project' + cfg['aws']['emr'] = {} + cfg['aws']['emr']['manager_instance_type'] = 'm5.xlarge' + cfg['aws']['emr']['worker_instance_type'] = 'r5.4xlarge' + cfg['aws']['emr']['worker_instance_count'] = 1 + cfg['aws']['region'] = 'us-west-2' + cfg['aws']['use_spot'] = True + cfg['aws']['batch_array_size'] = 100 + cfg['aws']['notifications_email'] = 'user@example.com' + with open(project_filename, 'w') as f: + yaml.dump(cfg, f) + + buildstock_directory = cfg['buildstock_directory'] + + batch = AwsBatch(project_filename) + batch.build_image() + + gem_list_log_log_path = os.path.join(buildstock_directory, + 'resources', + '.aws_docker_image', + 
'openstudio_gem_list_output.log')
+    assert os.path.exists(gem_list_log_log_path)
+    with open(gem_list_log_log_path, 'r') as gem_list:
+        contents = gem_list.read()
+    custom_gem = '/var/oscli/gems/ruby/2.7.0/gems/openstudio-standards-0.2.0'
+    assert custom_gem not in contents
diff --git a/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/Gemfile b/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/Gemfile
index f8096f35..ffadc77f 100644
--- a/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/Gemfile
+++ b/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/Gemfile
@@ -14,7 +14,7 @@ ruby "~> 2.7.0"
 
 gem 'openstudio-extension', '= 0.5.1'
 gem 'openstudio-workflow', '= 2.3.1'
-gem 'openstudio-standards', '= 0.2.16'
+gem 'openstudio-standards', '= 0.2.0' # Deliberately obsolete version to ensure custom gems works as expected
 # gem 'openstudio-standards', git: 'https://github.com/NREL/openstudio-standards.git', ref: '971514ee0a64262a9c81788fd85fc60d8dd69980'
 
 group :native_ext do

From a9a240aa23795c37a5713812387b799e154bb20c Mon Sep 17 00:00:00 2001
From: Andrew Parker
Date: Fri, 9 Jun 2023 11:32:45 -0600
Subject: [PATCH 26/53] Activate custom gems when running AWS simulation if
 configured

---
 buildstockbatch/aws/aws.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py
index 927a8419..2dff7e50 100644
--- a/buildstockbatch/aws/aws.py
+++ b/buildstockbatch/aws/aws.py
@@ -1565,8 +1565,18 @@ def run_job(cls, job_id, bucket, prefix, job_name, region):
             with open(sim_dir / 'os_stdout.log', 'w') as f_out:
                 try:
                     logger.debug('Running {}'.format(sim_id))
+                    cli_cmd = ['openstudio', 'run', '-w', 'in.osw']
+                    if cfg.get('baseline', dict()).get('custom_gems', False):
+                        cli_cmd = [
+                            'openstudio',
+                            '--bundle', '/var/oscli/Gemfile',
+                            '--bundle_path', '/var/oscli/gems',
+                            '--bundle_without', 'native_ext',
+                            'run', '-w', 'in.osw',
+                            '--debug'
+                        ]
                     subprocess.run(
-                        ['openstudio', 'run', '-w', 'in.osw'],
+                        cli_cmd,
                         check=True,
                         stdout=f_out,
                         stderr=subprocess.STDOUT,

From ff9d5b85b349d560900fc63d64ff1c3dc55e9f7a Mon Sep 17 00:00:00 2001
From: Andrew Parker
Date: Fri, 9 Jun 2023 12:48:49 -0600
Subject: [PATCH 27/53] Pull AWS base OpenStudio docker image version from
 config or bsb default
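
With the default value removed from ARG OS_VER, the base image version
must now be supplied at build time, otherwise the FROM line resolves to an
empty tag and the build fails. build_image() forwards it through
docker-py's buildargs, as the diff below shows; a standalone sketch of the
same call (path, tag, and version are illustrative):

    import docker

    client = docker.from_env()
    image, build_logs = client.images.build(
        path=".",
        target="buildstockbatch",
        tag="nrel/buildstockbatch",
        buildargs={"OS_VER": "3.5.0"},  # forwarded to ARG OS_VER in the Dockerfile
        rm=True,
    )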
---
 Dockerfile                 |  2 +-
 buildstockbatch/aws/aws.py | 26 +++++++++++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 9a374264..27335de8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG OS_VER=3.5.0
+ARG OS_VER
 FROM --platform=linux/amd64 nrel/openstudio:$OS_VER as buildstockbatch
 
 RUN sudo apt update && sudo apt install -y python3-pip
diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py
index 2dff7e50..4bd54d55 100644
--- a/buildstockbatch/aws/aws.py
+++ b/buildstockbatch/aws/aws.py
@@ -1221,22 +1221,38 @@ def build_image(self):
             # which stops before bundling custom gems into the image
             stage = 'buildstockbatch'
 
-        logger.debug(f'Building docker image stage: {stage}')
+        logger.info(f'Building docker image stage: {stage} from OpenStudio {self.os_version}')
         img, build_logs = self.docker_client.images.build(
             path=str(root_path),
             tag=self.docker_image,
             rm=True,
             target=stage,
-            platform="linux/amd64"
+            platform="linux/amd64",
+            buildargs={'OS_VER': self.os_version}
         )
         build_image_log = os.path.join(local_log_dir, 'build_image.log')
+        build_image_log = f'C:/Scratch/ComStock/efforts/aws_testing/build_image_{stage}.log'
         with open(build_image_log, 'w') as f_out:
             f_out.write('Built image')
-            # for line in build_logs:
-            #     for k, v in line.items():
-            #         f_out.write(f'{k}: {v}')
+            for line in build_logs:
+                for itm_type, item_msg in line.items():
+                    if itm_type in ['stream', 'status']:
+                        try:
+                            f_out.write(f'{item_msg}')
+                        except UnicodeEncodeError:
+                            pass
         logger.debug(f'Review docker image build log: {build_image_log}')
 
+        # Report and confirm the openstudio version from the image
+        os_ver_cmd = 'openstudio openstudio_version'
+        container_output = self.docker_client.containers.run(
+            self.docker_image,
+            os_ver_cmd,
+            remove=True,
+            name='list_openstudio_version'
+        )
+        assert self.os_version in container_output.decode()
+
         # Report gems included in the docker image.
         # The OpenStudio Docker image installs the default gems
         # to /var/oscli/gems, and the custom docker image

From 10db1adde74c917073e53051586911d06e6afc89 Mon Sep 17 00:00:00 2001
From: Andrew Parker
Date: Fri, 14 Jul 2023 07:04:01 -0600
Subject: [PATCH 28/53] Removes local testing path

---
 buildstockbatch/aws/aws.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py
index 4bd54d55..9fac05f8 100644
--- a/buildstockbatch/aws/aws.py
+++ b/buildstockbatch/aws/aws.py
@@ -1231,7 +1231,6 @@ def build_image(self):
             buildargs={'OS_VER': self.os_version}
         )
         build_image_log = os.path.join(local_log_dir, 'build_image.log')
-        build_image_log = f'C:/Scratch/ComStock/efforts/aws_testing/build_image_{stage}.log'
         with open(build_image_log, 'w') as f_out:
             f_out.write('Built image')
             for line in build_logs:

From 65080670cc43ddd252d4025caffb8280b99a85b0 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Fri, 1 Sep 2023 21:12:39 +0000
Subject: [PATCH 29/53] adding some backoffs

---
 buildstockbatch/aws/aws.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py
index ab209d6d..b8dda1ac 100644
--- a/buildstockbatch/aws/aws.py
+++ b/buildstockbatch/aws/aws.py
@@ -261,14 +261,16 @@ def create_vpc(self):
 
         logger.info("Private subnet created.")
 
-        self.ec2.create_tags(
+        backoff(
+            self.ec2.create_tags,
             Resources=[
                 self.priv_vpc_subnet_id_1
             ],
             Tags=self.get_tags_uppercase(Name=self.job_identifier)
         )
 
-        self.ec2.create_tags(
+        backoff(
+            self.ec2.create_tags,
             Resources=[
                 self.priv_vpc_subnet_id_2
             ],
@@ -300,7 +302,8 @@ def create_vpc(self):
 
         self.pub_vpc_subnet_id = pub_response['Subnet']['SubnetId']
 
-        self.ec2.create_tags(
+        backoff(
+            self.ec2.create_tags,
             Resources=[
                 self.pub_vpc_subnet_id
             ],
@@ -1626,6 +1629,7 @@ def get_dask_client(self):
         batch_env = AwsBatchEnv(self.job_identifier, self.cfg['aws'], self.boto3_session)
         m = 1024
         self.dask_cluster = FargateCluster(
+            region_name=self.region,
             fargate_spot=True,
             image=self.image_url,
             cluster_name_template=f"dask-{self.job_identifier}",

From 9330d85a542de28b043b59443863a2ac02c3c599 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Thu, 19 Oct 2023 22:31:09 +0000
Subject: [PATCH 30/53] saving non-working micromamba in docker stuff

---
 Dockerfile                 | 21 +++++++++++++++++----
 buildstockbatch/aws/aws.py |  7 ++++++-
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b8b248db..eeb52d2e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,20 @@
-ARG OS_VER=3.5.0
+ARG OS_VER=3.6.1
+ARG PYTHON_VER=3.11.5
 FROM --platform=linux/amd64 nrel/openstudio:$OS_VER
 
-RUN sudo apt update && sudo apt install -y python3-pip
-RUN
sudo -H pip install --upgrade pip +RUN curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba && \ + mv bin/micromamba /usr/local/bin/ && \ + rm -rf bin && \ + micromamba shell init -s bash -p /opt/micromamba && \ + micromamba config append channels conda-forge && \ + micromamba config append channels nodefaults && \ + micromamba config set channel_priority strict COPY . /buildstock-batch/ -RUN python3 -m pip install "/buildstock-batch[aws]" +RUN eval "$( micromamba shell hook --shell=bash /opt/micromamba )" && \ + micromamba activate /opt/micromamba && \ + micromamba install -y python=$PYTHON_VER && \ + python -m pip install "/buildstock-batch[aws]" + + + +# sed -i '/[ -z "\$PS1" ] && return/d' /root/.bashrc diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 43353825..281fbc42 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -28,6 +28,7 @@ import math import os import pathlib +import platform import random from s3fs import S3FileSystem import shutil @@ -1244,7 +1245,11 @@ def build_image(self): path=str(root_path), tag=self.docker_image, rm=True, - platform="linux/amd64" + platform="linux/amd64", + buildargs={ + 'OS_VER': self.os_version, + 'PYTHON_VER': platform.python_version() + } ) def push_image(self): From 3c5cc548d948222047b35322381b50aa6a34e6f1 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Wed, 25 Oct 2023 10:29:55 -0600 Subject: [PATCH 31/53] Revert "saving non-working micromamba in docker stuff" This reverts commit 9330d85a542de28b043b59443863a2ac02c3c599. --- Dockerfile | 21 ++++----------------- buildstockbatch/aws/aws.py | 7 +------ 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/Dockerfile b/Dockerfile index eeb52d2e..b8b248db 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,20 +1,7 @@ -ARG OS_VER=3.6.1 -ARG PYTHON_VER=3.11.5 +ARG OS_VER=3.5.0 FROM --platform=linux/amd64 nrel/openstudio:$OS_VER -RUN curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba && \ - mv bin/micromamba /usr/local/bin/ && \ - rm -rf bin && \ - micromamba shell init -s bash -p /opt/micromamba && \ - micromamba config append channels conda-forge && \ - micromamba config append channels nodefaults && \ - micromamba config set channel_priority strict +RUN sudo apt update && sudo apt install -y python3-pip +RUN sudo -H pip install --upgrade pip COPY . 
/buildstock-batch/ -RUN eval "$( micromamba shell hook --shell=bash /opt/micromamba )" && \ - micromamba activate /opt/micromamba && \ - micromamba install -y python=$PYTHON_VER && \ - python -m pip install "/buildstock-batch[aws]" - - - -# sed -i '/[ -z "\$PS1" ] && return/d' /root/.bashrc +RUN python3 -m pip install "/buildstock-batch[aws]" diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 281fbc42..43353825 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -28,7 +28,6 @@ import math import os import pathlib -import platform import random from s3fs import S3FileSystem import shutil @@ -1245,11 +1244,7 @@ def build_image(self): path=str(root_path), tag=self.docker_image, rm=True, - platform="linux/amd64", - buildargs={ - 'OS_VER': self.os_version, - 'PYTHON_VER': platform.python_version() - } + platform="linux/amd64" ) def push_image(self): From 2e472e889e1abec9187e2ac7388ce3ad36cb3848 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Mon, 29 Jan 2024 16:43:49 +0000 Subject: [PATCH 32/53] reformatting with black --- buildstockbatch/__version__.py | 20 +- buildstockbatch/aws/aws.py | 1190 +++++++++-------- buildstockbatch/aws/awsbase.py | 168 +-- buildstockbatch/base.py | 620 ++++++--- buildstockbatch/eagle.py | 704 +++++----- buildstockbatch/local.py | 302 +++-- buildstockbatch/postprocessing.py | 592 ++++---- buildstockbatch/sampler/__init__.py | 5 +- buildstockbatch/sampler/base.py | 17 +- buildstockbatch/sampler/commercial_sobol.py | 125 +- buildstockbatch/sampler/downselect.py | 48 +- buildstockbatch/sampler/precomputed.py | 10 +- buildstockbatch/sampler/residential_quota.py | 89 +- buildstockbatch/sampler/sobol_lib.py | 472 +++++-- buildstockbatch/test/conftest.py | 114 +- buildstockbatch/test/shared_testing_stuff.py | 8 +- buildstockbatch/test/test_aws.py | 96 +- buildstockbatch/test/test_base.py | 317 +++-- buildstockbatch/test/test_docker.py | 25 +- buildstockbatch/test/test_eagle.py | 490 ++++--- buildstockbatch/test/test_local.py | 76 +- buildstockbatch/test/test_postprocessing.py | 104 +- buildstockbatch/test/test_utils.py | 29 +- buildstockbatch/test/test_validation.py | 362 +++-- buildstockbatch/utils.py | 52 +- .../workflow_generator/__init__.py | 2 +- buildstockbatch/workflow_generator/base.py | 14 +- .../workflow_generator/commercial.py | 149 ++- .../workflow_generator/residential_hpxml.py | 570 ++++---- .../test_workflow_generator.py | 464 ++++--- docs/conf.py | 103 +- setup.py | 121 +- 32 files changed, 4394 insertions(+), 3064 deletions(-) diff --git a/buildstockbatch/__version__.py b/buildstockbatch/__version__.py index cfab5a93..b5750e31 100644 --- a/buildstockbatch/__version__.py +++ b/buildstockbatch/__version__.py @@ -1,12 +1,14 @@ import datetime as dt -__title__ = 'buildstockbatch' -__description__ = 'Executing BuildStock projects on batch infrastructure.' -__url__ = 'http://github.com/NREL/buildstockbatch' -__version__ = '2023.10.0' -__schema_version__ = '0.3' -__author__ = 'Noel Merket' -__author_email__ = 'noel.merket@nrel.gov' -__license__ = 'BSD-3' -__copyright__ = 'Copyright {} The Alliance for Sustainable Energy'.format(dt.date.today().year) +__title__ = "buildstockbatch" +__description__ = "Executing BuildStock projects on batch infrastructure." 
+__url__ = "http://github.com/NREL/buildstockbatch" +__version__ = "2023.10.0" +__schema_version__ = "0.3" +__author__ = "Noel Merket" +__author_email__ = "noel.merket@nrel.gov" +__license__ = "BSD-3" +__copyright__ = "Copyright {} The Alliance for Sustainable Energy".format( + dt.date.today().year +) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index e33e029c..5d02493b 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -42,7 +42,12 @@ from buildstockbatch.base import ValidationError, BuildStockBatchBase from buildstockbatch.aws.awsbase import AwsJobBase, boto_client_config from buildstockbatch import postprocessing -from buildstockbatch.utils import ContainerRuntime, log_error_details, get_project_configuration, read_csv +from buildstockbatch.utils import ( + ContainerRuntime, + log_error_details, + get_project_configuration, + read_csv, +) logger = logging.getLogger(__name__) @@ -51,9 +56,7 @@ def backoff(thefunc, *args, **kwargs): backoff_mult = 1.1 delay = 3 tries = 5 - error_patterns = [ - r"\w+.NotFound" - ] + error_patterns = [r"\w+.NotFound"] while tries > 0: try: result = thefunc(*args, **kwargs) @@ -62,7 +65,9 @@ def backoff(thefunc, *args, **kwargs): caught_error = False for pat in error_patterns: if re.search(pat, error_code): - logger.debug(f"{error_code}: Waiting and retrying in {delay} seconds") + logger.debug( + f"{error_code}: Waiting and retrying in {delay} seconds" + ) caught_error = True time.sleep(delay) delay *= backoff_mult @@ -75,7 +80,7 @@ def backoff(thefunc, *args, **kwargs): def upload_file_to_s3(*args, **kwargs): - s3 = boto3.client('s3', config=boto_client_config) + s3 = boto3.client("s3", config=boto_client_config) s3.upload_file(*args, **kwargs) @@ -85,42 +90,36 @@ def upload_directory_to_s3(local_directory, bucket, prefix): def filename_generator(): for dirpath, dirnames, filenames in os.walk(local_dir_abs): for filename in filenames: - if filename.startswith('.'): + if filename.startswith("."): continue local_filepath = pathlib.Path(dirpath, filename) s3_key = pathlib.PurePosixPath( - prefix, - local_filepath.relative_to(local_dir_abs) + prefix, local_filepath.relative_to(local_dir_abs) ) yield local_filepath, s3_key - logger.debug('Uploading {} => {}/{}'.format(local_dir_abs, bucket, prefix)) + logger.debug("Uploading {} => {}/{}".format(local_dir_abs, bucket, prefix)) Parallel(n_jobs=-1, verbose=9)( delayed(upload_file_to_s3)(str(local_file), bucket, s3_key.as_posix()) - for local_file, s3_key - in filename_generator() + for local_file, s3_key in filename_generator() ) def compress_file(in_filename, out_filename): - with gzip.open(str(out_filename), 'wb') as f_out: - with open(str(in_filename), 'rb') as f_in: + with gzip.open(str(out_filename), "wb") as f_out: + with open(str(in_filename), "rb") as f_in: shutil.copyfileobj(f_in, f_out) def calc_hash_for_file(filename): - with open(filename, 'rb') as f: + with open(filename, "rb") as f: return hashlib.sha256(f.read()).hexdigest() def copy_s3_file(src_bucket, src_key, dest_bucket, dest_key): - s3 = boto3.client('s3', config=boto_client_config) - s3.copy( - {'Bucket': src_bucket, 'Key': src_key}, - dest_bucket, - dest_key - ) + s3 = boto3.client("s3", config=boto_client_config) + s3.copy({"Bucket": src_bucket, "Key": src_key}, dest_bucket, dest_key) class AwsBatchEnv(AwsJobBase): @@ -136,13 +135,15 @@ def __init__(self, job_name, aws_config, boto3_session): """ super().__init__(job_name, aws_config, boto3_session) - self.batch = self.session.client('batch', 
config=boto_client_config) - self.ec2 = self.session.client('ec2', config=boto_client_config) - self.ec2r = self.session.resource('ec2', config=boto_client_config) - self.step_functions = self.session.client('stepfunctions', config=boto_client_config) - self.aws_lambda = self.session.client('lambda', config=boto_client_config) - self.s3 = self.session.client('s3', config=boto_client_config) - self.s3_res = self.session.resource('s3', config=boto_client_config) + self.batch = self.session.client("batch", config=boto_client_config) + self.ec2 = self.session.client("ec2", config=boto_client_config) + self.ec2r = self.session.resource("ec2", config=boto_client_config) + self.step_functions = self.session.client( + "stepfunctions", config=boto_client_config + ) + self.aws_lambda = self.session.client("lambda", config=boto_client_config) + self.s3 = self.session.client("s3", config=boto_client_config) + self.s3_res = self.session.resource("s3", config=boto_client_config) self.task_role_arn = None self.job_definition_arn = None @@ -163,41 +164,39 @@ def __repr__(self): def create_vpc(self): cidrs_in_use = set() vpc_response = self.ec2.describe_vpcs() - for vpc in vpc_response['Vpcs']: - cidrs_in_use.add(vpc['CidrBlock']) - for cidr_assoc in vpc['CidrBlockAssociationSet']: - cidrs_in_use.add(cidr_assoc['CidrBlock']) + for vpc in vpc_response["Vpcs"]: + cidrs_in_use.add(vpc["CidrBlock"]) + for cidr_assoc in vpc["CidrBlockAssociationSet"]: + cidrs_in_use.add(cidr_assoc["CidrBlock"]) need_to_find_cidr = True while need_to_find_cidr: - self.vpc_cidr = '172.{}.0.0/16'.format(random.randrange(100, 200)) + self.vpc_cidr = "172.{}.0.0/16".format(random.randrange(100, 200)) need_to_find_cidr = self.vpc_cidr in cidrs_in_use - self.pub_subnet_cidr = self.vpc_cidr.replace('/16', '/17') - self.priv_subnet_cidr_1 = self.vpc_cidr.replace('.0.0/16', '.128.0/18') - self.priv_subnet_cidr_2 = self.vpc_cidr.replace('.0.0/16', '.192.0/18') + self.pub_subnet_cidr = self.vpc_cidr.replace("/16", "/17") + self.priv_subnet_cidr_1 = self.vpc_cidr.replace(".0.0/16", ".128.0/18") + self.priv_subnet_cidr_2 = self.vpc_cidr.replace(".0.0/16", ".192.0/18") # Create the VPC response = self.ec2.create_vpc( CidrBlock=self.vpc_cidr, AmazonProvidedIpv6CidrBlock=False, - InstanceTenancy='default' + InstanceTenancy="default", ) - self.vpc_id = response['Vpc']['VpcId'] + self.vpc_id = response["Vpc"]["VpcId"] logger.info(f"VPC {self.vpc_id} created") while True: try: self.ec2.create_tags( - Resources=[ - self.vpc_id - ], - Tags=self.get_tags_uppercase(Name=self.job_identifier) + Resources=[self.vpc_id], + Tags=self.get_tags_uppercase(Name=self.job_identifier), ) break except Exception as e: - if 'InvalidVpcID.NotFound' in str(e): + if "InvalidVpcID.NotFound" in str(e): logger.info("Cannot tag VPC. VPC not yet created. 
Sleeping...") time.sleep(5) else: @@ -207,35 +206,26 @@ def create_vpc(self): sec_response = self.ec2.describe_security_groups( Filters=[ - { - 'Name': 'vpc-id', - 'Values': [ - self.vpc_id - ] - }, + {"Name": "vpc-id", "Values": [self.vpc_id]}, ] ) - self.batch_security_group = sec_response['SecurityGroups'][0]['GroupId'] + self.batch_security_group = sec_response["SecurityGroups"][0]["GroupId"] - logger.info(f'Security group {self.batch_security_group} created for vpc/job.') + logger.info(f"Security group {self.batch_security_group} created for vpc/job.") response = self.ec2.authorize_security_group_ingress( - GroupId=self.batch_security_group, IpPermissions=[ { - 'FromPort': 0, - 'IpProtocol': 'tcp', - 'IpRanges': [ - { - 'CidrIp': '0.0.0.0/0' - }, + "FromPort": 0, + "IpProtocol": "tcp", + "IpRanges": [ + {"CidrIp": "0.0.0.0/0"}, ], - - 'ToPort': 65535 + "ToPort": 65535, }, - ] + ], ) # Create the private subnets @@ -243,100 +233,86 @@ def create_vpc(self): priv_response_1 = self.ec2.create_subnet( CidrBlock=self.priv_subnet_cidr_1, AvailabilityZone=f"{self.region}a", - VpcId=self.vpc_id + VpcId=self.vpc_id, ) - self.priv_vpc_subnet_id_1 = priv_response_1['Subnet']['SubnetId'] + self.priv_vpc_subnet_id_1 = priv_response_1["Subnet"]["SubnetId"] logger.info("Private subnet created.") priv_response_2 = self.ec2.create_subnet( CidrBlock=self.priv_subnet_cidr_2, AvailabilityZone=f"{self.region}b", - VpcId=self.vpc_id + VpcId=self.vpc_id, ) - self.priv_vpc_subnet_id_2 = priv_response_2['Subnet']['SubnetId'] + self.priv_vpc_subnet_id_2 = priv_response_2["Subnet"]["SubnetId"] logger.info("Private subnet created.") backoff( self.ec2.create_tags, - Resources=[ - self.priv_vpc_subnet_id_1 - ], - Tags=self.get_tags_uppercase(Name=self.job_identifier) + Resources=[self.priv_vpc_subnet_id_1], + Tags=self.get_tags_uppercase(Name=self.job_identifier), ) backoff( self.ec2.create_tags, - Resources=[ - self.priv_vpc_subnet_id_2 - ], - Tags=self.get_tags_uppercase(Name=self.job_identifier) + Resources=[self.priv_vpc_subnet_id_2], + Tags=self.get_tags_uppercase(Name=self.job_identifier), ) ig_response = self.ec2.create_internet_gateway() - self.internet_gateway_id = ig_response['InternetGateway']['InternetGatewayId'] + self.internet_gateway_id = ig_response["InternetGateway"]["InternetGatewayId"] backoff( self.ec2.create_tags, - Resources=[ - self.internet_gateway_id - ], - Tags=self.get_tags_uppercase(Name=self.job_identifier) + Resources=[self.internet_gateway_id], + Tags=self.get_tags_uppercase(Name=self.job_identifier), ) - logger.info(f'Internet gateway {self.internet_gateway_id} created.') + logger.info(f"Internet gateway {self.internet_gateway_id} created.") # Create the public subnet pub_response = self.ec2.create_subnet( - CidrBlock=self.pub_subnet_cidr, - VpcId=self.vpc_id + CidrBlock=self.pub_subnet_cidr, VpcId=self.vpc_id ) logger.info("EIP allocated.") - self.pub_vpc_subnet_id = pub_response['Subnet']['SubnetId'] + self.pub_vpc_subnet_id = pub_response["Subnet"]["SubnetId"] backoff( self.ec2.create_tags, - Resources=[ - self.pub_vpc_subnet_id - ], - Tags=self.get_tags_uppercase(Name=self.job_identifier) + Resources=[self.pub_vpc_subnet_id], + Tags=self.get_tags_uppercase(Name=self.job_identifier), ) # Create and elastic IP for the NAT Gateway try: - ip_response = self.ec2.allocate_address( - Domain='vpc' - ) + ip_response = self.ec2.allocate_address(Domain="vpc") - self.nat_ip_allocation = ip_response['AllocationId'] + self.nat_ip_allocation = ip_response["AllocationId"] logger.info("EIP 
allocated.") self.ec2.create_tags( - Resources=[ - self.nat_ip_allocation - ], - Tags=self.get_tags_uppercase(Name=self.job_identifier) + Resources=[self.nat_ip_allocation], + Tags=self.get_tags_uppercase(Name=self.job_identifier), ) except Exception as e: - if 'AddressLimitExceeded' in str(e): + if "AddressLimitExceeded" in str(e): raise # Create an internet gateway self.ec2.attach_internet_gateway( - InternetGatewayId=self.internet_gateway_id, - VpcId=self.vpc_id + InternetGatewayId=self.internet_gateway_id, VpcId=self.vpc_id ) logger.info("Internet Gateway attached.") @@ -345,31 +321,26 @@ def create_vpc(self): drt_response = self.ec2.describe_route_tables( Filters=[ - { - 'Name': 'vpc-id', - 'Values': [ - self.vpc_id - ] - }, + {"Name": "vpc-id", "Values": [self.vpc_id]}, ] ) - self.pub_route_table_id = drt_response['RouteTables'][0]['RouteTableId'] + self.pub_route_table_id = drt_response["RouteTables"][0]["RouteTableId"] # Modify the default route table to be used as the public route while True: try: self.ec2.create_route( - DestinationCidrBlock='0.0.0.0/0', + DestinationCidrBlock="0.0.0.0/0", GatewayId=self.internet_gateway_id, - RouteTableId=self.pub_route_table_id + RouteTableId=self.pub_route_table_id, ) logger.info("Route created for Internet Gateway.") break except Exception as e: - if 'NotFound' in str(e): + if "NotFound" in str(e): time.sleep(5) logger.info("Internet Gateway not yet created. Sleeping...") else: @@ -378,50 +349,42 @@ def create_vpc(self): # Create a NAT Gateway nat_response = self.ec2.create_nat_gateway( - AllocationId=self.nat_ip_allocation, - SubnetId=self.pub_vpc_subnet_id + AllocationId=self.nat_ip_allocation, SubnetId=self.pub_vpc_subnet_id ) - self.nat_gateway_id = nat_response['NatGateway']['NatGatewayId'] + self.nat_gateway_id = nat_response["NatGateway"]["NatGatewayId"] backoff( self.ec2.create_tags, Resources=[self.nat_gateway_id], - Tags=self.get_tags_uppercase(Name=self.job_identifier) + Tags=self.get_tags_uppercase(Name=self.job_identifier), ) logger.info("NAT Gateway created.") # Create a new private route table - prt_response = self.ec2.create_route_table( - VpcId=self.vpc_id - ) + prt_response = self.ec2.create_route_table(VpcId=self.vpc_id) - self.priv_route_table_id = prt_response['RouteTable']['RouteTableId'] + self.priv_route_table_id = prt_response["RouteTable"]["RouteTableId"] logger.info("Route table created.") backoff( self.ec2.create_tags, - Resources=[ - self.nat_gateway_id, - self.priv_route_table_id - ], - Tags=self.get_tags_uppercase(Name=self.job_identifier) + Resources=[self.nat_gateway_id, self.priv_route_table_id], + Tags=self.get_tags_uppercase(Name=self.job_identifier), ) # Associate the private route to the private subnet self.ec2.associate_route_table( - RouteTableId=self.priv_route_table_id, - SubnetId=self.priv_vpc_subnet_id_1 + RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_1 ) logger.info("Route table associated with subnet.") self.ec2.associate_route_table( - RouteTableId=self.priv_route_table_id, - SubnetId=self.priv_vpc_subnet_id_2 + RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_2 ) logger.info("Route table associated with subnet.") @@ -430,41 +393,37 @@ def create_vpc(self): while True: try: self.ec2.create_route( - DestinationCidrBlock='0.0.0.0/0', + DestinationCidrBlock="0.0.0.0/0", NatGatewayId=self.nat_gateway_id, - RouteTableId=self.priv_route_table_id + RouteTableId=self.priv_route_table_id, ) logger.info("Route created for subnet.") break except Exception as 
e: - if 'InvalidNatGatewayID.NotFound' in str(e): + if "InvalidNatGatewayID.NotFound" in str(e): time.sleep(5) logger.info("Nat Gateway not yet created. Sleeping...") else: raise - + gateway_response = self.ec2.create_vpc_endpoint( VpcId=self.vpc_id, - ServiceName=f'com.amazonaws.{self.region}.s3', + ServiceName=f"com.amazonaws.{self.region}.s3", RouteTableIds=[self.priv_route_table_id, self.pub_route_table_id], - VpcEndpointType='Gateway', + VpcEndpointType="Gateway", PolicyDocument='{"Statement": [{"Action": "*", "Effect": "Allow", "Resource": "*", "Principal": "*"}]}', ) logger.info("S3 gateway created for VPC.") - self.s3_gateway_endpoint_id = gateway_response['VpcEndpoint']['VpcEndpointId'] + self.s3_gateway_endpoint_id = gateway_response["VpcEndpoint"]["VpcEndpointId"] backoff( self.ec2.create_tags, - Resources=[ - self.s3_gateway_endpoint_id - ], - Tags=self.get_tags_uppercase(Name=self.job_identifier) + Resources=[self.s3_gateway_endpoint_id], + Tags=self.get_tags_uppercase(Name=self.job_identifier), ) - - def generate_name_value_inputs(self, var_dictionary): """ Helper to properly format more easily used dictionaries. @@ -485,7 +444,9 @@ def create_batch_service_roles(self): self.batch_service_role_name, "batch", f"Service role for Batch environment {self.job_identifier}", - managed_policie_arns=['arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole'] + managed_policie_arns=[ + "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole" + ], ) # Instance Role for Batch compute environment @@ -494,7 +455,9 @@ def create_batch_service_roles(self): self.batch_instance_role_name, "ec2", f"Instance role for Batch compute environment {self.job_identifier}", - managed_policie_arns=['arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role'] + managed_policie_arns=[ + "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role" + ], ) # Instance Profile @@ -504,26 +467,26 @@ def create_batch_service_roles(self): InstanceProfileName=self.batch_instance_profile_name ) - self.instance_profile_arn = response['InstanceProfile']['Arn'] + self.instance_profile_arn = response["InstanceProfile"]["Arn"] logger.info("Instance Profile created") response = self.iam.add_role_to_instance_profile( InstanceProfileName=self.batch_instance_profile_name, - RoleName=self.batch_instance_role_name + RoleName=self.batch_instance_role_name, ) except Exception as e: - if 'EntityAlreadyExists' in str(e): - logger.info('ECS Instance Profile not created - already exists') + if "EntityAlreadyExists" in str(e): + logger.info("ECS Instance Profile not created - already exists") response = self.iam.get_instance_profile( InstanceProfileName=self.batch_instance_profile_name ) - self.instance_profile_arn = response['InstanceProfile']['Arn'] + self.instance_profile_arn = response["InstanceProfile"]["Arn"] # ECS Task Policy - task_permissions_policy = f'''{{ + task_permissions_policy = f"""{{ "Version": "2012-10-17", "Statement": [ {{ @@ -608,12 +571,14 @@ def create_batch_service_roles(self): "Resource": "*" }} ] - }}''' + }}""" - self.task_role_arn = self.iam_helper.role_stitcher(self.batch_ecs_task_role_name, - "ecs-tasks", - f"Task role for Batch job {self.job_identifier}", - policies_list=[task_permissions_policy]) + self.task_role_arn = self.iam_helper.role_stitcher( + self.batch_ecs_task_role_name, + "ecs-tasks", + f"Task role for Batch job {self.job_identifier}", + policies_list=[task_permissions_policy], + ) if self.batch_use_spot: # Spot Fleet Role @@ -621,7 +586,9 @@ def 
create_batch_service_roles(self): self.batch_spot_service_role_name, "spotfleet", f"Spot Fleet role for Batch compute environment {self.job_identifier}", - managed_policie_arns=['arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole'] + managed_policie_arns=[ + "arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole" + ], ) def create_compute_environment(self, maxCPUs=10000): @@ -640,17 +607,17 @@ def create_compute_environment(self, maxCPUs=10000): "BlockDeviceMappings": [ { "DeviceName": "/dev/xvda", - "Ebs": { - "VolumeSize": 100, - "VolumeType": "gp2" - } + "Ebs": {"VolumeSize": 100, "VolumeType": "gp2"}, } ] - } + }, ) except ClientError as error: - if error.response['Error']['Code'] == 'InvalidLaunchTemplateName.AlreadyExistsException': - logger.debug('Launch template exists, skipping creation') + if ( + error.response["Error"]["Code"] + == "InvalidLaunchTemplateName.AlreadyExistsException" + ): + logger.debug("Launch template exists, skipping creation") else: raise error @@ -663,60 +630,71 @@ def create_compute_environment(self, maxCPUs=10000): while next_token: lt_resp = self.ec2.describe_launch_templates( LaunchTemplateNames=[self.launch_template_name], - NextToken=next_token + NextToken=next_token, ) launch_templates.extend(lt_resp["LaunchTemplates"]) next_token = lt_resp.get("NextToken") n_launch_templates = len(launch_templates) - assert n_launch_templates <= 1, f"There are {n_launch_templates} launch templates, this shouldn't happen." + assert ( + n_launch_templates <= 1 + ), f"There are {n_launch_templates} launch templates, this shouldn't happen." if n_launch_templates == 0: - logger.debug(f"Waiting for the launch template {self.launch_template_name} to be created") + logger.debug( + f"Waiting for the launch template {self.launch_template_name} to be created" + ) time.sleep(5) if n_launch_templates == 1: break try: compute_resources = { - 'minvCpus': 0, - 'maxvCpus': maxCPUs, - 'desiredvCpus': 0, - 'instanceTypes': [ - 'optimal', + "minvCpus": 0, + "maxvCpus": maxCPUs, + "desiredvCpus": 0, + "instanceTypes": [ + "optimal", ], - 'launchTemplate': { - 'launchTemplateName': self.launch_template_name, + "launchTemplate": { + "launchTemplateName": self.launch_template_name, }, - 'subnets': [self.priv_vpc_subnet_id_1, self.priv_vpc_subnet_id_2], - 'securityGroupIds': [self.batch_security_group], - 'instanceRole': self.instance_profile_arn + "subnets": [self.priv_vpc_subnet_id_1, self.priv_vpc_subnet_id_2], + "securityGroupIds": [self.batch_security_group], + "instanceRole": self.instance_profile_arn, } if self.batch_use_spot: - compute_resources.update({ - 'type': 'SPOT', - 'bidPercentage': 100, - 'spotIamFleetRole': self.spot_service_role_arn - }) + compute_resources.update( + { + "type": "SPOT", + "bidPercentage": 100, + "spotIamFleetRole": self.spot_service_role_arn, + } + ) else: - compute_resources['type'] = 'EC2' + compute_resources["type"] = "EC2" - compute_resources['tags'] = self.get_tags(Name=f"{self.job_identifier} batch instance") + compute_resources["tags"] = self.get_tags( + Name=f"{self.job_identifier} batch instance" + ) self.batch.create_compute_environment( computeEnvironmentName=self.batch_compute_environment_name, - type='MANAGED', - state='ENABLED', + type="MANAGED", + state="ENABLED", computeResources=compute_resources, serviceRole=self.service_role_arn, - tags=self.get_tags() + tags=self.get_tags(), ) - logger.info(f'Compute environment {self.batch_compute_environment_name} created.') + logger.info( + f"Compute environment 
{self.batch_compute_environment_name} created." + ) except Exception as e: - if 'Object already exists' in str(e): + if "Object already exists" in str(e): logger.info( - f'Compute environment {self.batch_compute_environment_name} not created - already exists') + f"Compute environment {self.batch_compute_environment_name} not created - already exists" + ) else: raise @@ -729,38 +707,42 @@ def create_job_queue(self): try: response = self.batch.create_job_queue( jobQueueName=self.batch_job_queue_name, - state='ENABLED', + state="ENABLED", priority=1, computeEnvironmentOrder=[ { - 'order': 1, - 'computeEnvironment': self.batch_compute_environment_name + "order": 1, + "computeEnvironment": self.batch_compute_environment_name, }, ], - tags=self.get_tags() + tags=self.get_tags(), ) # print("JOB QUEUE") # print(response) - self.job_queue_arn = response['jobQueueArn'] - logger.info(f'Job queue {self.batch_job_queue_name} created') + self.job_queue_arn = response["jobQueueArn"] + logger.info(f"Job queue {self.batch_job_queue_name} created") break except Exception as e: - if 'Object already exists' in str(e): - logger.info(f'Job queue {self.batch_job_queue_name} not created - already exists') + if "Object already exists" in str(e): + logger.info( + f"Job queue {self.batch_job_queue_name} not created - already exists" + ) response = self.batch.describe_job_queues( jobQueues=[ self.batch_job_queue_name, ] ) - self.job_queue_arn = response['jobQueues'][0]['jobQueueArn'] + self.job_queue_arn = response["jobQueues"][0]["jobQueueArn"] break - elif 'is not valid' in str(e): + elif "is not valid" in str(e): # Need to wait a second for the compute environment to complete registration logger.warning( - 'wating a few seconds for compute environment creation: ' + str(e)) + "wating a few seconds for compute environment creation: " + + str(e) + ) time.sleep(5) else: @@ -777,25 +759,23 @@ def create_job_definition(self, docker_image, vcpus, memory, command, env_vars): """ response = self.batch.register_job_definition( jobDefinitionName=self.job_identifier, - type='container', + type="container", # parameters={ # 'string': 'string' # }, containerProperties={ - 'image': docker_image, - 'vcpus': vcpus, - 'memory': memory, - 'command': command, - 'jobRoleArn': self.task_role_arn, - 'environment': self.generate_name_value_inputs(env_vars) - }, - retryStrategy={ - 'attempts': 2 + "image": docker_image, + "vcpus": vcpus, + "memory": memory, + "command": command, + "jobRoleArn": self.task_role_arn, + "environment": self.generate_name_value_inputs(env_vars), }, - tags=self.get_tags() + retryStrategy={"attempts": 2}, + tags=self.get_tags(), ) - self.job_definition_arn = response['jobDefinitionArn'] + self.job_definition_arn = response["jobDefinitionArn"] def submit_job(self, array_size=4): """ @@ -807,11 +787,9 @@ def submit_job(self, array_size=4): resp = self.batch.submit_job( jobName=self.job_identifier, jobQueue=self.batch_job_queue_name, - arrayProperties={ - 'size': array_size - }, + arrayProperties={"size": array_size}, jobDefinition=self.job_definition_arn, - tags=self.get_tags() + tags=self.get_tags(), ) logger.info(f"Job {self.job_identifier} submitted.") @@ -819,9 +797,12 @@ def submit_job(self, array_size=4): except Exception as e: - if 'not in VALID state' in str(e): + if "not in VALID state" in str(e): # Need to wait a second for the compute environment to complete registration - logger.warning('5 second sleep initiated to wait for job queue creation due to error: ' + str(e)) + logger.warning( + "5 second sleep 
initiated to wait for job queue creation due to error: " + + str(e) + ) time.sleep(5) else: raise @@ -833,42 +814,43 @@ def clean(self): response = self.ec2.describe_vpcs( Filters=[ { - 'Name': 'tag:Name', - 'Values': [ + "Name": "tag:Name", + "Values": [ self.vpc_name, - ] + ], }, ] ) try: - self.vpc_id = response['Vpcs'][0]['VpcId'] + self.vpc_id = response["Vpcs"][0]["VpcId"] except (KeyError, IndexError): self.vpc_id = None default_sg_response = self.ec2.describe_security_groups( Filters=[ { - 'Name': 'group-name', - 'Values': [ - 'default', - ] + "Name": "group-name", + "Values": [ + "default", + ], }, ] ) logger.info("Removing egress from default security group.") - for group in default_sg_response['SecurityGroups']: - if group['VpcId'] == self.vpc_id: - default_group_id = group['GroupId'] + for group in default_sg_response["SecurityGroups"]: + if group["VpcId"] == self.vpc_id: + default_group_id = group["GroupId"] dsg = self.ec2r.SecurityGroup(default_group_id) if len(dsg.ip_permissions_egress): - response = dsg.revoke_egress(IpPermissions=dsg.ip_permissions_egress) + response = dsg.revoke_egress( + IpPermissions=dsg.ip_permissions_egress + ) try: self.batch.update_job_queue( - jobQueue=self.batch_job_queue_name, - state='DISABLED' + jobQueue=self.batch_job_queue_name, state="DISABLED" ) while True: @@ -879,22 +861,25 @@ def clean(self): logger.info(f"Job queue {self.batch_job_queue_name} deleted.") break except Exception as e: - if 'Cannot delete, resource is being modified' in str(e): - logger.info("Job queue being modified - sleeping until ready...") + if "Cannot delete, resource is being modified" in str(e): + logger.info( + "Job queue being modified - sleeping until ready..." + ) time.sleep(5) else: raise except Exception as e: - if 'does not exist' in str(e): - logger.info(f"Job queue {self.batch_job_queue_name} missing, skipping...") + if "does not exist" in str(e): + logger.info( + f"Job queue {self.batch_job_queue_name} missing, skipping..." + ) # Delete compute enviornment try: self.batch.update_compute_environment( - computeEnvironment=self.batch_compute_environment_name, - state='DISABLED' + computeEnvironment=self.batch_compute_environment_name, state="DISABLED" ) while True: @@ -902,17 +887,25 @@ def clean(self): response = self.batch.delete_compute_environment( computeEnvironment=self.batch_compute_environment_name ) - logger.info(f"Compute environment {self.batch_compute_environment_name} deleted.") + logger.info( + f"Compute environment {self.batch_compute_environment_name} deleted." + ) break except Exception as e: - if 'Cannot delete, resource is being modified' in str(e) or 'found existing JobQueue' in str(e): - logger.info("Compute environment being modified - sleeping until ready...") + if "Cannot delete, resource is being modified" in str( + e + ) or "found existing JobQueue" in str(e): + logger.info( + "Compute environment being modified - sleeping until ready..." + ) time.sleep(5) else: raise except Exception as e: - if 'does not exist' in str(e): - logger.info(f"Compute environment {self.batch_compute_environment_name} missing, skipping...") + if "does not exist" in str(e): + logger.info( + f"Compute environment {self.batch_compute_environment_name} missing, skipping..." 
+ ) else: raise @@ -922,8 +915,10 @@ def clean(self): LaunchTemplateName=self.launch_template_name ) except Exception as e: - if 'does not exist' in str(e): - logger.info(f"Launch template {self.launch_template_name} does not exist, skipping...") + if "does not exist" in str(e): + logger.info( + f"Launch template {self.launch_template_name} does not exist, skipping..." + ) else: raise @@ -931,7 +926,9 @@ def clean(self): self.iam_helper.delete_role(self.batch_spot_service_role_name) self.iam_helper.delete_role(self.batch_ecs_task_role_name) # Instance profile order of removal - self.iam_helper.remove_role_from_instance_profile(self.batch_instance_profile_name) + self.iam_helper.remove_role_from_instance_profile( + self.batch_instance_profile_name + ) self.iam_helper.delete_role(self.batch_instance_role_name) self.iam_helper.delete_instance_profile(self.batch_instance_profile_name) @@ -940,73 +937,47 @@ def clean(self): response = self.ec2.describe_vpcs( Filters=[ { - 'Name': 'tag:Name', - 'Values': [ + "Name": "tag:Name", + "Values": [ self.job_identifier, - ] + ], }, ], ) - for vpc in response['Vpcs']: - this_vpc = vpc['VpcId'] + for vpc in response["Vpcs"]: + this_vpc = vpc["VpcId"] s3gw_response = self.ec2.describe_vpc_endpoints( - Filters=[ - { - 'Name': 'vpc-id', - 'Values': [ - this_vpc - ] - } - ] + Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] ) - - for s3gw in s3gw_response['VpcEndpoints']: - this_s3gw = s3gw['VpcEndpointId'] - if s3gw['State'] != 'deleted': - self.ec2.delete_vpc_endpoints( - VpcEndpointIds=[this_s3gw] - ) + for s3gw in s3gw_response["VpcEndpoints"]: + this_s3gw = s3gw["VpcEndpointId"] + + if s3gw["State"] != "deleted": + self.ec2.delete_vpc_endpoints(VpcEndpointIds=[this_s3gw]) ng_response = self.ec2.describe_nat_gateways( - Filters=[ - { - 'Name': 'vpc-id', - 'Values': [ - this_vpc - ] - } - ] + Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] ) - for natgw in ng_response['NatGateways']: - this_natgw = natgw['NatGatewayId'] + for natgw in ng_response["NatGateways"]: + this_natgw = natgw["NatGatewayId"] - if natgw['State'] != 'deleted': - self.ec2.delete_nat_gateway( - NatGatewayId=this_natgw - ) + if natgw["State"] != "deleted": + self.ec2.delete_nat_gateway(NatGatewayId=this_natgw) rtas_response = self.ec2.describe_route_tables( - Filters=[ - { - 'Name': 'vpc-id', - 'Values': [ - this_vpc - ] - } - ] - + Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] ) - for route_table in rtas_response['RouteTables']: - route_table_id = route_table['RouteTableId'] - for association in route_table['Associations']: - if not association['Main']: + for route_table in rtas_response["RouteTables"]: + route_table_id = route_table["RouteTableId"] + for association in route_table["Associations"]: + if not association["Main"]: response = self.ec2.disassociate_route_table( - AssociationId=association['RouteTableAssociationId'] + AssociationId=association["RouteTableAssociationId"] ) rt_counter = 10 while rt_counter: @@ -1018,104 +989,92 @@ def clean(self): break except Exception as e: rt_counter = rt_counter - 1 - if 'DependencyViolation' in str(e): - logger.info("Waiting for association to be released before deleting route table. Sleeping...") # noqa E501 + if "DependencyViolation" in str(e): + logger.info( + "Waiting for association to be released before deleting route table. Sleeping..." 
+ ) # noqa E501 time.sleep(5) else: raise igw_response = self.ec2.describe_internet_gateways( - Filters=[ - { - 'Name': 'tag:Name', - 'Values': [ - self.job_identifier - ] - } - ] + Filters=[{"Name": "tag:Name", "Values": [self.job_identifier]}] ) - for internet_gateway in igw_response['InternetGateways']: - for attachment in internet_gateway['Attachments']: - if attachment['VpcId'] == this_vpc: + for internet_gateway in igw_response["InternetGateways"]: + for attachment in internet_gateway["Attachments"]: + if attachment["VpcId"] == this_vpc: while True: try: try: self.ec2.detach_internet_gateway( - InternetGatewayId=internet_gateway['InternetGatewayId'], - VpcId=attachment['VpcId'] + InternetGatewayId=internet_gateway[ + "InternetGatewayId" + ], + VpcId=attachment["VpcId"], ) except Exception as e: - logger.info(f"Error on Internet Gateway disassociation - ignoring... {str(e)}") + logger.info( + f"Error on Internet Gateway disassociation - ignoring... {str(e)}" + ) self.ec2.delete_internet_gateway( - InternetGatewayId=internet_gateway['InternetGatewayId'] + InternetGatewayId=internet_gateway[ + "InternetGatewayId" + ] ) logger.info("Internet Gateway deleted.") break except Exception as e: - if 'DependencyViolation' in str(e): + if "DependencyViolation" in str(e): logger.info( - "Waiting for IPs to be released before deleting Internet Gateway. Sleeping...") + "Waiting for IPs to be released before deleting Internet Gateway. Sleeping..." + ) time.sleep(5) else: raise subn_response = self.ec2.describe_subnets( - Filters=[ - { - 'Name': 'vpc-id', - 'Values': [ - this_vpc - ] - } - ] + Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] ) - for subnet in subn_response['Subnets']: + for subnet in subn_response["Subnets"]: while True: try: - self.ec2.delete_subnet( - SubnetId=subnet['SubnetId'] - ) + self.ec2.delete_subnet(SubnetId=subnet["SubnetId"]) break except Exception as e: - if 'DependencyViolation' in str(e): - logger.info('Subnet cannot be deleted as dependencies are still being deleted. Sleeping...') + if "DependencyViolation" in str(e): + logger.info( + "Subnet cannot be deleted as dependencies are still being deleted. Sleeping..." 
+ ) time.sleep(10) else: raise - self.ec2.delete_vpc( - VpcId=this_vpc - ) + self.ec2.delete_vpc(VpcId=this_vpc) # Find the Elastic IP from the NAT response = self.ec2.describe_addresses( Filters=[ { - 'Name': 'tag:Name', - 'Values': [ + "Name": "tag:Name", + "Values": [ self.job_identifier, - ] + ], }, ], - ) - for address in response['Addresses']: - this_address = address['AllocationId'] + for address in response["Addresses"]: + this_address = address["AllocationId"] - response = self.ec2.release_address( - AllocationId=this_address - ) + response = self.ec2.release_address(AllocationId=this_address) try: - self.ec2.delete_security_group( - GroupName=f"dask-{self.job_identifier}" - ) + self.ec2.delete_security_group(GroupName=f"dask-{self.job_identifier}") except ClientError as error: - if error.response['Error']['Code'] == 'InvalidGroup.NotFound': + if error.response["Error"]["Code"] == "InvalidGroup.NotFound": pass else: raise error @@ -1132,8 +1091,12 @@ def __init__(self, project_filename): try: self.docker_client.ping() except: # noqa: E722 (allow bare except in this case because error can be a weird non-class Windows API error) - logger.error('The docker server did not respond, make sure Docker Desktop is started then retry.') - raise RuntimeError('The docker server did not respond, make sure Docker Desktop is started then retry.') + logger.error( + "The docker server did not respond, make sure Docker Desktop is started then retry." + ) + raise RuntimeError( + "The docker server did not respond, make sure Docker Desktop is started then retry." + ) @staticmethod def validate_project(project_file): @@ -1141,7 +1104,7 @@ def validate_project(project_file): @property def docker_image(self): - return 'nrel/openstudio:{}'.format(self.os_version) + return "nrel/openstudio:{}".format(self.os_version) class AwsBatch(DockerBatchBase): @@ -1149,23 +1112,29 @@ class AwsBatch(DockerBatchBase): def __init__(self, project_filename): super().__init__(project_filename) - self.job_identifier = re.sub('[^0-9a-zA-Z]+', '_', self.cfg['aws']['job_identifier'])[:10] + self.job_identifier = re.sub( + "[^0-9a-zA-Z]+", "_", self.cfg["aws"]["job_identifier"] + )[:10] self.project_filename = project_filename - self.region = self.cfg['aws']['region'] - self.ecr = boto3.client('ecr', region_name=self.region, config=boto_client_config) - self.s3 = boto3.client('s3', region_name=self.region, config=boto_client_config) - self.s3_bucket = self.cfg['aws']['s3']['bucket'] - self.s3_bucket_prefix = self.cfg['aws']['s3']['prefix'].rstrip('/') - self.batch_env_use_spot = self.cfg['aws']['use_spot'] - self.batch_array_size = self.cfg['aws']['batch_array_size'] + self.region = self.cfg["aws"]["region"] + self.ecr = boto3.client( + "ecr", region_name=self.region, config=boto_client_config + ) + self.s3 = boto3.client("s3", region_name=self.region, config=boto_client_config) + self.s3_bucket = self.cfg["aws"]["s3"]["bucket"] + self.s3_bucket_prefix = self.cfg["aws"]["s3"]["prefix"].rstrip("/") + self.batch_env_use_spot = self.cfg["aws"]["use_spot"] + self.batch_array_size = self.cfg["aws"]["batch_array_size"] self.boto3_session = boto3.Session(region_name=self.region) @staticmethod def validate_dask_settings(project_file): cfg = get_project_configuration(project_file) if "emr" in cfg["aws"]: - logger.warning("The `aws.emr` configuration is no longer used and is ignored. Recommend removing.") + logger.warning( + "The `aws.emr` configuration is no longer used and is ignored. Recommend removing." 
+ ) dask_cfg = cfg["aws"]["dask"] errors = [] mem_rules = { @@ -1182,14 +1151,18 @@ def validate_dask_settings(project_file): f"`aws.dask.{node_type}_memory` = {mem}, needs to be a multiple of 1024." ) mem_gb = mem // 1024 - min_gb, max_gb, incr_gb = mem_rules[dask_cfg.get(f"{node_type}_cpu", 2 * 1024)] + min_gb, max_gb, incr_gb = mem_rules[ + dask_cfg.get(f"{node_type}_cpu", 2 * 1024) + ] if not (min_gb <= mem_gb <= max_gb and (mem_gb - min_gb) % incr_gb == 0): errors.append( f"`aws.dask.{node_type}_memory` = {mem}, " f"should be between {min_gb * 1024} and {max_gb * 1024} in a multiple of {incr_gb * 1024}." ) if errors: - errors.append("See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html") + errors.append( + "See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html" + ) raise ValidationError("\n".join(errors)) return True @@ -1201,7 +1174,7 @@ def validate_project(project_file): @property def docker_image(self): - return 'nrel/buildstockbatch' + return "nrel/buildstockbatch" @property def weather_dir(self): @@ -1209,23 +1182,23 @@ def weather_dir(self): @property def results_dir(self): - return f'{self.s3_bucket}/{self.s3_bucket_prefix}/results' + return f"{self.s3_bucket}/{self.s3_bucket_prefix}/results" @property def output_dir(self): - return f'{self.s3_bucket}/{self.s3_bucket_prefix}' + return f"{self.s3_bucket}/{self.s3_bucket_prefix}" @property def container_repo(self): repo_name = self.docker_image repos = self.ecr.describe_repositories() repo = None - for repo in repos['repositories']: - if repo['repositoryName'] == repo_name: + for repo in repos["repositories"]: + if repo["repositoryName"] == repo_name: break if repo is None: resp = self.ecr.create_repository(repositoryName=repo_name) - repo = resp['repository'] + repo = resp["repository"] return repo @property @@ -1237,63 +1210,72 @@ def build_image(self): Build the docker image to use in the batch simulation """ root_path = pathlib.Path(os.path.abspath(__file__)).parent.parent.parent - if not (root_path / 'Dockerfile').exists(): - raise RuntimeError(f'The needs to be run from the root of the repo, found {root_path}') + if not (root_path / "Dockerfile").exists(): + raise RuntimeError( + f"The needs to be run from the root of the repo, found {root_path}" + ) # Make the buildstock/resources/.aws_docker_image dir to store logs - local_log_dir = os.path.join(self.buildstock_dir, 'resources', '.aws_docker_image') + local_log_dir = os.path.join( + self.buildstock_dir, "resources", ".aws_docker_image" + ) if not os.path.exists(local_log_dir): os.makedirs(local_log_dir) # Determine whether or not to build the image with custom gems bundled in - if self.cfg.get('baseline', dict()).get('custom_gems', False): + if self.cfg.get("baseline", dict()).get("custom_gems", False): # Ensure the custom Gemfile exists in the buildstock dir - local_gemfile_path = os.path.join(self.buildstock_dir, 'resources', 'Gemfile') + local_gemfile_path = os.path.join( + self.buildstock_dir, "resources", "Gemfile" + ) if not os.path.exists(local_gemfile_path): - raise AttributeError(f'baseline:custom_gems = True, but did not find Gemfile at {local_gemfile_path}') + raise AttributeError( + f"baseline:custom_gems = True, but did not find Gemfile at {local_gemfile_path}" + ) # Copy the custom Gemfile into the buildstockbatch repo - bsb_root = os.path.join(os.path.abspath(__file__), os.pardir, os.pardir, os.pardir) - new_gemfile_path = os.path.join(bsb_root, 'Gemfile') + bsb_root = os.path.join( 
+ os.path.abspath(__file__), os.pardir, os.pardir, os.pardir + ) + new_gemfile_path = os.path.join(bsb_root, "Gemfile") shutil.copyfile(local_gemfile_path, new_gemfile_path) - logger.info(f'Copying custom Gemfile from {local_gemfile_path}') + logger.info(f"Copying custom Gemfile from {local_gemfile_path}") # Choose the custom-gems stage in the Dockerfile, # which runs bundle install to build custom gems into the image - stage = 'buildstockbatch-custom-gems' + stage = "buildstockbatch-custom-gems" else: # Choose the base stage in the Dockerfile, # which stops before bundling custom gems into the image - stage = 'buildstockbatch' + stage = "buildstockbatch" - logger.info(f'Building docker image stage: {stage} from OpenStudio {self.os_version}') + logger.info( + f"Building docker image stage: {stage} from OpenStudio {self.os_version}" + ) img, build_logs = self.docker_client.images.build( path=str(root_path), tag=self.docker_image, rm=True, target=stage, platform="linux/amd64", - buildargs={'OS_VER': self.os_version} + buildargs={"OS_VER": self.os_version}, ) - build_image_log = os.path.join(local_log_dir, 'build_image.log') - with open(build_image_log, 'w') as f_out: - f_out.write('Built image') + build_image_log = os.path.join(local_log_dir, "build_image.log") + with open(build_image_log, "w") as f_out: + f_out.write("Built image") for line in build_logs: for itm_type, item_msg in line.items(): - if itm_type in ['stream', 'status']: + if itm_type in ["stream", "status"]: try: - f_out.write(f'{item_msg}') + f_out.write(f"{item_msg}") except UnicodeEncodeError: pass - logger.debug(f'Review docker image build log: {build_image_log}') + logger.debug(f"Review docker image build log: {build_image_log}") # Report and confirm the openstudio version from the image - os_ver_cmd = 'openstudio openstudio_version' + os_ver_cmd = "openstudio openstudio_version" container_output = self.docker_client.containers.run( - self.docker_image, - os_ver_cmd, - remove=True, - name='list_openstudio_version' + self.docker_image, os_ver_cmd, remove=True, name="list_openstudio_version" ) assert self.os_version in container_output.decode() @@ -1301,48 +1283,50 @@ def build_image(self): # The OpenStudio Docker image installs the default gems # to /var/oscli/gems, and the custom docker image # overwrites these with the custom gems. 
- list_gems_cmd = 'openstudio --bundle /var/oscli/Gemfile --bundle_path /var/oscli/gems ' \ - '--bundle_without native_ext gem_list' + list_gems_cmd = ( + "openstudio --bundle /var/oscli/Gemfile --bundle_path /var/oscli/gems " + "--bundle_without native_ext gem_list" + ) container_output = self.docker_client.containers.run( - self.docker_image, - list_gems_cmd, - remove=True, - name='list_gems' + self.docker_image, list_gems_cmd, remove=True, name="list_gems" ) - gem_list_log = os.path.join(local_log_dir, 'openstudio_gem_list_output.log') - with open(gem_list_log, 'wb') as f_out: + gem_list_log = os.path.join(local_log_dir, "openstudio_gem_list_output.log") + with open(gem_list_log, "wb") as f_out: f_out.write(container_output) - for line in container_output.decode().split('\n'): + for line in container_output.decode().split("\n"): logger.debug(line) - logger.debug(f'Review custom gems list at: {gem_list_log}') + logger.debug(f"Review custom gems list at: {gem_list_log}") def push_image(self): """ Push the locally built docker image to the AWS docker repo """ auth_token = self.ecr.get_authorization_token() - dkr_user, dkr_pass = base64.b64decode(auth_token['authorizationData'][0]['authorizationToken']). \ - decode('ascii').split(':') - repo_url = self.container_repo['repositoryUri'] - registry_url = 'https://' + repo_url.split('/')[0] + dkr_user, dkr_pass = ( + base64.b64decode(auth_token["authorizationData"][0]["authorizationToken"]) + .decode("ascii") + .split(":") + ) + repo_url = self.container_repo["repositoryUri"] + registry_url = "https://" + repo_url.split("/")[0] resp = self.docker_client.login( - username=dkr_user, - password=dkr_pass, - registry=registry_url + username=dkr_user, password=dkr_pass, registry=registry_url ) logger.debug(resp) image = self.docker_client.images.get(self.docker_image) image.tag(repo_url, tag=self.job_identifier) last_status = None - for x in self.docker_client.images.push(repo_url, tag=self.job_identifier, stream=True): + for x in self.docker_client.images.push( + repo_url, tag=self.job_identifier, stream=True + ): try: y = json.loads(x) except json.JSONDecodeError: continue else: - if y.get('status') is not None and y.get('status') != last_status: - logger.debug(y['status']) - last_status = y['status'] + if y.get("status") is not None and y.get("status") != last_status: + logger.debug(y["status"]) + last_status = y["status"] def clean(self): """ @@ -1351,7 +1335,9 @@ def clean(self): """ logger.info("Beginning cleanup of AWS resources...") - batch_env = AwsBatchEnv(self.job_identifier, self.cfg['aws'], self.boto3_session) + batch_env = AwsBatchEnv( + self.job_identifier, self.cfg["aws"], self.boto3_session + ) batch_env.clean() def run_batch(self): @@ -1368,29 +1354,43 @@ def run_batch(self): buildstock_csv_filename = self.sampler.run_sampling() # Compress and upload assets to S3 - with tempfile.TemporaryDirectory(prefix='bsb_') as tmpdir, tempfile.TemporaryDirectory(prefix='bsb_') as tmp_weather_dir: # noqa: E501 + with tempfile.TemporaryDirectory( + prefix="bsb_" + ) as tmpdir, tempfile.TemporaryDirectory( + prefix="bsb_" + ) as tmp_weather_dir: # noqa: E501 self._weather_dir = tmp_weather_dir self._get_weather_files() tmppath = pathlib.Path(tmpdir) - logger.debug('Creating assets tarfile') - with tarfile.open(tmppath / 'assets.tar.gz', 'x:gz') as tar_f: + logger.debug("Creating assets tarfile") + with tarfile.open(tmppath / "assets.tar.gz", "x:gz") as tar_f: project_path = pathlib.Path(self.project_dir) buildstock_path = 
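Because push_image's login handshake is easy to miss in the reformatting above, here is the same ECR flow as a standalone sketch (assuming default AWS credentials): get_authorization_token returns a base64-encoded "user:password" pair plus the registry endpoint to log in against.

    import base64

    import boto3
    import docker

    ecr = boto3.client("ecr")
    auth = ecr.get_authorization_token()["authorizationData"][0]
    user, password = (
        base64.b64decode(auth["authorizationToken"]).decode("ascii").split(":")
    )
    docker.from_env().login(
        username=user, password=password, registry=auth["proxyEndpoint"]
    )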
pathlib.Path(self.buildstock_dir) - tar_f.add(buildstock_path / 'measures', 'measures') - if os.path.exists(buildstock_path / 'resources/hpxml-measures'): - tar_f.add(buildstock_path / 'resources/hpxml-measures', 'resources/hpxml-measures') - tar_f.add(buildstock_path / 'resources', 'lib/resources') - tar_f.add(project_path / 'housing_characteristics', 'lib/housing_characteristics') + tar_f.add(buildstock_path / "measures", "measures") + if os.path.exists(buildstock_path / "resources/hpxml-measures"): + tar_f.add( + buildstock_path / "resources/hpxml-measures", + "resources/hpxml-measures", + ) + tar_f.add(buildstock_path / "resources", "lib/resources") + tar_f.add( + project_path / "housing_characteristics", + "lib/housing_characteristics", + ) # Weather files - weather_path = tmppath / 'weather' + weather_path = tmppath / "weather" os.makedirs(weather_path) # Determine the unique weather files - epw_filenames = list(filter(lambda x: x.endswith('.epw'), os.listdir(self.weather_dir))) - logger.debug('Calculating hashes for weather files') + epw_filenames = list( + filter(lambda x: x.endswith(".epw"), os.listdir(self.weather_dir)) + ) + logger.debug("Calculating hashes for weather files") epw_hashes = Parallel(n_jobs=-1, verbose=9)( - delayed(calc_hash_for_file)(pathlib.Path(self.weather_dir) / epw_filename) + delayed(calc_hash_for_file)( + pathlib.Path(self.weather_dir) / epw_filename + ) for epw_filename in epw_filenames ) unique_epws = collections.defaultdict(list) @@ -1398,17 +1398,17 @@ def run_batch(self): unique_epws[epw_hash].append(epw_filename) # Compress unique weather files - logger.debug('Compressing weather files') + logger.debug("Compressing weather files") Parallel(n_jobs=-1, verbose=9)( delayed(compress_file)( pathlib.Path(self.weather_dir) / x[0], - str(weather_path / x[0]) + '.gz' + str(weather_path / x[0]) + ".gz", ) for x in unique_epws.values() ) - logger.debug('Writing project configuration for upload') - with open(tmppath / 'config.json', 'wt', encoding='utf-8') as f: + logger.debug("Writing project configuration for upload") + with open(tmppath / "config.json", "wt", encoding="utf-8") as f: json.dump(self.cfg, f) # Collect simulations to queue @@ -1416,74 +1416,95 @@ def run_batch(self): self.validate_buildstock_csv(self.project_filename, df) building_ids = df.index.tolist() n_datapoints = len(building_ids) - n_sims = n_datapoints * (len(self.cfg.get('upgrades', [])) + 1) - logger.debug('Total number of simulations = {}'.format(n_sims)) + n_sims = n_datapoints * (len(self.cfg.get("upgrades", [])) + 1) + logger.debug("Total number of simulations = {}".format(n_sims)) n_sims_per_job = math.ceil(n_sims / self.batch_array_size) n_sims_per_job = max(n_sims_per_job, 2) - logger.debug('Number of simulations per array job = {}'.format(n_sims_per_job)) + logger.debug( + "Number of simulations per array job = {}".format(n_sims_per_job) + ) baseline_sims = zip(building_ids, itertools.repeat(None)) - upgrade_sims = itertools.product(building_ids, range(len(self.cfg.get('upgrades', [])))) + upgrade_sims = itertools.product( + building_ids, range(len(self.cfg.get("upgrades", []))) + ) all_sims = list(itertools.chain(baseline_sims, upgrade_sims)) random.shuffle(all_sims) all_sims_iter = iter(all_sims) - os.makedirs(tmppath / 'jobs') + os.makedirs(tmppath / "jobs") - logger.info('Queueing jobs') + logger.info("Queueing jobs") for i in itertools.count(0): batch = list(itertools.islice(all_sims_iter, n_sims_per_job)) if not batch: break - job_json_filename = tmppath / 'jobs' / 
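The queueing loop above is the standard islice chunking idiom: keep pulling fixed-size batches off a single iterator until it is exhausted. A self-contained illustration:

    import itertools

    def chunked(iterable, size):
        # Yield successive lists of at most `size` items, like the job loop.
        it = iter(iterable)
        while batch := list(itertools.islice(it, size)):
            yield batch

    assert list(chunked(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]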
'job{:05d}.json'.format(i) - with open(job_json_filename, 'w') as f: - json.dump({ - 'job_num': i, - 'n_datapoints': n_datapoints, - 'batch': batch, - }, f, indent=4) + job_json_filename = tmppath / "jobs" / "job{:05d}.json".format(i) + with open(job_json_filename, "w") as f: + json.dump( + { + "job_num": i, + "n_datapoints": n_datapoints, + "batch": batch, + }, + f, + indent=4, + ) array_size = i - logger.debug('Array size = {}'.format(array_size)) + logger.debug("Array size = {}".format(array_size)) # Compress job jsons - jobs_dir = tmppath / 'jobs' - logger.debug('Compressing job jsons using gz') + jobs_dir = tmppath / "jobs" + logger.debug("Compressing job jsons using gz") tick = time.time() - with tarfile.open(tmppath / 'jobs.tar.gz', 'w:gz') as tf: - tf.add(jobs_dir, arcname='jobs') + with tarfile.open(tmppath / "jobs.tar.gz", "w:gz") as tf: + tf.add(jobs_dir, arcname="jobs") tick = time.time() - tick - logger.debug('Done compressing job jsons using gz {:.1f} seconds'.format(tick)) + logger.debug( + "Done compressing job jsons using gz {:.1f} seconds".format(tick) + ) shutil.rmtree(jobs_dir) - os.makedirs(tmppath / 'results' / 'simulation_output') + os.makedirs(tmppath / "results" / "simulation_output") - logger.debug('Uploading files to S3') - upload_directory_to_s3(tmppath, self.cfg['aws']['s3']['bucket'], self.cfg['aws']['s3']['prefix']) + logger.debug("Uploading files to S3") + upload_directory_to_s3( + tmppath, + self.cfg["aws"]["s3"]["bucket"], + self.cfg["aws"]["s3"]["prefix"], + ) # Copy the non-unique weather files on S3 epws_to_copy = [] for epws in unique_epws.values(): # The first in the list is already up there, copy the rest for filename in epws[1:]: - epws_to_copy.append(( - f"{self.cfg['aws']['s3']['prefix']}/weather/{epws[0]}.gz", - f"{self.cfg['aws']['s3']['prefix']}/weather/{filename}.gz" - )) + epws_to_copy.append( + ( + f"{self.cfg['aws']['s3']['prefix']}/weather/{epws[0]}.gz", + f"{self.cfg['aws']['s3']['prefix']}/weather/{filename}.gz", + ) + ) - logger.debug('Copying weather files on S3') - bucket = self.cfg['aws']['s3']['bucket'] + logger.debug("Copying weather files on S3") + bucket = self.cfg["aws"]["s3"]["bucket"] Parallel(n_jobs=-1, verbose=9)( - delayed(copy_s3_file)(bucket, src, bucket, dest) for src, dest in epws_to_copy + delayed(copy_s3_file)(bucket, src, bucket, dest) + for src, dest in epws_to_copy ) # Create the output directories fs = S3FileSystem() - for upgrade_id in range(len(self.cfg.get('upgrades', [])) + 1): - fs.makedirs(f"{self.cfg['aws']['s3']['bucket']}/{self.cfg['aws']['s3']['prefix']}/results/simulation_output/timeseries/up{upgrade_id:02d}") # noqa E501 + for upgrade_id in range(len(self.cfg.get("upgrades", [])) + 1): + fs.makedirs( + f"{self.cfg['aws']['s3']['bucket']}/{self.cfg['aws']['s3']['prefix']}/results/simulation_output/timeseries/up{upgrade_id:02d}" + ) # noqa E501 # Define the batch environment - batch_env = AwsBatchEnv(self.job_identifier, self.cfg['aws'], self.boto3_session) + batch_env = AwsBatchEnv( + self.job_identifier, self.cfg["aws"], self.boto3_session + ) logger.info( "Launching Batch environment - (resource configs will not be updated on subsequent executions, but new job revisions will be created):" # noqa 501 ) @@ -1494,16 +1515,20 @@ def run_batch(self): batch_env.create_job_queue() # Pass through config for the Docker containers - env_vars = dict(S3_BUCKET=self.s3_bucket, S3_PREFIX=self.s3_bucket_prefix, JOB_NAME=self.job_identifier, - REGION=self.region) + env_vars = dict( + S3_BUCKET=self.s3_bucket, + 
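copy_s3_file is imported from elsewhere and not shown in this patch; assuming it wraps the S3 CopyObject API, the duplicate-weather-file fan-out above amounts to server-side copies that never pass through the client machine:

    import boto3

    # Assumed shape of the helper; CopySource/Bucket/Key are the real
    # boto3 copy_object parameters.
    def copy_s3_file(src_bucket, src_key, dest_bucket, dest_key):
        s3 = boto3.client("s3")
        s3.copy_object(
            CopySource={"Bucket": src_bucket, "Key": src_key},
            Bucket=dest_bucket,
            Key=dest_key,
        )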
S3_PREFIX=self.s3_bucket_prefix, + JOB_NAME=self.job_identifier, + REGION=self.region, + ) - job_env_cfg = self.cfg['aws'].get('job_environment', {}) + job_env_cfg = self.cfg["aws"].get("job_environment", {}) batch_env.create_job_definition( self.image_url, - command=['python3', '-m', 'buildstockbatch.aws.aws'], - vcpus=job_env_cfg.get('vcpus', 1), - memory=job_env_cfg.get('memory', 1024), - env_vars=env_vars + command=["python3", "-m", "buildstockbatch.aws.aws"], + vcpus=job_env_cfg.get("vcpus", 1), + memory=job_env_cfg.get("memory", 1024), + env_vars=env_vars, ) # start job @@ -1513,17 +1538,21 @@ def run_batch(self): n_succeeded_last_time = 0 with tqdm.tqdm(desc="Running Simulations", total=array_size) as progress_bar: job_status = None - while job_status not in ('SUCCEEDED', 'FAILED'): + while job_status not in ("SUCCEEDED", "FAILED"): time.sleep(10) - job_desc_resp = batch_env.batch.describe_jobs(jobs=[job_info['jobId']]) - job_status = job_desc_resp['jobs'][0]['status'] + job_desc_resp = batch_env.batch.describe_jobs(jobs=[job_info["jobId"]]) + job_status = job_desc_resp["jobs"][0]["status"] - jobs_resp = batch_env.batch.list_jobs(arrayJobId=job_info['jobId'], jobStatus='SUCCEEDED') + jobs_resp = batch_env.batch.list_jobs( + arrayJobId=job_info["jobId"], jobStatus="SUCCEEDED" + ) n_succeeded = len(jobs_resp["jobSummaryList"]) next_token = jobs_resp.get("nextToken") while next_token is not None: jobs_resp = batch_env.batch.list_jobs( - arrayJobId=job_info['jobId'], jobStatus='SUCCEEDED', nextToken=next_token + arrayJobId=job_info["jobId"], + jobStatus="SUCCEEDED", + nextToken=next_token, ) n_succeeded += len(jobs_resp["jobSummaryList"]) next_token = jobs_resp.get("nextToken") @@ -1545,65 +1574,77 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): """ logger.debug(f"region: {region}") - s3 = boto3.client('s3', config=boto_client_config) + s3 = boto3.client("s3", config=boto_client_config) - sim_dir = pathlib.Path('/var/simdata/openstudio') + sim_dir = pathlib.Path("/var/simdata/openstudio") - logger.debug('Downloading assets') - assets_file_path = sim_dir.parent / 'assets.tar.gz' - s3.download_file(bucket, f'{prefix}/assets.tar.gz', str(assets_file_path)) - with tarfile.open(assets_file_path, 'r') as tar_f: + logger.debug("Downloading assets") + assets_file_path = sim_dir.parent / "assets.tar.gz" + s3.download_file(bucket, f"{prefix}/assets.tar.gz", str(assets_file_path)) + with tarfile.open(assets_file_path, "r") as tar_f: tar_f.extractall(sim_dir) os.remove(assets_file_path) - logger.debug('Reading config') + logger.debug("Reading config") with io.BytesIO() as f: - s3.download_fileobj(bucket, f'{prefix}/config.json', f) + s3.download_fileobj(bucket, f"{prefix}/config.json", f) cfg = json.loads(f.getvalue()) - logger.debug('Getting job information') - jobs_file_path = sim_dir.parent / 'jobs.tar.gz' - s3.download_file(bucket, f'{prefix}/jobs.tar.gz', str(jobs_file_path)) - with tarfile.open(jobs_file_path, 'r') as tar_f: - jobs_d = json.load(tar_f.extractfile(f'jobs/job{job_id:05d}.json'), encoding='utf-8') - logger.debug('Number of simulations = {}'.format(len(jobs_d['batch']))) + logger.debug("Getting job information") + jobs_file_path = sim_dir.parent / "jobs.tar.gz" + s3.download_file(bucket, f"{prefix}/jobs.tar.gz", str(jobs_file_path)) + with tarfile.open(jobs_file_path, "r") as tar_f: + jobs_d = json.load( + tar_f.extractfile(f"jobs/job{job_id:05d}.json"), encoding="utf-8" + ) + logger.debug("Number of simulations = {}".format(len(jobs_d["batch"]))) - 
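The monitoring loop above pages through list_jobs manually; the same logic factors into a small helper, sketched here:

    def count_succeeded(batch_client, array_job_id):
        # Count SUCCEEDED children of a Batch array job, following nextToken
        # until the listing is exhausted.
        n_succeeded, next_token = 0, None
        while True:
            kwargs = {"arrayJobId": array_job_id, "jobStatus": "SUCCEEDED"}
            if next_token is not None:
                kwargs["nextToken"] = next_token
            resp = batch_client.list_jobs(**kwargs)
            n_succeeded += len(resp["jobSummaryList"])
            next_token = resp.get("nextToken")
            if next_token is None:
                return n_succeeded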
logger.debug('Getting weather files') - weather_dir = sim_dir / 'weather' + logger.debug("Getting weather files") + weather_dir = sim_dir / "weather" os.makedirs(weather_dir, exist_ok=True) # Make a lookup of which parameter points to the weather file from options_lookup.tsv - with open(sim_dir / 'lib' / 'resources' / 'options_lookup.tsv', 'r', encoding='utf-8') as f: - tsv_reader = csv.reader(f, delimiter='\t') + with open( + sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8" + ) as f: + tsv_reader = csv.reader(f, delimiter="\t") next(tsv_reader) # skip headers param_name = None epws_by_option = {} for row in tsv_reader: - row_has_epw = [x.endswith('.epw') for x in row[2:]] + row_has_epw = [x.endswith(".epw") for x in row[2:]] if sum(row_has_epw): if row[0] != param_name and param_name is not None: - raise RuntimeError(f'The epw files are specified in options_lookup.tsv under more than one parameter type: {param_name}, {row[0]}') # noqa: E501 - epw_filename = row[row_has_epw.index(True) + 2].split('=')[1].split('/')[-1] + raise RuntimeError( + f"The epw files are specified in options_lookup.tsv under more than one parameter type: {param_name}, {row[0]}" + ) # noqa: E501 + epw_filename = ( + row[row_has_epw.index(True) + 2].split("=")[1].split("/")[-1] + ) param_name = row[0] option_name = row[1] epws_by_option[option_name] = epw_filename # Look through the buildstock.csv to find the appropriate location and epw epws_to_download = set() - building_ids = [x[0] for x in jobs_d['batch']] - with open(sim_dir / 'lib' / 'housing_characteristics' / 'buildstock.csv', 'r', encoding='utf-8') as f: + building_ids = [x[0] for x in jobs_d["batch"]] + with open( + sim_dir / "lib" / "housing_characteristics" / "buildstock.csv", + "r", + encoding="utf-8", + ) as f: csv_reader = csv.DictReader(f) for row in csv_reader: - if int(row['Building']) in building_ids: + if int(row["Building"]) in building_ids: epws_to_download.add(epws_by_option[row[param_name]]) # Download the epws needed for these simulations for epw_filename in epws_to_download: with io.BytesIO() as f_gz: - logger.debug('Downloading {}.gz'.format(epw_filename)) - s3.download_fileobj(bucket, f'{prefix}/weather/{epw_filename}.gz', f_gz) - with open(weather_dir / epw_filename, 'wb') as f_out: - logger.debug('Extracting {}'.format(epw_filename)) + logger.debug("Downloading {}.gz".format(epw_filename)) + s3.download_fileobj(bucket, f"{prefix}/weather/{epw_filename}.gz", f_gz) + with open(weather_dir / epw_filename, "wb") as f_out: + logger.debug("Extracting {}".format(epw_filename)) f_out.write(gzip.decompress(f_gz.getvalue())) asset_dirs = os.listdir(sim_dir) @@ -1611,40 +1652,47 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): local_fs = LocalFileSystem() reporting_measures = cls.get_reporting_measures(cfg) dpouts = [] - simulation_output_tar_filename = sim_dir.parent / 'simulation_outputs.tar.gz' - with tarfile.open(str(simulation_output_tar_filename), 'w:gz') as simout_tar: - for building_id, upgrade_idx in jobs_d['batch']: + simulation_output_tar_filename = sim_dir.parent / "simulation_outputs.tar.gz" + with tarfile.open(str(simulation_output_tar_filename), "w:gz") as simout_tar: + for building_id, upgrade_idx in jobs_d["batch"]: upgrade_id = 0 if upgrade_idx is None else upgrade_idx + 1 - sim_id = f'bldg{building_id:07d}up{upgrade_id:02d}' + sim_id = f"bldg{building_id:07d}up{upgrade_id:02d}" # Create OSW - osw = cls.create_osw(cfg, jobs_d['n_datapoints'], sim_id, building_id, upgrade_idx) - with 
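A toy illustration of the two lookups built above, with made-up parameter, option, and file names: options_lookup.tsv maps each option of the weather parameter to an EPW, and buildstock.csv maps each sampled building to an option.

    epws_by_option = {
        "CO, Denver": "USA_CO_Denver.epw",
        "WA, Seattle": "USA_WA_Seattle.epw",
    }
    buildstock_rows = [
        {"Building": "1", "Location": "CO, Denver"},
        {"Building": "2", "Location": "WA, Seattle"},
    ]
    building_ids = {1}  # buildings assigned to this array job
    epws_to_download = {
        epws_by_option[row["Location"]]
        for row in buildstock_rows
        if int(row["Building"]) in building_ids
    }
    assert epws_to_download == {"USA_CO_Denver.epw"}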
open(os.path.join(sim_dir, 'in.osw'), 'w') as f: + osw = cls.create_osw( + cfg, jobs_d["n_datapoints"], sim_id, building_id, upgrade_idx + ) + with open(os.path.join(sim_dir, "in.osw"), "w") as f: json.dump(osw, f, indent=4) # Run Simulation - with open(sim_dir / 'os_stdout.log', 'w') as f_out: + with open(sim_dir / "os_stdout.log", "w") as f_out: try: - logger.debug('Running {}'.format(sim_id)) - cli_cmd = ['openstudio', 'run', '-w', 'in.osw'] - if cfg.get('baseline', dict()).get('custom_gems', False): + logger.debug("Running {}".format(sim_id)) + cli_cmd = ["openstudio", "run", "-w", "in.osw"] + if cfg.get("baseline", dict()).get("custom_gems", False): cli_cmd = [ - 'openstudio', - '--bundle', '/var/oscli/Gemfile', - '--bundle_path', '/var/oscli/gems', - '--bundle_without', 'native_ext', - 'run', '-w', 'in.osw', - '--debug' + "openstudio", + "--bundle", + "/var/oscli/Gemfile", + "--bundle_path", + "/var/oscli/gems", + "--bundle_without", + "native_ext", + "run", + "-w", + "in.osw", + "--debug", ] subprocess.run( cli_cmd, check=True, stdout=f_out, stderr=subprocess.STDOUT, - cwd=str(sim_dir) + cwd=str(sim_dir), ) except subprocess.CalledProcessError: - logger.debug(f'Simulation failed: see {sim_id}/os_stdout.log') + logger.debug(f"Simulation failed: see {sim_id}/os_stdout.log") # Clean Up simulation directory cls.cleanup_sim_dir( @@ -1652,7 +1700,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): fs, f"{bucket}/{prefix}/results/simulation_output/timeseries", upgrade_id, - building_id + building_id, ) # Read data_point_out.json @@ -1662,7 +1710,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): dpouts.append(dpout) # Add the rest of the simulation outputs to the tar archive - logger.info('Archiving simulation outputs') + logger.info("Archiving simulation outputs") for dirpath, dirnames, filenames in os.walk(sim_dir): if dirpath == str(sim_dir): for dirname in set(dirnames).intersection(asset_dirs): @@ -1673,7 +1721,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): simout_tar.add(abspath, os.path.join(sim_id, relpath)) # Clear directory for next simulation - logger.debug('Clearing out simulation directory') + logger.debug("Clearing out simulation directory") for item in set(os.listdir(sim_dir)).difference(asset_dirs): if os.path.isdir(item): shutil.rmtree(item) @@ -1683,12 +1731,15 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): # Upload simulation outputs tarfile to s3 fs.put( str(simulation_output_tar_filename), - f'{bucket}/{prefix}/results/simulation_output/simulations_job{job_id}.tar.gz' + f"{bucket}/{prefix}/results/simulation_output/simulations_job{job_id}.tar.gz", ) # Upload aggregated dpouts as a json file - with fs.open(f'{bucket}/{prefix}/results/simulation_output/results_job{job_id}.json.gz', 'wb') as f1: - with gzip.open(f1, 'wt', encoding='utf-8') as f2: + with fs.open( + f"{bucket}/{prefix}/results/simulation_output/results_job{job_id}.json.gz", + "wb", + ) as f1: + with gzip.open(f1, "wt", encoding="utf-8") as f2: json.dump(dpouts, f2) # Remove files (it helps docker if we don't leave a bunch of files laying around) @@ -1705,7 +1756,9 @@ def get_fs(self): def get_dask_client(self): dask_cfg = self.cfg["aws"]["dask"] - batch_env = AwsBatchEnv(self.job_identifier, self.cfg['aws'], self.boto3_session) + batch_env = AwsBatchEnv( + self.job_identifier, self.cfg["aws"], self.boto3_session + ) m = 1024 self.dask_cluster = FargateCluster( region_name=self.region, @@ -1717,8 +1770,8 @@ def get_dask_client(self): 
worker_cpu=dask_cfg.get("worker_cpu", 2 * m), worker_mem=dask_cfg.get("worker_memory", 8 * m), n_workers=dask_cfg["n_workers"], - task_role_policies=['arn:aws:iam::aws:policy/AmazonS3FullAccess'], - tags=batch_env.get_tags() + task_role_policies=["arn:aws:iam::aws:policy/AmazonS3FullAccess"], + tags=batch_env.get_tags(), ) self.dask_client = Client(self.dask_cluster) return self.dask_client @@ -1734,67 +1787,70 @@ def upload_results(self, *args, **kwargs): @log_error_details() def main(): - logging.config.dictConfig({ - 'version': 1, - 'disable_existing_loggers': True, - 'formatters': { - 'defaultfmt': { - 'format': '%(levelname)s:%(asctime)s:%(name)s:%(message)s', - 'datefmt': '%Y-%m-%d %H:%M:%S' - } - }, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', - 'formatter': 'defaultfmt', - 'level': 'DEBUG', - 'stream': 'ext://sys.stdout', - } - }, - 'loggers': { - '__main__': { - 'level': 'DEBUG', - 'propagate': True, - 'handlers': ['console'] + logging.config.dictConfig( + { + "version": 1, + "disable_existing_loggers": True, + "formatters": { + "defaultfmt": { + "format": "%(levelname)s:%(asctime)s:%(name)s:%(message)s", + "datefmt": "%Y-%m-%d %H:%M:%S", + } }, - 'buildstockbatch': { - 'level': 'DEBUG', - 'propagate': True, - 'handlers': ['console'] + "handlers": { + "console": { + "class": "logging.StreamHandler", + "formatter": "defaultfmt", + "level": "DEBUG", + "stream": "ext://sys.stdout", + } }, - }, - }) + "loggers": { + "__main__": { + "level": "DEBUG", + "propagate": True, + "handlers": ["console"], + }, + "buildstockbatch": { + "level": "DEBUG", + "propagate": True, + "handlers": ["console"], + }, + }, + } + ) print(AwsBatch.LOGO) - if 'AWS_BATCH_JOB_ARRAY_INDEX' in os.environ: - job_id = int(os.environ['AWS_BATCH_JOB_ARRAY_INDEX']) - s3_bucket = os.environ['S3_BUCKET'] - s3_prefix = os.environ['S3_PREFIX'] - job_name = os.environ['JOB_NAME'] - region = os.environ['REGION'] + if "AWS_BATCH_JOB_ARRAY_INDEX" in os.environ: + job_id = int(os.environ["AWS_BATCH_JOB_ARRAY_INDEX"]) + s3_bucket = os.environ["S3_BUCKET"] + s3_prefix = os.environ["S3_PREFIX"] + job_name = os.environ["JOB_NAME"] + region = os.environ["REGION"] AwsBatch.run_job(job_id, s3_bucket, s3_prefix, job_name, region) else: parser = argparse.ArgumentParser() - parser.add_argument('project_filename') + parser.add_argument("project_filename") group = parser.add_mutually_exclusive_group() group.add_argument( - '-c', '--clean', - action='store_true', - help='After the simulation is done, run with --clean to clean up AWS environment' + "-c", + "--clean", + action="store_true", + help="After the simulation is done, run with --clean to clean up AWS environment", ) group.add_argument( - '--validateonly', - help='Only validate the project YAML file and references. Nothing is executed', - action='store_true' + "--validateonly", + help="Only validate the project YAML file and references. Nothing is executed", + action="store_true", ) group.add_argument( - '--postprocessonly', - help='Only do postprocessing, useful for when the simulations are already done', - action='store_true' + "--postprocessonly", + help="Only do postprocessing, useful for when the simulations are already done", + action="store_true", ) group.add_argument( - '--crawl', - help='Only do the crawling in Athena. When simulations and postprocessing are done.', - action='store_true' + "--crawl", + help="Only do the crawling in Athena. 
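For reference, the aws.dask settings consumed by the FargateCluster call above, written out as the parsed dict. The scheduler_* key names are inferred from the {node_type}_cpu/{node_type}_memory validation earlier in this patch; CPU values are in CPU units (1024 = 1 vCPU) and memory values are MiB.

    dask_cfg = {
        "n_workers": 8,                # required
        "scheduler_cpu": 2 * 1024,     # optional; defaults match the code above
        "scheduler_memory": 8 * 1024,
        "worker_cpu": 2 * 1024,
        "worker_memory": 8 * 1024,
    }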
When simulations and postprocessing are done.", + action="store_true", ) args = parser.parse_args() @@ -1820,5 +1876,5 @@ def main(): batch.clean() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index b5155033..01193726 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -4,28 +4,30 @@ logger = logging.getLogger(__name__) -boto_client_config = Config( - retries={ - "max_attempts": 5, - "mode": "standard" - } -) +boto_client_config = Config(retries={"max_attempts": 5, "mode": "standard"}) -class AWSIAMHelper(): +class AWSIAMHelper: logger.propagate = False def __init__(self, session): - ''' + """ Initialize the AWSIAM class with a boto3 Session :param session: boto3 Session from 'parent' job base class - ''' + """ self.session = session - self.iam = self.session.client('iam', config=boto_client_config) - - def role_stitcher(self, role_name, trust_service, description, policies_list=[], managed_policie_arns=[]): - ''' + self.iam = self.session.client("iam", config=boto_client_config) + + def role_stitcher( + self, + role_name, + trust_service, + description, + policies_list=[], + managed_policie_arns=[], + ): + """ Creates a role and attached the policies - will catch errors and skip if role already exists :param role_name: Name of service role to create :param trust_service: Trusted service to associate with the service role @@ -33,9 +35,9 @@ def role_stitcher(self, role_name, trust_service, description, policies_list=[], :param policies_list: List of JSON policies (optional) :param managed_policie_arns: Managed policies to attach (optional) :return: Role ARN is returned - ''' + """ role_arn = None - trust_policy = f'''{{ + trust_policy = f"""{{ "Version": "2012-10-17", "Statement": [{{ "Effect": "Allow", @@ -45,99 +47,86 @@ def role_stitcher(self, role_name, trust_service, description, policies_list=[], "Action": "sts:AssumeRole" }}] }} - ''' + """ try: response = self.iam.create_role( - Path='/', + Path="/", RoleName=role_name, AssumeRolePolicyDocument=trust_policy, - Description=description + Description=description, ) - role_arn = response['Role']['Arn'] + role_arn = response["Role"]["Arn"] p_counter = 1 for policy in policies_list: response = self.iam.put_role_policy( RoleName=role_name, - PolicyName=f'{role_name}_policy_{p_counter}', - PolicyDocument=policy + PolicyName=f"{role_name}_policy_{p_counter}", + PolicyDocument=policy, ) p_counter = p_counter + 1 for managed_policy_arn in managed_policie_arns: response = self.iam.attach_role_policy( - PolicyArn=managed_policy_arn, - RoleName=role_name + PolicyArn=managed_policy_arn, RoleName=role_name ) - logger.info(f'Role {role_name} created') + logger.info(f"Role {role_name} created") return role_arn except Exception as e: - if 'EntityAlreadyExists' in str(e): - logger.info(f'Role {role_name} not created - already exists') - response = self.iam.get_role( - RoleName=role_name - ) - role_arn = response['Role']['Arn'] + if "EntityAlreadyExists" in str(e): + logger.info(f"Role {role_name} not created - already exists") + response = self.iam.get_role(RoleName=role_name) + role_arn = response["Role"]["Arn"] return role_arn else: raise def delete_role(self, role_name): - ''' + """ Delete a role :param role_name: name of the role to delete :return: None - ''' + """ try: - response = self.iam.list_role_policies( - RoleName=role_name - ) + response = self.iam.list_role_policies(RoleName=role_name) - for policy in 
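A hypothetical call showing how role_stitcher is meant to be used; the role name is illustrative, and the managed policy is AWS's stock Batch service-role policy.

    import boto3

    iam_helper = AWSIAMHelper(boto3.Session())
    role_arn = iam_helper.role_stitcher(
        role_name="batch_service_role_myjob",
        trust_service="batch",  # becomes batch.amazonaws.com in the trust policy
        description="Service role for AWS Batch",
        managed_policie_arns=[
            "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole"
        ],
    )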
response['PolicyNames']: - self.iam.delete_role_policy( - RoleName=role_name, - PolicyName=policy - ) + for policy in response["PolicyNames"]: + self.iam.delete_role_policy(RoleName=role_name, PolicyName=policy) - response = self.iam.list_attached_role_policies( - RoleName=role_name - ) + response = self.iam.list_attached_role_policies(RoleName=role_name) - for policy in response['AttachedPolicies']: + for policy in response["AttachedPolicies"]: self.iam.detach_role_policy( - RoleName=role_name, - PolicyArn=policy['PolicyArn'] - ) + RoleName=role_name, PolicyArn=policy["PolicyArn"] + ) - logger.info(f'Policies detached from role {role_name}.') + logger.info(f"Policies detached from role {role_name}.") - response = self.iam.delete_role( - RoleName=role_name - ) - logger.info(f'Role {role_name} deleted.') + response = self.iam.delete_role(RoleName=role_name) + logger.info(f"Role {role_name} deleted.") except Exception as e: - if 'NoSuchEntity' in str(e): - logger.info(f'Role {role_name} missing, skipping...') + if "NoSuchEntity" in str(e): + logger.info(f"Role {role_name} missing, skipping...") else: raise def delete_instance_profile(self, instance_profile_name): try: - self.iam.delete_instance_profile( - InstanceProfileName=instance_profile_name - ) + self.iam.delete_instance_profile(InstanceProfileName=instance_profile_name) logger.info(f"Instance profile {instance_profile_name} deleted.") except Exception as e: - if 'NoSuchEntity' in str(e): - logger.info(f"Instance profile {instance_profile_name} missing, skipping...") + if "NoSuchEntity" in str(e): + logger.info( + f"Instance profile {instance_profile_name} missing, skipping..." + ) else: raise @@ -147,20 +136,21 @@ def remove_role_from_instance_profile(self, instance_profile_name): InstanceProfileName=instance_profile_name ) - for role in response['InstanceProfile']['Roles']: + for role in response["InstanceProfile"]["Roles"]: response = self.iam.remove_role_from_instance_profile( - InstanceProfileName=instance_profile_name, - RoleName=role['RoleName'] + InstanceProfileName=instance_profile_name, RoleName=role["RoleName"] ) logger.info(f"Roles removed from instance profile {instance_profile_name}") except Exception as e: - if 'NoSuchEntity' in str(e): - logger.info(f"Instance profile {instance_profile_name} does not exist. Skipping...") + if "NoSuchEntity" in str(e): + logger.info( + f"Instance profile {instance_profile_name} does not exist. Skipping..." 
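The deletion helpers above respect IAM's required ordering: inline policies must be deleted and managed policies detached before delete_role will succeed, just as roles must be removed from an instance profile before the profile itself can be deleted. Condensed into one sketch:

    def teardown_role(iam, role_name):
        # Inline policies go first...
        for name in iam.list_role_policies(RoleName=role_name)["PolicyNames"]:
            iam.delete_role_policy(RoleName=role_name, PolicyName=name)
        # ...then managed policies are detached...
        attached = iam.list_attached_role_policies(RoleName=role_name)
        for policy in attached["AttachedPolicies"]:
            iam.detach_role_policy(RoleName=role_name, PolicyArn=policy["PolicyArn"])
        # ...and only then can the role be deleted.
        iam.delete_role(RoleName=role_name)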
+ ) else: raise -class AwsJobBase(): +class AwsJobBase: logger.propagate = False @@ -169,19 +159,27 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.session = boto3_session self.iam_helper = AWSIAMHelper(self.session) self.iam = self.iam_helper.iam - self.s3 = self.session.client('s3', config=boto_client_config) + self.s3 = self.session.client("s3", config=boto_client_config) self.job_identifier = job_identifier - self.account = self.session.client('sts', config=boto_client_config).get_caller_identity().get('Account') - self.region = aws_config['region'] - self.operator_email = aws_config['notifications_email'] + self.account = ( + self.session.client("sts", config=boto_client_config) + .get_caller_identity() + .get("Account") + ) + self.region = aws_config["region"] + self.operator_email = aws_config["notifications_email"] # S3 - self.s3_bucket = aws_config['s3']['bucket'] + self.s3_bucket = aws_config["s3"]["bucket"] self.s3_bucket_arn = f"arn:aws:s3:::{self.s3_bucket}" - self.s3_bucket_prefix = aws_config['s3']['prefix'].rstrip('/') - self.s3_lambda_code_emr_cluster_key = f'{self.s3_bucket_prefix}/lambda_functions/emr_function.py.zip' - self.s3_lambda_emr_config_key = f'{self.s3_bucket_prefix}/lambda_functions/emr_config.json' - self.s3_emr_folder_name = 'emr' + self.s3_bucket_prefix = aws_config["s3"]["prefix"].rstrip("/") + self.s3_lambda_code_emr_cluster_key = ( + f"{self.s3_bucket_prefix}/lambda_functions/emr_function.py.zip" + ) + self.s3_lambda_emr_config_key = ( + f"{self.s3_bucket_prefix}/lambda_functions/emr_config.json" + ) + self.s3_emr_folder_name = "emr" # Batch self.batch_compute_environment_name = f"computeenvionment_{self.job_identifier}" @@ -189,32 +187,34 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.batch_job_queue_name = f"job_queue_{self.job_identifier}" self.batch_service_role_name = f"batch_service_role_{self.job_identifier}" self.batch_instance_role_name = f"batch_instance_role_{self.job_identifier}" - self.batch_instance_profile_name = f"batch_instance_profile_{self.job_identifier}" + self.batch_instance_profile_name = ( + f"batch_instance_profile_{self.job_identifier}" + ) self.batch_spot_service_role_name = f"spot_fleet_role_{self.job_identifier}" self.batch_ecs_task_role_name = f"ecs_task_role_{self.job_identifier}" self.batch_task_policy_name = f"ecs_task_policy_{self.job_identifier}" - self.batch_use_spot = aws_config.get('use_spot', True) - self.batch_spot_bid_percent = aws_config.get('spot_bid_percent', 100) + self.batch_use_spot = aws_config.get("use_spot", True) + self.batch_spot_bid_percent = aws_config.get("spot_bid_percent", 100) # VPC self.vpc_name = self.job_identifier - self.vpc_id = '' # will be available after VPC creation - self.priv_subnet_cidr_1 = '' # will be available after VPC creation - self.priv_vpc_subnet_id_1 = 'REPL' # will be available after VPC creation - self.priv_vpc_subnet_id_2 = 'REPL' # will be available after VPC creation + self.vpc_id = "" # will be available after VPC creation + self.priv_subnet_cidr_1 = "" # will be available after VPC creation + self.priv_vpc_subnet_id_1 = "REPL" # will be available after VPC creation + self.priv_vpc_subnet_id_2 = "REPL" # will be available after VPC creation def get_tags(self, **kwargs): tags = kwargs.copy() - tags.update(self.aws_config.get('tags', {})) + tags.update(self.aws_config.get("tags", {})) return tags def get_tags_uppercase(self, **kwargs): tags = self.get_tags(**kwargs) - return [{'Key': k, 'Value': v} for k, v in tags.items()] + 
return [{"Key": k, "Value": v} for k, v in tags.items()] def get_tags_lowercase(self, _caps=True, **kwargs): tags = self.get_tags(**kwargs) - return [{'key': k, 'value': v} for k, v in tags.items()] + return [{"key": k, "value": v} for k, v in tags.items()] def __repr__(self): diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index 5a326d66..aa457a84 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -30,11 +30,7 @@ import pprint from buildstockbatch.__version__ import __schema_version__ -from buildstockbatch import ( - sampler, - workflow_generator, - postprocessing -) +from buildstockbatch import sampler, workflow_generator, postprocessing from buildstockbatch.exc import SimulationExists, ValidationError from buildstockbatch.utils import path_rel_to_file, get_project_configuration, read_csv from buildstockbatch.__version__ import __version__ as bsb_version @@ -45,17 +41,17 @@ class BuildStockBatchBase(object): # http://openstudio-builds.s3-website-us-east-1.amazonaws.com - DEFAULT_OS_VERSION = '3.6.1' - DEFAULT_OS_SHA = 'bb9481519e' + DEFAULT_OS_VERSION = "3.6.1" + DEFAULT_OS_SHA = "bb9481519e" CONTAINER_RUNTIME = None - LOGO = ''' + LOGO = """ _ __ _ __, _ __ ( / ) o // /( _/_ / ( / ) _/_ / /--< , ,, // __/ `. / __ _, /< /--< __, / _, / /___/(_/_(_(/_(_/_(___)(__(_)(__/ |_/___/(_/(_(__(__/ /_ Executing BuildStock projects with grace since 2018 -''' +""" def __init__(self, project_filename): self.project_filename = os.path.abspath(project_filename) @@ -63,35 +59,47 @@ def __init__(self, project_filename): # Load project file to self.cfg self.cfg = get_project_configuration(project_filename) - self.buildstock_dir = self.cfg['buildstock_directory'] + self.buildstock_dir = self.cfg["buildstock_directory"] if not os.path.isdir(self.buildstock_dir): - raise FileNotFoundError(f'buildstock_directory = {self.buildstock_dir} is not a directory.') - self.project_dir = os.path.join(self.buildstock_dir, self.cfg['project_directory']) + raise FileNotFoundError( + f"buildstock_directory = {self.buildstock_dir} is not a directory." + ) + self.project_dir = os.path.join( + self.buildstock_dir, self.cfg["project_directory"] + ) if not os.path.isdir(self.project_dir): - raise FileNotFoundError(f'project_directory = {self.project_dir} is not a directory.') + raise FileNotFoundError( + f"project_directory = {self.project_dir} is not a directory." + ) # Load in OS_VERSION and OS_SHA arguments if they exist in the YAML, # otherwise use defaults specified here. 
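Assuming the project YAML supplies aws: tags: {billingId: "12345"}, the three tag helpers above produce a plain dict, [{"Key": ..., "Value": ...}] for APIs that expect capitalized keys, and [{"key": ..., "value": ...}] for those that expect lowercase. Note that dict.update means YAML-supplied tags win over keyword arguments on a name collision. A minimal stand-in:

    tags_from_yaml = {"billingId": "12345"}  # aws: tags: in the project file

    def get_tags(**kwargs):
        tags = kwargs.copy()
        tags.update(tags_from_yaml)  # YAML tags take precedence
        return tags

    assert get_tags(Name="myjob") == {"Name": "myjob", "billingId": "12345"}
    assert [{"Key": k, "Value": v} for k, v in get_tags(Name="myjob").items()] == [
        {"Key": "Name", "Value": "myjob"},
        {"Key": "billingId", "Value": "12345"},
    ]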
- self.os_version = self.cfg.get('os_version', self.DEFAULT_OS_VERSION) - self.os_sha = self.cfg.get('os_sha', self.DEFAULT_OS_SHA) - logger.debug(f"Using OpenStudio version: {self.os_version} with SHA: {self.os_sha}") + self.os_version = self.cfg.get("os_version", self.DEFAULT_OS_VERSION) + self.os_sha = self.cfg.get("os_sha", self.DEFAULT_OS_SHA) + logger.debug( + f"Using OpenStudio version: {self.os_version} with SHA: {self.os_sha}" + ) @staticmethod def get_sampler_class(sampler_name): - sampler_class_name = ''.join(x.capitalize() for x in sampler_name.strip().split('_')) + 'Sampler' + sampler_class_name = ( + "".join(x.capitalize() for x in sampler_name.strip().split("_")) + "Sampler" + ) return getattr(sampler, sampler_class_name) @staticmethod def get_workflow_generator_class(workflow_generator_name): - workflow_generator_class_name = \ - ''.join(x.capitalize() for x in workflow_generator_name.strip().split('_')) + 'WorkflowGenerator' + workflow_generator_class_name = ( + "".join(x.capitalize() for x in workflow_generator_name.strip().split("_")) + + "WorkflowGenerator" + ) return getattr(workflow_generator, workflow_generator_class_name) @property def sampler(self): # Select a sampler - Sampler = self.get_sampler_class(self.cfg['sampler']['type']) - return Sampler(self, **self.cfg['sampler'].get('args', {})) + Sampler = self.get_sampler_class(self.cfg["sampler"]["type"]) + return Sampler(self, **self.cfg["sampler"].get("args", {})) @staticmethod def openstudio_exe(): @@ -101,22 +109,24 @@ def path_rel_to_projectfile(self, x): return path_rel_to_file(self.project_filename, x) def _get_weather_files(self): - if 'weather_files_path' in self.cfg: - logger.debug('Copying weather files') - weather_file_path = self.cfg['weather_files_path'] - with zipfile.ZipFile(weather_file_path, 'r') as zf: - logger.debug('Extracting weather files to: {}'.format(self.weather_dir)) + if "weather_files_path" in self.cfg: + logger.debug("Copying weather files") + weather_file_path = self.cfg["weather_files_path"] + with zipfile.ZipFile(weather_file_path, "r") as zf: + logger.debug("Extracting weather files to: {}".format(self.weather_dir)) zf.extractall(self.weather_dir) else: - logger.debug('Downloading weather files') - r = requests.get(self.cfg['weather_files_url'], stream=True) - with tempfile.TemporaryFile(dir=os.environ.get('LOCAL_SCRATCH')) as f: + logger.debug("Downloading weather files") + r = requests.get(self.cfg["weather_files_url"], stream=True) + with tempfile.TemporaryFile(dir=os.environ.get("LOCAL_SCRATCH")) as f: for chunk in r.iter_content(chunk_size=1024): if chunk: f.write(chunk) f.seek(0) - with zipfile.ZipFile(f, 'r') as zf: - logger.debug('Extracting weather files to: {}'.format(self.weather_dir)) + with zipfile.ZipFile(f, "r") as zf: + logger.debug( + "Extracting weather files to: {}".format(self.weather_dir) + ) zf.extractall(self.weather_dir) @property @@ -133,13 +143,17 @@ def output_dir(self): @property def skip_baseline_sims(self): - baseline_skip = self.cfg['baseline'].get('skip_sims', False) + baseline_skip = self.cfg["baseline"].get("skip_sims", False) return baseline_skip @classmethod def get_reporting_measures(cls, cfg): - WorkflowGenerator = cls.get_workflow_generator_class(cfg['workflow_generator']['type']) - wg = WorkflowGenerator(cfg, 1) # Number of datapoints doesn't really matter here + WorkflowGenerator = cls.get_workflow_generator_class( + cfg["workflow_generator"]["type"] + ) + wg = WorkflowGenerator( + cfg, 1 + ) # Number of datapoints doesn't really matter here 
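A worked example of the snake_case-to-class-name mangling used by get_sampler_class and get_workflow_generator_class (the sampler type shown is illustrative):

    sampler_name = "residential_quota_downselect"
    cls_name = (
        "".join(x.capitalize() for x in sampler_name.strip().split("_")) + "Sampler"
    )
    assert cls_name == "ResidentialQuotaDownselectSampler"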
return wg.reporting_measures() def run_batch(self): @@ -147,22 +161,34 @@ def run_batch(self): @classmethod def create_osw(cls, cfg, n_datapoints, *args, **kwargs): - WorkflowGenerator = cls.get_workflow_generator_class(cfg['workflow_generator']['type']) + WorkflowGenerator = cls.get_workflow_generator_class( + cfg["workflow_generator"]["type"] + ) osw_generator = WorkflowGenerator(cfg, n_datapoints) return osw_generator.create_osw(*args, **kwargs) @staticmethod def make_sim_dir(building_id, upgrade_idx, base_dir, overwrite_existing=False): real_upgrade_idx = 0 if upgrade_idx is None else upgrade_idx + 1 - sim_id = 'bldg{:07d}up{:02d}'.format(building_id, real_upgrade_idx) + sim_id = "bldg{:07d}up{:02d}".format(building_id, real_upgrade_idx) # Check to see if the simulation is done already and skip it if so. - sim_dir = os.path.join(base_dir, 'up{:02d}'.format(real_upgrade_idx), 'bldg{:07d}'.format(building_id)) + sim_dir = os.path.join( + base_dir, + "up{:02d}".format(real_upgrade_idx), + "bldg{:07d}".format(building_id), + ) if os.path.exists(sim_dir) and not overwrite_existing: - if os.path.exists(os.path.join(sim_dir, 'run', 'finished.job')): - raise SimulationExists('{} exists and finished successfully'.format(sim_id), sim_id, sim_dir) - elif os.path.exists(os.path.join(sim_dir, 'run', 'failed.job')): - raise SimulationExists('{} exists and failed'.format(sim_id), sim_id, sim_dir) + if os.path.exists(os.path.join(sim_dir, "run", "finished.job")): + raise SimulationExists( + "{} exists and finished successfully".format(sim_id), + sim_id, + sim_dir, + ) + elif os.path.exists(os.path.join(sim_dir, "run", "failed.job")): + raise SimulationExists( + "{} exists and failed".format(sim_id), sim_id, sim_dir + ) else: shutil.rmtree(sim_dir) @@ -189,48 +215,56 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id): # Convert the timeseries data to parquet # and copy it to the results directory - timeseries_filepath = os.path.join(sim_dir, 'run', 'results_timeseries.csv') + timeseries_filepath = os.path.join(sim_dir, "run", "results_timeseries.csv") # FIXME: Allowing both names here for compatibility. Should consolidate on one timeseries filename. 
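A worked example of the IDs and paths make_sim_dir builds; an upgrade_idx of None is the baseline and is stored as up00, so upgrade index 1 becomes up02.

    building_id, upgrade_idx = 123, 1
    real_upgrade_idx = 0 if upgrade_idx is None else upgrade_idx + 1
    sim_id = "bldg{:07d}up{:02d}".format(building_id, real_upgrade_idx)
    assert sim_id == "bldg0000123up02"
    # sim_dir == os.path.join(base_dir, "up02", "bldg0000123")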
if os.path.isfile(timeseries_filepath): units_dict = read_csv(timeseries_filepath, nrows=1).transpose().to_dict()[0] skiprows = [1] else: - timeseries_filepath = os.path.join(sim_dir, 'run', 'enduse_timeseries.csv') + timeseries_filepath = os.path.join(sim_dir, "run", "enduse_timeseries.csv") units_dict = {} skiprows = [] - schedules_filepath = '' - if os.path.isdir(os.path.join(sim_dir, 'generated_files')): - for file in os.listdir(os.path.join(sim_dir, 'generated_files')): + schedules_filepath = "" + if os.path.isdir(os.path.join(sim_dir, "generated_files")): + for file in os.listdir(os.path.join(sim_dir, "generated_files")): if re.match(r".*schedules.*\.csv", file): - schedules_filepath = os.path.join(sim_dir, 'generated_files', file) + schedules_filepath = os.path.join(sim_dir, "generated_files", file) if os.path.isfile(timeseries_filepath): # Find the time columns present in the enduse_timeseries file - possible_time_cols = ['time', 'Time', 'TimeDST', 'TimeUTC'] - cols = read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist() + possible_time_cols = ["time", "Time", "TimeDST", "TimeUTC"] + cols = read_csv( + timeseries_filepath, index_col=False, nrows=0 + ).columns.tolist() actual_time_cols = [c for c in cols if c in possible_time_cols] if not actual_time_cols: - logger.error(f'Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.') - raise RuntimeError(f'Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.') + logger.error( + f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}." + ) + raise RuntimeError( + f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}." + ) - tsdf = read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows) + tsdf = read_csv( + timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows + ) if os.path.isfile(schedules_filepath): schedules = read_csv(schedules_filepath, dtype=np.float64) - schedules.rename(columns=lambda x: f'schedules_{x}', inplace=True) - schedules['TimeDST'] = tsdf['Time'] - tsdf = tsdf.merge(schedules, how='left', on='TimeDST') + schedules.rename(columns=lambda x: f"schedules_{x}", inplace=True) + schedules["TimeDST"] = tsdf["Time"] + tsdf = tsdf.merge(schedules, how="left", on="TimeDST") def get_clean_column_name(x): - """" + """ " Will rename column names like End Use: Natural Gas: Range/Oven to end_use__natural_gas__range_oven__kbtu to play nice with Athena """ unit = units_dict.get(x) # missing units (e.g. 
for time) gets nan - unit = unit if isinstance(unit, str) else '' - sepecial_characters = [':', ' ', '/'] + unit = unit if isinstance(unit, str) else "" + sepecial_characters = [":", " ", "/"] for char in sepecial_characters: - x = x.replace(char, '_') + x = x.replace(char, "_") x = x + "__" + unit if unit else x return x.lower() @@ -238,20 +272,23 @@ def get_clean_column_name(x): postprocessing.write_dataframe_as_parquet( tsdf, dest_fs, - f'{simout_ts_dir}/up{upgrade_id:02d}/bldg{building_id:07d}.parquet' + f"{simout_ts_dir}/up{upgrade_id:02d}/bldg{building_id:07d}.parquet", ) # Remove files already in data_point.zip - zipfilename = os.path.join(sim_dir, 'run', 'data_point.zip') + zipfilename = os.path.join(sim_dir, "run", "data_point.zip") if os.path.isfile(zipfilename): - with zipfile.ZipFile(zipfilename, 'r') as zf: + with zipfile.ZipFile(zipfilename, "r") as zf: for filename in zf.namelist(): - for filepath in (os.path.join(sim_dir, 'run', filename), os.path.join(sim_dir, filename)): + for filepath in ( + os.path.join(sim_dir, "run", filename), + os.path.join(sim_dir, filename), + ): if os.path.exists(filepath): os.remove(filepath) # Remove reports dir - reports_dir = os.path.join(sim_dir, 'reports') + reports_dir = os.path.join(sim_dir, "reports") if os.path.isdir(reports_dir): shutil.rmtree(reports_dir, ignore_errors=True) @@ -270,7 +307,7 @@ def validate_project(cls, project_file): assert cls.validate_resstock_or_comstock_version(project_file) assert cls.validate_openstudio_version(project_file) assert cls.validate_number_of_options(project_file) - logger.info('Base Validation Successful') + logger.info("Base Validation Successful") return True @staticmethod @@ -279,26 +316,32 @@ def get_buildstock_dir(project_file, cfg): if os.path.isabs(buildstock_dir): return os.path.abspath(buildstock_dir) else: - return os.path.abspath(os.path.join(os.path.dirname(project_file), buildstock_dir)) + return os.path.abspath( + os.path.join(os.path.dirname(project_file), buildstock_dir) + ) @classmethod def validate_openstudio_path(cls, project_file): cfg = get_project_configuration(project_file) - os_version = cfg.get('os_version', cls.DEFAULT_OS_VERSION) - os_sha = cfg.get('os_sha', cls.DEFAULT_OS_SHA) + os_version = cfg.get("os_version", cls.DEFAULT_OS_VERSION) + os_sha = cfg.get("os_sha", cls.DEFAULT_OS_SHA) try: proc_out = subprocess.run( [cls.openstudio_exe(), "openstudio_version"], capture_output=True, - text=True + text=True, ) except FileNotFoundError: raise ValidationError(f"Cannot find openstudio at `{cls.openstudio_exe()}`") if proc_out.returncode != 0: - raise ValidationError(f"OpenStudio failed with the following error {proc_out.stderr}") + raise ValidationError( + f"OpenStudio failed with the following error {proc_out.stderr}" + ) actual_os_version, actual_os_sha = proc_out.stdout.strip().split("+") if os_version != actual_os_version: - raise ValidationError(f"OpenStudio version is {actual_os_version}, expected is {os_version}") + raise ValidationError( + f"OpenStudio version is {actual_os_version}, expected is {os_version}" + ) if os_sha != actual_os_sha: raise ValidationError( f"OpenStudio version is correct at {os_version}, but the shas don't match. 
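A worked example of the renaming get_clean_column_name performs, matching the transformation described in its docstring:

    units_dict = {"End Use: Natural Gas: Range/Oven": "kBtu"}

    x = "End Use: Natural Gas: Range/Oven"
    unit = units_dict.get(x) or ""  # look up the unit before mangling the name
    for char in (":", " ", "/"):
        x = x.replace(char, "_")
    x = x + "__" + unit if unit else x
    assert x.lower() == "end_use__natural_gas__range_oven__kbtu"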
" @@ -309,21 +352,23 @@ def validate_openstudio_path(cls, project_file): @staticmethod def validate_sampler(project_file): cfg = get_project_configuration(project_file) - sampler_name = cfg['sampler']['type'] + sampler_name = cfg["sampler"]["type"] try: Sampler = BuildStockBatchBase.get_sampler_class(sampler_name) except AttributeError: - raise ValidationError(f'Sampler class `{sampler_name}` is not available.') - args = cfg['sampler']['args'] + raise ValidationError(f"Sampler class `{sampler_name}` is not available.") + args = cfg["sampler"]["args"] Sampler.validate_args(project_file, **args) if issubclass(Sampler, sampler.PrecomputedSampler): - sample_file = cfg['sampler']['args']['sample_file'] + sample_file = cfg["sampler"]["args"]["sample_file"] if not os.path.isabs(sample_file): sample_file = os.path.join(os.path.dirname(project_file), sample_file) else: sample_file = os.path.abspath(sample_file) buildstock_df = read_csv(sample_file, dtype=str) - return BuildStockBatchBase.validate_buildstock_csv(project_file, buildstock_df) + return BuildStockBatchBase.validate_buildstock_csv( + project_file, buildstock_df + ) return True @staticmethod @@ -333,38 +378,48 @@ def validate_buildstock_csv(project_file, buildstock_df): # param_option_dict has format: {column_name: [valid_option1, valid_option2, ...], ...} errors = [] for column in buildstock_df.columns: - if column in {'Building'}: + if column in {"Building"}: continue if column not in param_option_dict: - errors.append(f'Column {column} in buildstock_csv is not available in options_lookup.tsv') + errors.append( + f"Column {column} in buildstock_csv is not available in options_lookup.tsv" + ) continue if "*" in param_option_dict[column]: continue # skip validating options when wildcard is present for option in buildstock_df[column].unique(): if option not in param_option_dict[column]: - errors.append(f'Option {option} in column {column} of buildstock_csv is not available ' - 'in options_lookup.tsv') + errors.append( + f"Option {option} in column {column} of buildstock_csv is not available " + "in options_lookup.tsv" + ) if errors: - raise ValidationError('\n'.join(errors)) + raise ValidationError("\n".join(errors)) return True @classmethod def validate_workflow_generator(cls, project_file): cfg = get_project_configuration(project_file) - WorkflowGenerator = cls.get_workflow_generator_class(cfg['workflow_generator']['type']) + WorkflowGenerator = cls.get_workflow_generator_class( + cfg["workflow_generator"]["type"] + ) return WorkflowGenerator.validate(cfg) @staticmethod def validate_project_schema(project_file): cfg = get_project_configuration(project_file) - schema_version = cfg.get('schema_version') - version_schema = os.path.join(os.path.dirname(__file__), 'schemas', f'v{schema_version}.yaml') + schema_version = cfg.get("schema_version") + version_schema = os.path.join( + os.path.dirname(__file__), "schemas", f"v{schema_version}.yaml" + ) if not os.path.isfile(version_schema): - logger.error(f'Could not find validation schema for YAML version {schema_version}') + logger.error( + f"Could not find validation schema for YAML version {schema_version}" + ) raise FileNotFoundError(version_schema) schema = yamale.make_schema(version_schema) - data = yamale.make_data(project_file, parser='ruamel') + data = yamale.make_data(project_file, parser="ruamel") return yamale.validate(schema, data, strict=True) @staticmethod @@ -378,22 +433,27 @@ def validate_misc_constraints(project_file): def validate_postprocessing_spec(project_file): cfg = 
get_project_configuration(project_file) # noqa F841 param_option_dict, _ = BuildStockBatchBase.get_param_option_dict(project_file) - partition_cols = cfg.get('postprocessing', {}).get("partition_columns", []) + partition_cols = cfg.get("postprocessing", {}).get("partition_columns", []) invalid_cols = [c for c in partition_cols if c not in param_option_dict.keys()] if invalid_cols: - raise ValidationError(f"The following partition columns are not valid: {invalid_cols}") + raise ValidationError( + f"The following partition columns are not valid: {invalid_cols}" + ) return True @staticmethod def validate_xor_nor_schema_keys(project_file): cfg = get_project_configuration(project_file) - major, minor = cfg.get('version', __schema_version__).split('.') + major, minor = cfg.get("version", __schema_version__).split(".") if int(major) >= 0: if int(minor) >= 0: # xor - if ('weather_files_url' in cfg.keys()) is \ - ('weather_files_path' in cfg.keys()): - raise ValidationError('Both/neither weather_files_url and weather_files_path found in yaml root') + if ("weather_files_url" in cfg.keys()) is ( + "weather_files_path" in cfg.keys() + ): + raise ValidationError( + "Both/neither weather_files_url and weather_files_path found in yaml root" + ) return True @@ -402,25 +462,34 @@ def get_param_option_dict(project_file): cfg = get_project_configuration(project_file) param_option_dict = defaultdict(set) buildstock_dir = BuildStockBatchBase.get_buildstock_dir(project_file, cfg) - options_lookup_path = f'{buildstock_dir}/resources/options_lookup.tsv' + options_lookup_path = f"{buildstock_dir}/resources/options_lookup.tsv" # fill in the param_option_dict with {'param1':['valid_option1','valid_option2' ...]} from options_lookup.tsv try: - with open(options_lookup_path, 'r') as f: - options = csv.DictReader(f, delimiter='\t') - invalid_options_lookup_str = '' # Holds option/parameter names with invalid characters + with open(options_lookup_path, "r") as f: + options = csv.DictReader(f, delimiter="\t") + invalid_options_lookup_str = ( + "" # Holds option/parameter names with invalid characters + ) for row in options: - for col in ['Parameter Name', 'Option Name']: - invalid_chars = set(row[col]).intersection(set('|&()')) - invalid_chars = ''.join(invalid_chars) + for col in ["Parameter Name", "Option Name"]: + invalid_chars = set(row[col]).intersection(set("|&()")) + invalid_chars = "".join(invalid_chars) if invalid_chars: invalid_options_lookup_str += f"{col}: '{row[col]}', Invalid chars: '{invalid_chars}' \n" - param_name, opt_name = row['Parameter Name'], row['Option Name'] - param_option_dict[row['Parameter Name']].add(row['Option Name']) - if opt_name == '*' and row['Measure Dir']: - invalid_options_lookup_str += f"{param_name}: '*' cannot pass arguments to measure.\n" - if "*" in param_option_dict[param_name] and len(param_option_dict[param_name]) > 1: - invalid_options_lookup_str += f"{param_name}: '*' cannot be mixed with other options\n" + param_name, opt_name = row["Parameter Name"], row["Option Name"] + param_option_dict[row["Parameter Name"]].add(row["Option Name"]) + if opt_name == "*" and row["Measure Dir"]: + invalid_options_lookup_str += ( + f"{param_name}: '*' cannot pass arguments to measure.\n" + ) + if ( + "*" in param_option_dict[param_name] + and len(param_option_dict[param_name]) > 1 + ): + invalid_options_lookup_str += ( + f"{param_name}: '*' cannot be mixed with other options\n" + ) except FileNotFoundError as err: logger.error(f"Options lookup file not found at: '{options_lookup_path}'") 
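The weather-source check above compares the two membership booleans with `is`, which makes it an exclusive-or: validation fails when both keys are present and when neither is.

    ok_configs = [
        {"weather_files_url": "https://example.com/weather.zip"},
        {"weather_files_path": "weather.zip"},
    ]
    for cfg in ok_configs:
        assert ("weather_files_url" in cfg.keys()) is not (
            "weather_files_path" in cfg.keys()
        )
    bad = {}  # neither key present: the booleans are equal, so this would raise
    assert ("weather_files_url" in bad.keys()) is ("weather_files_path" in bad.keys())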
raise err @@ -432,7 +501,9 @@ def validate_options_lookup(project_file): Validates that the parameter|options specified in the project yaml file is available in the options_lookup.tsv """ cfg = get_project_configuration(project_file) - param_option_dict, invalid_options_lookup_str = BuildStockBatchBase.get_param_option_dict(project_file) + param_option_dict, invalid_options_lookup_str = ( + BuildStockBatchBase.get_param_option_dict(project_file) + ) invalid_option_spec_counter = Counter() invalid_param_counter = Counter() invalid_option_counter_dict = defaultdict(Counter) @@ -446,44 +517,59 @@ def get_errors(source_str, option_str): :return: returns empty string if the param|option is valid i.e. they are found in options_lookup.tsv if not returns error message, close matches, and specifies where the error occurred (source_str) """ - if '||' in option_str and '&&' in option_str: - invalid_option_spec_counter[(option_str, "has both || and && (not supported)")] += 1 + if "||" in option_str and "&&" in option_str: + invalid_option_spec_counter[ + (option_str, "has both || and && (not supported)") + ] += 1 return "" - if '||' in option_str or '&&' in option_str: - splitter = '||' if '||' in option_str else '&&' - errors = '' + if "||" in option_str or "&&" in option_str: + splitter = "||" if "||" in option_str else "&&" + errors = "" broken_options = option_str.split(splitter) - if broken_options[-1] == '': - invalid_option_spec_counter[(option_str, "has trailing 'splitter'")] += 1 + if broken_options[-1] == "": + invalid_option_spec_counter[ + (option_str, "has trailing 'splitter'") + ] += 1 return "" for broken_option_str in broken_options: new_source_str = source_str + f" in composite option '{option_str}'" errors += get_errors(new_source_str, broken_option_str) return errors - if not option_str or '|' == option_str: + if not option_str or "|" == option_str: return f"* Option name empty. 
{source_str}\n" try: - parameter_name, option_name = option_str.split('|') + parameter_name, option_name = option_str.split("|") except ValueError: - invalid_option_spec_counter[(option_str, "has has too many or too few '|' (exactly 1 required).")] += 1 + invalid_option_spec_counter[ + ( + option_str, + "has has too many or too few '|' (exactly 1 required).", + ) + ] += 1 return "" if parameter_name not in param_option_dict: - close_match = difflib.get_close_matches(parameter_name, param_option_dict.keys(), 1) + close_match = difflib.get_close_matches( + parameter_name, param_option_dict.keys(), 1 + ) close_match = close_match[0] if close_match else "" invalid_param_counter[(parameter_name, close_match)] += 1 return "" if not option_name or option_name not in param_option_dict[parameter_name]: - close_match = difflib.get_close_matches(option_name, list(param_option_dict[parameter_name]), 1) + close_match = difflib.get_close_matches( + option_name, list(param_option_dict[parameter_name]), 1 + ) close_match = close_match[0] if close_match else "" - invalid_option_counter_dict[parameter_name][(option_name, close_match)] += 1 + invalid_option_counter_dict[parameter_name][ + (option_name, close_match) + ] += 1 return "" - return '' + return "" def get_all_option_str(source_str, inp): """ @@ -499,46 +585,76 @@ def get_all_option_str(source_str, inp): if type(inp) == str: return [(source_str, inp)] elif type(inp) == list: - return sum([get_all_option_str(source_str + f", in entry {count}", entry) for count, entry - in enumerate(inp)], []) + return sum( + [ + get_all_option_str(source_str + f", in entry {count}", entry) + for count, entry in enumerate(inp) + ], + [], + ) elif type(inp) == dict: if len(inp) > 1: - raise ValidationError(f"{source_str} the logic is malformed. Dict can't have more than one entry") + raise ValidationError( + f"{source_str} the logic is malformed. 
Dict can't have more than one entry" + ) source_str += f", in {list(inp.keys())[0]}" - return sum([get_all_option_str(source_str, i) for i in inp.values()], []) + return sum( + [get_all_option_str(source_str, i) for i in inp.values()], [] + ) # store all of the option_str in the project file as a list of (source_str, option_str) tuple source_option_str_list = [] - if 'upgrades' in cfg: - for upgrade_count, upgrade in enumerate(cfg['upgrades']): - upgrade_name = upgrade.get('upgrade_name', '') + f' (Upgrade Number: {upgrade_count})' + if "upgrades" in cfg: + for upgrade_count, upgrade in enumerate(cfg["upgrades"]): + upgrade_name = ( + upgrade.get("upgrade_name", "") + + f" (Upgrade Number: {upgrade_count})" + ) source_str_upgrade = f"In upgrade '{upgrade_name}'" - for option_count, option in enumerate(upgrade['options']): - option_name = option.get('option', '') + f' (Option Number: {option_count})' - source_str_option = source_str_upgrade + f", in option '{option_name}'" - source_option_str_list.append((source_str_option, option.get('option'))) - if 'apply_logic' in option: + for option_count, option in enumerate(upgrade["options"]): + option_name = ( + option.get("option", "") + f" (Option Number: {option_count})" + ) + source_str_option = ( + source_str_upgrade + f", in option '{option_name}'" + ) + source_option_str_list.append( + (source_str_option, option.get("option")) + ) + if "apply_logic" in option: source_str_logic = source_str_option + ", in apply_logic" - source_option_str_list += get_all_option_str(source_str_logic, option['apply_logic']) + source_option_str_list += get_all_option_str( + source_str_logic, option["apply_logic"] + ) - if 'package_apply_logic' in upgrade: + if "package_apply_logic" in upgrade: source_str_package = source_str_upgrade + ", in package_apply_logic" - source_option_str_list += get_all_option_str(source_str_package, upgrade['package_apply_logic']) + source_option_str_list += get_all_option_str( + source_str_package, upgrade["package_apply_logic"] + ) # TODO: refactor this into Sampler.validate_args - if 'downselect' in cfg or "downselect" in cfg.get('sampler', {}).get('type'): + if "downselect" in cfg or "downselect" in cfg.get("sampler", {}).get("type"): source_str = "In downselect" - logic = cfg['downselect']['logic'] if 'downselect' in cfg else cfg['sampler']['args']['logic'] + logic = ( + cfg["downselect"]["logic"] + if "downselect" in cfg + else cfg["sampler"]["args"]["logic"] + ) source_option_str_list += get_all_option_str(source_str, logic) # Gather all the errors in the option_str, if any - error_message = '' + error_message = "" for source_str, option_str in source_option_str_list: error_message += get_errors(source_str, option_str) if error_message: - error_message = "Following option/parameter entries have problem:\n" + error_message + "\n" + error_message = ( + "Following option/parameter entries have problem:\n" + + error_message + + "\n" + ) if invalid_option_spec_counter: error_message += "* Following option/parameter entries have problem:\n" @@ -546,7 +662,9 @@ def get_all_option_str(source_str, inp): error_message += f" '{invalid_entry}' {error} - used '{count}' times\n" if invalid_param_counter: - error_message += "* Following parameters do not exist in options_lookup.tsv\n" + error_message += ( + "* Following parameters do not exist in options_lookup.tsv\n" + ) for (param, close_match), count in invalid_param_counter.items(): error_message += f" '{param}' - used '{count}' times." 
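
get_all_option_str above is a three-way recursion over str/list/dict logic nodes. A stripped-down sketch of the same shape, runnable on its own (the logic tree and option names are hypothetical):

def flatten_logic(source, node):
    # Collect (source, option_str) pairs from a nested str/list/dict logic tree.
    if isinstance(node, str):
        return [(source, node)]
    if isinstance(node, list):
        return [pair for i, el in enumerate(node)
                for pair in flatten_logic(f"{source}, in entry {i}", el)]
    if isinstance(node, dict):
        if len(node) != 1:
            raise ValueError(f"{source}: the logic is malformed, dict needs exactly one key")
        (key, val), = node.items()
        return flatten_logic(f"{source}, in {key}", val)
    raise TypeError(f"invalid logic element of type {type(node)}")

logic = {"and": ["Windows|Clear", {"or": ["HVAC|Heat Pump", "HVAC|Furnace"]}]}
print(flatten_logic("in apply_logic", logic))
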
if close_match: @@ -565,8 +683,13 @@ def get_all_option_str(source_str, inp): error_message += "\n" if invalid_options_lookup_str: - error_message = "Following option/parameter names(s) have invalid characters in the options_lookup.tsv\n" +\ - invalid_options_lookup_str + "*"*80 + "\n" + error_message + error_message = ( + "Following option/parameter names(s) have invalid characters in the options_lookup.tsv\n" + + invalid_options_lookup_str + + "*" * 80 + + "\n" + + error_message + ) if not error_message: return True else: @@ -589,18 +712,20 @@ def validate_logic(project_file): printer = pprint.PrettyPrinter() def get_option(element): - return element.split('|')[0] if isinstance(element, str) else None + return element.split("|")[0] if isinstance(element, str) else None def get_logic_problems(logic, parent=None): if isinstance(logic, list): all_options = [opt for el in logic if (opt := get_option(el))] problems = [] - if parent in ['not', 'and', None, '&&']: + if parent in ["not", "and", None, "&&"]: for opt, count in Counter(all_options).items(): if count > 1: - parent_name = parent or 'and' - problem_text = f"Option '{opt}' occurs {count} times in a '{parent_name}' block. "\ + parent_name = parent or "and" + problem_text = ( + f"Option '{opt}' occurs {count} times in a '{parent_name}' block. " f"It should occur at max one times. This is the block:\n{printer.pformat(logic)}" + ) if parent is None: problem_text += "\nRemember a list without a parent is considered an 'and' block." problems.append(problem_text) @@ -610,43 +735,64 @@ def get_logic_problems(logic, parent=None): elif isinstance(logic, dict): assert len(logic) == 1 for key, val in logic.items(): - if key not in ['or', 'and', 'not']: - raise ValidationError(f"Invalid key {key}. Only 'or', 'and' and 'not' is allowed.") + if key not in ["or", "and", "not"]: + raise ValidationError( + f"Invalid key {key}. Only 'or', 'and' and 'not' is allowed." 
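
The duplicate check in get_logic_problems above reduces to a Counter over the parameter part of each option string: a parameter repeated inside an 'and'/'not' block is at best redundant and at worst unsatisfiable, since a building has exactly one option per parameter. A minimal sketch of that check (option names hypothetical):

from collections import Counter

def duplicated_parameters(block):
    # Parameters appearing more than once in an implicit 'and' block.
    params = [el.split("|")[0] for el in block if isinstance(el, str)]
    return [p for p, n in Counter(params).items() if n > 1]

assert duplicated_parameters(
    ["Windows|Clear", "Windows|Low-E", "HVAC|Heat Pump"]) == ["Windows"]
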
+ ) return get_logic_problems(val, parent=key) elif isinstance(logic, str): - if '&&' not in logic: + if "&&" not in logic: return [] - entries = logic.split('&&') + entries = logic.split("&&") return get_logic_problems(entries, parent="&&") else: - raise ValidationError(f"Invalid logic element {logic} with type {type(logic)}") + raise ValidationError( + f"Invalid logic element {logic} with type {type(logic)}" + ) all_problems = [] - if 'upgrades' in cfg: - for upgrade_count, upgrade in enumerate(cfg['upgrades']): - upgrade_name = upgrade.get('upgrade_name', '') - source_str_upgrade = f"upgrade '{upgrade_name}' (Upgrade Number:{upgrade_count})" - for option_count, option in enumerate(upgrade['options']): - option_name = option.get('option', '') - source_str_option = source_str_upgrade + f", option '{option_name}' (Option Number:{option_count})" - if 'apply_logic' in option: - if problems := get_logic_problems(option['apply_logic']): - all_problems.append((source_str_option, problems, option['apply_logic'])) - - if 'package_apply_logic' in upgrade: + if "upgrades" in cfg: + for upgrade_count, upgrade in enumerate(cfg["upgrades"]): + upgrade_name = upgrade.get("upgrade_name", "") + source_str_upgrade = ( + f"upgrade '{upgrade_name}' (Upgrade Number:{upgrade_count})" + ) + for option_count, option in enumerate(upgrade["options"]): + option_name = option.get("option", "") + source_str_option = ( + source_str_upgrade + + f", option '{option_name}' (Option Number:{option_count})" + ) + if "apply_logic" in option: + if problems := get_logic_problems(option["apply_logic"]): + all_problems.append( + (source_str_option, problems, option["apply_logic"]) + ) + + if "package_apply_logic" in upgrade: source_str_package = source_str_upgrade + ", in package_apply_logic" - if problems := get_logic_problems(upgrade['package_apply_logic']): - all_problems.append((source_str_package, problems, upgrade['package_apply_logic'])) + if problems := get_logic_problems(upgrade["package_apply_logic"]): + all_problems.append( + ( + source_str_package, + problems, + upgrade["package_apply_logic"], + ) + ) # TODO: refactor this into Sampler.validate_args - if 'downselect' in cfg or "downselect" in cfg.get('sampler', {}).get('type'): + if "downselect" in cfg or "downselect" in cfg.get("sampler", {}).get("type"): source_str = "in downselect logic" - logic = cfg['downselect']['logic'] if 'downselect' in cfg else cfg['sampler']['args']['logic'] + logic = ( + cfg["downselect"]["logic"] + if "downselect" in cfg + else cfg["sampler"]["args"]["logic"] + ) if problems := get_logic_problems(logic): all_problems.append((source_str, problems, logic)) if all_problems: - error_str = '' + error_str = "" for location, problems, logic in all_problems: error_str += f"There are following problems in {location} with this logic\n{printer.pformat(logic)}\n" problem_str = "\n".join(problems) @@ -664,15 +810,15 @@ def validate_measure_references(project_file): cfg = get_project_configuration(project_file) measure_dirs = set() buildstock_dir = BuildStockBatchBase.get_buildstock_dir(project_file, cfg) - options_lookup_path = f'{buildstock_dir}/resources/options_lookup.tsv' + options_lookup_path = f"{buildstock_dir}/resources/options_lookup.tsv" # fill in the param_option_dict with {'param1':['valid_option1','valid_option2' ...]} from options_lookup.tsv try: - with open(options_lookup_path, 'r') as f: - options = csv.DictReader(f, delimiter='\t') + with open(options_lookup_path, "r") as f: + options = csv.DictReader(f, delimiter="\t") for row in 
options: - if row['Measure Dir']: - measure_dirs.add(row['Measure Dir']) + if row["Measure Dir"]: + measure_dirs.add(row["Measure Dir"]) except FileNotFoundError as err: logger.error(f"Options lookup file not found at: '{options_lookup_path}'") raise err @@ -689,25 +835,29 @@ def get_errors(source_str, measure_str): """ if measure_str not in measure_dirs: closest = difflib.get_close_matches(measure_str, list(measure_dirs)) - return f"Measure directory {measure_str} not found. Closest matches: {closest}" \ + return ( + f"Measure directory {measure_str} not found. Closest matches: {closest}" f" {source_str}\n" - return '' + ) + return "" source_measures_str_list = [] - if 'measures_to_ignore' in cfg['baseline']: + if "measures_to_ignore" in cfg["baseline"]: source_str = "In baseline 'measures_to_ignore'" - for measure_str in cfg['baseline']['measures_to_ignore']: + for measure_str in cfg["baseline"]["measures_to_ignore"]: source_measures_str_list.append((source_str, measure_str)) - error_message = '' + error_message = "" for source_str, measure_str in source_measures_str_list: error_message += get_errors(source_str, measure_str) if not error_message: return True else: - error_message = 'Measure name(s)/directory(ies) is(are) invalid. \n' + error_message + error_message = ( + "Measure name(s)/directory(ies) is(are) invalid. \n" + error_message + ) logger.error(error_message) raise ValidationError(error_message) @@ -720,19 +870,23 @@ def validate_reference_scenario(project_file): # collect all upgrade_names upgrade_names = set() - for upgrade_count, upgrade in enumerate(cfg.get('upgrades', [])): - upgrade_names.add(upgrade.get('upgrade_name', '')) + for upgrade_count, upgrade in enumerate(cfg.get("upgrades", [])): + upgrade_names.add(upgrade.get("upgrade_name", "")) warning_string = "" # check if the reference_scenario matches with any upgrade_names - for upgrade_count, upgrade in enumerate(cfg.get('upgrades', [])): - if 'reference_scenario' in upgrade: - if upgrade['reference_scenario'] not in upgrade_names: - warning_string += f"* In Upgrade '{upgrade.get('upgrade_name', '')}', reference_scenario: " \ + for upgrade_count, upgrade in enumerate(cfg.get("upgrades", [])): + if "reference_scenario" in upgrade: + if upgrade["reference_scenario"] not in upgrade_names: + warning_string += ( + f"* In Upgrade '{upgrade.get('upgrade_name', '')}', reference_scenario: " f"'{upgrade['reference_scenario']}' does not match any existing upgrade names \n" - elif upgrade['reference_scenario'] == upgrade.get('upgrade_name', ''): - warning_string += f"* In Upgrade '{upgrade.get('upgrade_name', '')}', reference_scenario: " \ + ) + elif upgrade["reference_scenario"] == upgrade.get("upgrade_name", ""): + warning_string += ( + f"* In Upgrade '{upgrade.get('upgrade_name', '')}', reference_scenario: " f"'{upgrade['reference_scenario']}' points to the same upgrade \n" + ) if warning_string: logger.warning(warning_string) @@ -746,22 +900,30 @@ def validate_resstock_or_comstock_version(project_file): """ cfg = get_project_configuration(project_file) - buildstock_rb = os.path.join(cfg['buildstock_directory'], 'resources/buildstock.rb') + buildstock_rb = os.path.join( + cfg["buildstock_directory"], "resources/buildstock.rb" + ) if os.path.exists(buildstock_rb): - with open(buildstock_rb, 'r') as f: + with open(buildstock_rb, "r") as f: versions = dict( - re.findall(r"^\s*(ResStock|ComStock|BuildStockBatch)_Version\s*=\s*'(.+)'", f.read(), re.MULTILINE) + re.findall( + 
r"^\s*(ResStock|ComStock|BuildStockBatch)_Version\s*=\s*'(.+)'", + f.read(), + re.MULTILINE, + ) ) - BuildStockBatch_Version = semver.Version.parse(versions['BuildStockBatch']) + BuildStockBatch_Version = semver.Version.parse(versions["BuildStockBatch"]) if bsb_version < BuildStockBatch_Version: - if 'ResStock' in versions.keys(): - stock_version = versions['ResStock'] - elif 'ComStock' in versions.keys(): - stock_version = versions['ComStock'] + if "ResStock" in versions.keys(): + stock_version = versions["ResStock"] + elif "ComStock" in versions.keys(): + stock_version = versions["ComStock"] else: - stock_version = 'Unknown' - val_err = f"BuildStockBatch version {BuildStockBatch_Version} or above is required" \ + stock_version = "Unknown" + val_err = ( + f"BuildStockBatch version {BuildStockBatch_Version} or above is required" f" for ResStock or ComStock version {stock_version}. Found {bsb_version}" + ) raise ValidationError(val_err) return True @@ -790,10 +952,14 @@ def validate_number_of_options(project_file): if m_option: option_number = int(m_option.group(1)) n_options_in_measure = max(option_number, n_options_in_measure) - m_costs = re.match(r"^option_(\d+)_cost_(\d+)_value", str(argument.name)) + m_costs = re.match( + r"^option_(\d+)_cost_(\d+)_value", str(argument.name) + ) if m_costs: cost_number = int(m_costs.group(2)) - n_costs_per_option_in_measure = max(cost_number, n_costs_per_option_in_measure) + n_costs_per_option_in_measure = max( + cost_number, n_costs_per_option_in_measure + ) n_options_in_cfg = 0 n_costs_in_cfg = 0 for upgrade in cfg.get("upgrades", []): @@ -828,24 +994,26 @@ def validate_openstudio_version(project_file): """ cfg = get_project_configuration(project_file) - os_version = cfg.get('os_version', BuildStockBatchBase.DEFAULT_OS_VERSION) - version_path = 'resources/hpxml-measures/HPXMLtoOpenStudio/resources/version.rb' - version_rb = os.path.join(cfg['buildstock_directory'], version_path) + os_version = cfg.get("os_version", BuildStockBatchBase.DEFAULT_OS_VERSION) + version_path = "resources/hpxml-measures/HPXMLtoOpenStudio/resources/version.rb" + version_rb = os.path.join(cfg["buildstock_directory"], version_path) if os.path.exists(version_rb): versions = {} - with open(version_rb, 'r') as f: + with open(version_rb, "r") as f: for line in f: line = line.strip() - for tool in ['OS_HPXML_Version', 'OS_Version']: + for tool in ["OS_HPXML_Version", "OS_Version"]: if line.startswith(tool): - lhs, rhs = line.split('=') - version, _ = rhs.split('#') + lhs, rhs = line.split("=") + version, _ = rhs.split("#") versions[tool] = eval(version.strip()) - OS_HPXML_Version = versions['OS_HPXML_Version'] - OS_Version = versions['OS_Version'] + OS_HPXML_Version = versions["OS_HPXML_Version"] + OS_Version = versions["OS_Version"] if not os_version.startswith(OS_Version): - val_err = f"OS version {OS_Version} is required" \ + val_err = ( + f"OS version {OS_Version} is required" f" for OS-HPXML version {OS_HPXML_Version}. 
Found {os_version}" + ) raise ValidationError(val_err) return True @@ -867,27 +1035,37 @@ def process_results(self, skip_combine=False, use_dask_cluster=True): self.get_dask_client() # noqa F841 try: - wfg_args = self.cfg['workflow_generator'].get('args', {}) - if self.cfg['workflow_generator']['type'] == 'residential_hpxml': - if 'simulation_output_report' in wfg_args.keys(): - if 'timeseries_frequency' in wfg_args['simulation_output_report'].keys(): - do_timeseries = wfg_args['simulation_output_report']['timeseries_frequency'] != 'none' + wfg_args = self.cfg["workflow_generator"].get("args", {}) + if self.cfg["workflow_generator"]["type"] == "residential_hpxml": + if "simulation_output_report" in wfg_args.keys(): + if ( + "timeseries_frequency" + in wfg_args["simulation_output_report"].keys() + ): + do_timeseries = ( + wfg_args["simulation_output_report"]["timeseries_frequency"] + != "none" + ) else: - do_timeseries = 'timeseries_csv_export' in wfg_args.keys() + do_timeseries = "timeseries_csv_export" in wfg_args.keys() fs = self.get_fs() if not skip_combine: - postprocessing.combine_results(fs, self.results_dir, self.cfg, do_timeseries=do_timeseries) + postprocessing.combine_results( + fs, self.results_dir, self.cfg, do_timeseries=do_timeseries + ) - aws_conf = self.cfg.get('postprocessing', {}).get('aws', {}) - if 's3' in aws_conf or 'aws' in self.cfg: - s3_bucket, s3_prefix = self.upload_results(aws_conf, self.output_dir, self.results_dir, self.sampler.csv_path) - if 'athena' in aws_conf: + aws_conf = self.cfg.get("postprocessing", {}).get("aws", {}) + if "s3" in aws_conf or "aws" in self.cfg: + s3_bucket, s3_prefix = self.upload_results( + aws_conf, self.output_dir, self.results_dir, self.sampler.csv_path + ) + if "athena" in aws_conf: postprocessing.create_athena_tables( aws_conf, os.path.basename(self.output_dir), s3_bucket, - s3_prefix + s3_prefix, ) finally: if use_dask_cluster: diff --git a/buildstockbatch/eagle.py b/buildstockbatch/eagle.py index 8a21f1d7..9245bce6 100644 --- a/buildstockbatch/eagle.py +++ b/buildstockbatch/eagle.py @@ -42,7 +42,7 @@ ContainerRuntime, path_rel_to_file, get_project_configuration, - read_csv + read_csv, ) from buildstockbatch import postprocessing from buildstockbatch.__version__ import __version__ as bsb_version @@ -52,34 +52,35 @@ def get_bool_env_var(varname): - return os.environ.get(varname, '0').lower() in ('true', 't', '1', 'y', 'yes') + return os.environ.get(varname, "0").lower() in ("true", "t", "1", "y", "yes") class EagleBatch(BuildStockBatchBase): CONTAINER_RUNTIME = ContainerRuntime.SINGULARITY - DEFAULT_SYS_IMAGE_DIR = '/shared-projects/buildstock/singularity_images' - hpc_name = 'eagle' + DEFAULT_SYS_IMAGE_DIR = "/shared-projects/buildstock/singularity_images" + hpc_name = "eagle" min_sims_per_job = 36 * 2 - local_scratch = pathlib.Path(os.environ.get('LOCAL_SCRATCH', '/tmp/scratch')) - local_project_dir = local_scratch / 'project' - local_buildstock_dir = local_scratch / 'buildstock' - local_weather_dir = local_scratch / 'weather' - local_output_dir = local_scratch / 'output' - local_singularity_img = local_scratch / 'openstudio.simg' - local_housing_characteristics_dir = local_scratch / 'housing_characteristics' + local_scratch = pathlib.Path(os.environ.get("LOCAL_SCRATCH", "/tmp/scratch")) + local_project_dir = local_scratch / "project" + local_buildstock_dir = local_scratch / "buildstock" + local_weather_dir = local_scratch / "weather" + local_output_dir = local_scratch / "output" + local_singularity_img = local_scratch / 
"openstudio.simg" + local_housing_characteristics_dir = local_scratch / "housing_characteristics" def __init__(self, project_filename): super().__init__(project_filename) output_dir = pathlib.Path(self.output_dir) if not os.path.exists(output_dir): os.makedirs(output_dir) - logger.debug('Output directory = {}'.format(output_dir)) + logger.debug("Output directory = {}".format(output_dir)) weather_dir = self.weather_dir # noqa E841 - self.singularity_image = self.get_singularity_image(self.cfg, self.os_version, self.os_sha) - + self.singularity_image = self.get_singularity_image( + self.cfg, self.os_version, self.os_sha + ) @classmethod def validate_project(cls, project_file): @@ -93,18 +94,20 @@ def validate_project(cls, project_file): @classmethod def validate_output_directory_eagle(cls, project_file): cfg = get_project_configuration(project_file) - output_dir = path_rel_to_file(project_file, cfg['output_directory']) + output_dir = path_rel_to_file(project_file, cfg["output_directory"]) if not re.match(r"/(lustre/eaglefs/)?(scratch|projects)", output_dir): - raise ValidationError(f"`output_directory` must be in /scratch or /projects," - f" `output_directory` = {output_dir}") + raise ValidationError( + f"`output_directory` must be in /scratch or /projects," + f" `output_directory` = {output_dir}" + ) @classmethod def validate_singularity_image_eagle(cls, project_file): cfg = get_project_configuration(project_file) singularity_image = cls.get_singularity_image( cfg, - cfg.get('os_version', cls.DEFAULT_OS_VERSION), - cfg.get('os_sha', cls.DEFAULT_OS_SHA) + cfg.get("os_version", cls.DEFAULT_OS_VERSION), + cfg.get("os_sha", cls.DEFAULT_OS_SHA), ) if not os.path.exists(singularity_image): raise ValidationError( @@ -113,12 +116,14 @@ def validate_singularity_image_eagle(cls, project_file): @property def output_dir(self): - output_dir = path_rel_to_file(self.project_filename, self.cfg['output_directory']) + output_dir = path_rel_to_file( + self.project_filename, self.cfg["output_directory"] + ) return output_dir @property def results_dir(self): - results_dir = os.path.join(self.output_dir, 'results') + results_dir = os.path.join(self.output_dir, "results") assert os.path.isdir(results_dir) return results_dir @@ -131,16 +136,15 @@ def clear_and_copy_dir(src, dst): @classmethod def get_singularity_image(cls, cfg, os_version, os_sha): return os.path.join( - cfg.get('sys_image_dir', cls.DEFAULT_SYS_IMAGE_DIR), - 'OpenStudio-{ver}.{sha}-Singularity.simg'.format( - ver=os_version, - sha=os_sha - ) + cfg.get("sys_image_dir", cls.DEFAULT_SYS_IMAGE_DIR), + "OpenStudio-{ver}.{sha}-Singularity.simg".format( + ver=os_version, sha=os_sha + ), ) @property def weather_dir(self): - weather_dir = os.path.join(self.output_dir, 'weather') + weather_dir = os.path.join(self.output_dir, "weather") if not os.path.exists(weather_dir): os.makedirs(weather_dir) self._get_weather_files() @@ -149,10 +153,15 @@ def weather_dir(self): def run_batch(self, sampling_only=False): # Create simulation_output dir - sim_out_ts_dir = pathlib.Path(self.output_dir) / 'results' / 'simulation_output' / 'timeseries' + sim_out_ts_dir = ( + pathlib.Path(self.output_dir) + / "results" + / "simulation_output" + / "timeseries" + ) os.makedirs(sim_out_ts_dir, exist_ok=True) - for i in range(0, len(self.cfg.get('upgrades', [])) + 1): - os.makedirs(sim_out_ts_dir / f'up{i:02d}') + for i in range(0, len(self.cfg.get("upgrades", [])) + 1): + os.makedirs(sim_out_ts_dir / f"up{i:02d}") # create destination_dir and copy housing_characteristics into it 
logger.debug("Copying housing characteristics") @@ -160,8 +169,7 @@ def run_batch(self, sampling_only=False): if os.path.exists(destination_dir): shutil.rmtree(destination_dir) shutil.copytree( - os.path.join(self.project_dir, 'housing_characteristics'), - destination_dir + os.path.join(self.project_dir, "housing_characteristics"), destination_dir ) logger.debug("Housing characteristics copied.") @@ -183,16 +191,18 @@ def run_batch(self, sampling_only=False): building_ids = df.index.tolist() n_datapoints = len(building_ids) # number of simulations is number of buildings * number of upgrades - n_sims = n_datapoints * (len(self.cfg.get('upgrades', [])) + 1) + n_sims = n_datapoints * (len(self.cfg.get("upgrades", [])) + 1) # this is the number of simulations defined for this run as a "full job" # number of simulations per job if we believe the .yml file n_jobs - n_sims_per_job = math.ceil(n_sims / self.cfg[self.hpc_name]['n_jobs']) + n_sims_per_job = math.ceil(n_sims / self.cfg[self.hpc_name]["n_jobs"]) # use more appropriate batch size in the case of n_jobs being much # larger than we need, now that we know n_sims n_sims_per_job = max(n_sims_per_job, self.min_sims_per_job) - upgrade_sims = itertools.product(building_ids, range(len(self.cfg.get('upgrades', [])))) + upgrade_sims = itertools.product( + building_ids, range(len(self.cfg.get("upgrades", []))) + ) if not self.skip_baseline_sims: # create batches of simulations baseline_sims = zip(building_ids, itertools.repeat(None)) @@ -206,76 +216,89 @@ def run_batch(self, sampling_only=False): batch = list(itertools.islice(all_sims_iter, n_sims_per_job)) if not batch: break - logger.info('Queueing job {} ({} simulations)'.format(i, len(batch))) - job_json_filename = os.path.join(self.output_dir, 'job{:03d}.json'.format(i)) - with open(job_json_filename, 'w') as f: - json.dump({ - 'job_num': i, - 'n_datapoints': n_datapoints, - 'batch': batch, - }, f, indent=4) + logger.info("Queueing job {} ({} simulations)".format(i, len(batch))) + job_json_filename = os.path.join( + self.output_dir, "job{:03d}.json".format(i) + ) + with open(job_json_filename, "w") as f: + json.dump( + { + "job_num": i, + "n_datapoints": n_datapoints, + "batch": batch, + }, + f, + indent=4, + ) # now queue them jobids = self.queue_jobs() # queue up post-processing to run after all the simulation jobs are complete - if not get_bool_env_var('MEASURESONLY'): + if not get_bool_env_var("MEASURESONLY"): self.queue_post_processing(jobids) def run_job_batch(self, job_array_number): self.clear_and_copy_dir( - pathlib.Path(self.buildstock_dir) / 'resources', - self.local_buildstock_dir / 'resources' + pathlib.Path(self.buildstock_dir) / "resources", + self.local_buildstock_dir / "resources", ) self.clear_and_copy_dir( - pathlib.Path(self.buildstock_dir) / 'measures', - self.local_buildstock_dir / 'measures' + pathlib.Path(self.buildstock_dir) / "measures", + self.local_buildstock_dir / "measures", ) - if os.path.exists(pathlib.Path(self.buildstock_dir) / 'resources/hpxml-measures'): + if os.path.exists( + pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures" + ): self.clear_and_copy_dir( - pathlib.Path(self.buildstock_dir) / 'resources/hpxml-measures', - self.local_buildstock_dir / 'resources/hpxml-measures' + pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures", + self.local_buildstock_dir / "resources/hpxml-measures", ) + self.clear_and_copy_dir(self.weather_dir, self.local_weather_dir) self.clear_and_copy_dir( - self.weather_dir, - self.local_weather_dir - ) - 
self.clear_and_copy_dir( - pathlib.Path(self.output_dir) / 'housing_characteristics', - self.local_housing_characteristics_dir + pathlib.Path(self.output_dir) / "housing_characteristics", + self.local_housing_characteristics_dir, ) if os.path.exists(self.local_singularity_img): os.remove(self.local_singularity_img) shutil.copy2(self.singularity_image, self.local_singularity_img) # Run the job batch as normal - job_json_filename = os.path.join(self.output_dir, 'job{:03d}.json'.format(job_array_number)) - with open(job_json_filename, 'r') as f: + job_json_filename = os.path.join( + self.output_dir, "job{:03d}.json".format(job_array_number) + ) + with open(job_json_filename, "r") as f: args = json.load(f) # trim the buildstock.csv file to only include rows for current batch. Helps speed up simulation logger.debug("Trimming buildstock.csv") - building_ids = {x[0] for x in args['batch']} - buildstock_csv_path = self.local_housing_characteristics_dir / 'buildstock.csv' + building_ids = {x[0] for x in args["batch"]} + buildstock_csv_path = self.local_housing_characteristics_dir / "buildstock.csv" valid_rows = [] - with open(buildstock_csv_path, 'r', encoding='utf-8') as f: + with open(buildstock_csv_path, "r", encoding="utf-8") as f: csv_reader = csv.DictReader(f) for row in csv_reader: - if int(row['Building']) in building_ids: + if int(row["Building"]) in building_ids: valid_rows.append(row) df = pd.DataFrame.from_records(valid_rows) df.to_csv(buildstock_csv_path, index=False) logger.debug(f"Buildstock.csv trimmed to {len(df)} rows.") - traceback_file_path = self.local_output_dir / 'simulation_output' / f'traceback{job_array_number}.out' + traceback_file_path = ( + self.local_output_dir + / "simulation_output" + / f"traceback{job_array_number}.out" + ) @delayed def run_building_d(i, upgrade_idx): try: - return self.run_building(self.output_dir, self.cfg, args['n_datapoints'], i, upgrade_idx) + return self.run_building( + self.output_dir, self.cfg, args["n_datapoints"], i, upgrade_idx + ) except Exception: - with open(traceback_file_path, 'a') as f: + with open(traceback_file_path, "a") as f: txt = get_error_details() txt = "\n" + "#" * 20 + "\n" + f"Traceback for building{i}\n" + txt f.write(txt) @@ -286,39 +309,44 @@ def run_building_d(i, upgrade_idx): # Run the simulations, get the data_point_out.json info from each tick = time.time() with Parallel(n_jobs=-1, verbose=9) as parallel: - dpouts = parallel(itertools.starmap(run_building_d, args['batch'])) + dpouts = parallel(itertools.starmap(run_building_d, args["batch"])) tick = time.time() - tick - logger.info('Simulation time: {:.2f} minutes'.format(tick / 60.)) + logger.info("Simulation time: {:.2f} minutes".format(tick / 60.0)) # Save the aggregated dpouts as a json file - lustre_sim_out_dir = pathlib.Path(self.results_dir) / 'simulation_output' - results_json = lustre_sim_out_dir / f'results_job{job_array_number}.json.gz' - logger.info(f'Writing results to {results_json}') - with gzip.open(results_json, 'wt', encoding='utf-8') as f: + lustre_sim_out_dir = pathlib.Path(self.results_dir) / "simulation_output" + results_json = lustre_sim_out_dir / f"results_job{job_array_number}.json.gz" + logger.info(f"Writing results to {results_json}") + with gzip.open(results_json, "wt", encoding="utf-8") as f: json.dump(dpouts, f) # Compress simulation results - if self.cfg.get('max_minutes_per_sim') is not None: + if self.cfg.get("max_minutes_per_sim") is not None: time.sleep(60) # Allow results JSON to finish writing - simout_filename = 
lustre_sim_out_dir / f'simulations_job{job_array_number}.tar.gz' - logger.info(f'Compressing simulation outputs to {simout_filename}') - local_sim_out_dir = self.local_output_dir / 'simulation_output' + simout_filename = ( + lustre_sim_out_dir / f"simulations_job{job_array_number}.tar.gz" + ) + logger.info(f"Compressing simulation outputs to {simout_filename}") + local_sim_out_dir = self.local_output_dir / "simulation_output" subprocess.run( [ - 'tar', - 'cf', str(simout_filename), - '-I', 'pigz', - '-C', str(local_sim_out_dir), - '.' + "tar", + "cf", + str(simout_filename), + "-I", + "pigz", + "-C", + str(local_sim_out_dir), + ".", ], - check=True + check=True, ) # copy the tracebacks if it exists if os.path.exists(traceback_file_path): shutil.copy2(traceback_file_path, lustre_sim_out_dir) - logger.info('batch complete') + logger.info("batch complete") @classmethod def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): @@ -327,114 +355,145 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): upgrade_id = 0 if upgrade_idx is None else upgrade_idx + 1 try: - sim_id, sim_dir = cls.make_sim_dir(i, upgrade_idx, os.path.join(cls.local_output_dir, 'simulation_output')) + sim_id, sim_dir = cls.make_sim_dir( + i, upgrade_idx, os.path.join(cls.local_output_dir, "simulation_output") + ) except SimulationExists as ex: sim_dir = ex.sim_dir else: # Generate the osw for this simulation - osw = cls.create_osw(cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx) - with open(os.path.join(sim_dir, 'in.osw'), 'w') as f: + osw = cls.create_osw( + cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx + ) + with open(os.path.join(sim_dir, "in.osw"), "w") as f: json.dump(osw, f, indent=4) # Copy other necessary stuff into the simulation directory dirs_to_mount = [ - os.path.join(cls.local_buildstock_dir, 'measures'), + os.path.join(cls.local_buildstock_dir, "measures"), cls.local_weather_dir, ] # Create a temporary directory for the simulation to use - with tempfile.TemporaryDirectory(dir=cls.local_scratch, prefix=f"{sim_id}_") as tmpdir: + with tempfile.TemporaryDirectory( + dir=cls.local_scratch, prefix=f"{sim_id}_" + ) as tmpdir: # Build the command to instantiate and configure the singularity container the simulation is run inside - local_resources_dir = cls.local_buildstock_dir / 'resources' + local_resources_dir = cls.local_buildstock_dir / "resources" args = [ - 'singularity', 'exec', - '--contain', - '-e', - '--pwd', '/var/simdata/openstudio', - '-B', f'{sim_dir}:/var/simdata/openstudio', - '-B', f'{local_resources_dir}:/lib/resources', - '-B', f'{cls.local_housing_characteristics_dir}:/lib/housing_characteristics', - '-B', f'{tmpdir}:/tmp' - ] - runscript = [ - 'ln -s /lib /var/simdata/openstudio/lib' + "singularity", + "exec", + "--contain", + "-e", + "--pwd", + "/var/simdata/openstudio", + "-B", + f"{sim_dir}:/var/simdata/openstudio", + "-B", + f"{local_resources_dir}:/lib/resources", + "-B", + f"{cls.local_housing_characteristics_dir}:/lib/housing_characteristics", + "-B", + f"{tmpdir}:/tmp", ] + runscript = ["ln -s /lib /var/simdata/openstudio/lib"] for src in dirs_to_mount: - container_mount = '/' + os.path.basename(src) - args.extend(['-B', '{}:{}:ro'.format(src, container_mount)]) - container_symlink = os.path.join('/var/simdata/openstudio', os.path.basename(src)) - runscript.append('ln -s {} {}'.format(*map(shlex.quote, (container_mount, container_symlink)))) + container_mount = "/" + os.path.basename(src) + 
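
Stripped of the singularity bind mounts assembled here, the execution pattern below is: build a runscript as a list of shell lines, pipe it to bash -x on stdin, and bound the wall clock with subprocess's timeout. A container-free sketch of that pattern, with placeholder script lines:

import subprocess

runscript = ["echo setting up", "echo running simulation"]
try:
    subprocess.run(
        ["bash", "-x"],
        input="\n".join(runscript).encode("utf-8"),
        check=True,
        timeout=60,  # the real code uses max_minutes_per_sim * 60 when configured
    )
except subprocess.TimeoutExpired:
    print("simulation hit the wall-clock limit")
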
args.extend(["-B", "{}:{}:ro".format(src, container_mount)]) + container_symlink = os.path.join( + "/var/simdata/openstudio", os.path.basename(src) + ) + runscript.append( + "ln -s {} {}".format( + *map(shlex.quote, (container_mount, container_symlink)) + ) + ) - if os.path.exists(os.path.join(cls.local_buildstock_dir, 'resources/hpxml-measures')): - runscript.append('ln -s /resources /var/simdata/openstudio/resources') - src = os.path.join(cls.local_buildstock_dir, 'resources/hpxml-measures') - container_mount = '/resources/hpxml-measures' - args.extend(['-B', '{}:{}:ro'.format(src, container_mount)]) + if os.path.exists( + os.path.join(cls.local_buildstock_dir, "resources/hpxml-measures") + ): + runscript.append( + "ln -s /resources /var/simdata/openstudio/resources" + ) + src = os.path.join( + cls.local_buildstock_dir, "resources/hpxml-measures" + ) + container_mount = "/resources/hpxml-measures" + args.extend(["-B", "{}:{}:ro".format(src, container_mount)]) # Build the openstudio command that will be issued within the # singularity container If custom gems are to be used in the # singularity container add extra bundle arguments to the cli # command - cli_cmd = 'openstudio run -w in.osw' - if cfg.get('baseline', dict()).get('custom_gems', False): + cli_cmd = "openstudio run -w in.osw" + if cfg.get("baseline", dict()).get("custom_gems", False): cli_cmd = ( - 'openstudio --bundle /var/oscli/Gemfile --bundle_path /var/oscli/gems ' - '--bundle_without native_ext run -w in.osw --debug' + "openstudio --bundle /var/oscli/Gemfile --bundle_path /var/oscli/gems " + "--bundle_without native_ext run -w in.osw --debug" ) - if get_bool_env_var('MEASURESONLY'): - cli_cmd += ' --measures_only' + if get_bool_env_var("MEASURESONLY"): + cli_cmd += " --measures_only" runscript.append(cli_cmd) - args.extend([ - str(cls.local_singularity_img), - 'bash', '-x' - ]) + args.extend([str(cls.local_singularity_img), "bash", "-x"]) env_vars = dict(os.environ) - env_vars['SINGULARITYENV_BUILDSTOCKBATCH_VERSION'] = bsb_version - logger.debug('\n'.join(map(str, args))) - max_time_min = cfg.get('max_minutes_per_sim') + env_vars["SINGULARITYENV_BUILDSTOCKBATCH_VERSION"] = bsb_version + logger.debug("\n".join(map(str, args))) + max_time_min = cfg.get("max_minutes_per_sim") if max_time_min is not None: subprocess_kw = {"timeout": max_time_min * 60} else: subprocess_kw = {} start_time = dt.datetime.now() - with open(os.path.join(sim_dir, 'openstudio_output.log'), 'w') as f_out: + with open(os.path.join(sim_dir, "openstudio_output.log"), "w") as f_out: try: subprocess.run( args, check=True, - input='\n'.join(runscript).encode('utf-8'), + input="\n".join(runscript).encode("utf-8"), stdout=f_out, stderr=subprocess.STDOUT, cwd=cls.local_output_dir, env=env_vars, - **subprocess_kw + **subprocess_kw, ) except subprocess.TimeoutExpired: end_time = dt.datetime.now() - msg = f'Terminated {sim_id} after reaching max time of {max_time_min} minutes' - f_out.write(f'[{end_time.now()} ERROR] {msg}') + msg = f"Terminated {sim_id} after reaching max time of {max_time_min} minutes" + f_out.write(f"[{end_time.now()} ERROR] {msg}") logger.warning(msg) - with open(os.path.join(sim_dir, 'out.osw'), 'w') as out_osw: + with open(os.path.join(sim_dir, "out.osw"), "w") as out_osw: out_msg = { - 'started_at': start_time.strftime('%Y%m%dT%H%M%SZ'), - 'completed_at': end_time.strftime('%Y%m%dT%H%M%SZ'), - 'completed_status': 'Fail', - 'timeout': msg + "started_at": start_time.strftime("%Y%m%dT%H%M%SZ"), + "completed_at": 
end_time.strftime("%Y%m%dT%H%M%SZ"), + "completed_status": "Fail", + "timeout": msg, } out_osw.write(json.dumps(out_msg, indent=3)) - with open(os.path.join(sim_dir, 'run', 'out.osw'), 'a') as run_log: - run_log.write(f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}") - with open(os.path.join(sim_dir, 'run', 'failed.job'), 'w') as failed_job: - failed_job.write(f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}") - time.sleep(60) # Wait for EnergyPlus to release file locks and data_point.zip to finish + with open( + os.path.join(sim_dir, "run", "out.osw"), "a" + ) as run_log: + run_log.write( + f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}" + ) + with open( + os.path.join(sim_dir, "run", "failed.job"), "w" + ) as failed_job: + failed_job.write( + f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}" + ) + time.sleep( + 60 + ) # Wait for EnergyPlus to release file locks and data_point.zip to finish except subprocess.CalledProcessError: pass finally: # Clean up the symbolic links we created in the container - for mount_dir in dirs_to_mount + [os.path.join(sim_dir, 'lib')]: + for mount_dir in dirs_to_mount + [os.path.join(sim_dir, "lib")]: try: - os.unlink(os.path.join(sim_dir, os.path.basename(mount_dir))) + os.unlink( + os.path.join(sim_dir, os.path.basename(mount_dir)) + ) except FileNotFoundError: pass @@ -442,166 +501,195 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): cls.cleanup_sim_dir( sim_dir, fs, - f'{output_dir}/results/simulation_output/timeseries', + f"{output_dir}/results/simulation_output/timeseries", upgrade_id, - i + i, ) reporting_measures = cls.get_reporting_measures(cfg) - dpout = postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, i) + dpout = postprocessing.read_simulation_outputs( + fs, reporting_measures, sim_dir, upgrade_id, i + ) return dpout def queue_jobs(self, array_ids=None, hipri=False): - eagle_cfg = self.cfg['eagle'] - with open(os.path.join(self.output_dir, 'job001.json'), 'r') as f: + eagle_cfg = self.cfg["eagle"] + with open(os.path.join(self.output_dir, "job001.json"), "r") as f: job_json = json.load(f) - n_sims_per_job = len(job_json['batch']) + n_sims_per_job = len(job_json["batch"]) del job_json if array_ids: - array_spec = ','.join(map(str, array_ids)) + array_spec = ",".join(map(str, array_ids)) else: - jobjson_re = re.compile(r'job(\d+).json') - array_max = max(map( - lambda m: int(m.group(1)), - filter(lambda m: m is not None, map(jobjson_re.match, (os.listdir(self.output_dir)))) - )) - array_spec = '1-{}'.format(array_max) - account = eagle_cfg['account'] + jobjson_re = re.compile(r"job(\d+).json") + array_max = max( + map( + lambda m: int(m.group(1)), + filter( + lambda m: m is not None, + map(jobjson_re.match, (os.listdir(self.output_dir))), + ), + ) + ) + array_spec = "1-{}".format(array_max) + account = eagle_cfg["account"] # Estimate the wall time in minutes cores_per_node = 36 - minutes_per_sim = eagle_cfg['minutes_per_sim'] - walltime = math.ceil(math.ceil(n_sims_per_job / cores_per_node) * minutes_per_sim) + minutes_per_sim = eagle_cfg["minutes_per_sim"] + walltime = math.ceil( + math.ceil(n_sims_per_job / cores_per_node) * minutes_per_sim + ) # Queue up simulations here = os.path.dirname(os.path.abspath(__file__)) - eagle_sh = os.path.join(here, 'eagle.sh') + eagle_sh = os.path.join(here, "eagle.sh") env = {} env.update(os.environ) - env['PROJECTFILE'] = self.project_filename - env['MY_CONDA_ENV'] = os.environ['CONDA_PREFIX'] + env["PROJECTFILE"] = self.project_filename + 
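
queue_jobs depends on scraping the job id out of sbatch's single line of stdout, as the re.search below shows. The same parse with canned output, so it runs without Slurm:

import re

stdout = "Submitted batch job 1234567\n"  # canned sbatch output
m = re.search(r"Submitted batch job (\d+)", stdout)
if not m:
    raise RuntimeError("Didn't receive job id back from sbatch")
print(m.group(1))  # -> 1234567
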
env["MY_CONDA_ENV"] = os.environ["CONDA_PREFIX"] args = [ - 'sbatch', - '--account={}'.format(account), - '--time={}'.format(walltime), - '--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY', - '--array={}'.format(array_spec), - '--output=job.out-%a', - '--job-name=bstk', - eagle_sh + "sbatch", + "--account={}".format(account), + "--time={}".format(walltime), + "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY", + "--array={}".format(array_spec), + "--output=job.out-%a", + "--job-name=bstk", + eagle_sh, ] - if os.environ.get('SLURM_JOB_QOS'): - args.insert(-1, '--qos={}'.format(os.environ.get('SLURM_JOB_QOS'))) + if os.environ.get("SLURM_JOB_QOS"): + args.insert(-1, "--qos={}".format(os.environ.get("SLURM_JOB_QOS"))) elif hipri: - args.insert(-1, '--qos=high') + args.insert(-1, "--qos=high") - logger.debug(' '.join(args)) + logger.debug(" ".join(args)) resp = subprocess.run( args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, - encoding='utf-8', - cwd=self.output_dir + encoding="utf-8", + cwd=self.output_dir, ) try: resp.check_returncode() except subprocess.CalledProcessError as ex: logger.error(ex.stderr) raise - for line in resp.stdout.split('\n'): - logger.debug('sbatch:' + line) - m = re.search(r'Submitted batch job (\d+)', resp.stdout) + for line in resp.stdout.split("\n"): + logger.debug("sbatch:" + line) + m = re.search(r"Submitted batch job (\d+)", resp.stdout) if not m: - logger.error('Did not receive job id back from sbatch:') - raise RuntimeError('Didn\'t receive job id back from sbatch') + logger.error("Did not receive job id back from sbatch:") + raise RuntimeError("Didn't receive job id back from sbatch") job_id = m.group(1) return [job_id] def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False): # Configuration values - account = self.cfg['eagle']['account'] - walltime = self.cfg['eagle'].get('postprocessing', {}).get('time', '1:30:00') - memory = self.cfg['eagle'].get('postprocessing', {}).get('node_memory_mb', 85248) - n_procs = self.cfg['eagle'].get('postprocessing', {}).get('n_procs', 18) - n_workers = self.cfg['eagle'].get('postprocessing', {}).get('n_workers', 2) - print(f"Submitting job to {n_workers} {memory}MB memory nodes using {n_procs} cores in each.") + account = self.cfg["eagle"]["account"] + walltime = self.cfg["eagle"].get("postprocessing", {}).get("time", "1:30:00") + memory = ( + self.cfg["eagle"].get("postprocessing", {}).get("node_memory_mb", 85248) + ) + n_procs = self.cfg["eagle"].get("postprocessing", {}).get("n_procs", 18) + n_workers = self.cfg["eagle"].get("postprocessing", {}).get("n_workers", 2) + print( + f"Submitting job to {n_workers} {memory}MB memory nodes using {n_procs} cores in each." + ) # Throw an error if the files already exist. if not upload_only: - for subdir in ('parquet', 'results_csvs'): - subdirpath = pathlib.Path(self.output_dir, 'results', subdir) + for subdir in ("parquet", "results_csvs"): + subdirpath = pathlib.Path(self.output_dir, "results", subdir) if subdirpath.exists(): - raise FileExistsError(f'{subdirpath} already exists. This means you may have run postprocessing already. If you are sure you want to rerun, delete that directory and try again.') # noqa E501 + raise FileExistsError( + f"{subdirpath} already exists. This means you may have run postprocessing already. If you are sure you want to rerun, delete that directory and try again." 
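
The log rotation just below renames each existing output file with its last-modified timestamp instead of overwriting it. The same move in isolation, with a hypothetical filename (a no-op if the file does not exist):

import datetime as dt
import os
import pathlib
import shutil

filepath = pathlib.Path("postprocessing.out")
if filepath.exists():
    last_mod = dt.datetime.fromtimestamp(os.path.getmtime(filepath))
    shutil.move(
        filepath,
        filepath.parent / f"{filepath.stem}_{last_mod:%Y%m%d%H%M}{filepath.suffix}",
    )
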
+ ) # noqa E501 # Move old output logs and config to make way for new ones - for filename in ('dask_scheduler.json', 'dask_scheduler.out', 'dask_workers.out', 'postprocessing.out'): + for filename in ( + "dask_scheduler.json", + "dask_scheduler.out", + "dask_workers.out", + "postprocessing.out", + ): filepath = pathlib.Path(self.output_dir, filename) if filepath.exists(): last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath)) shutil.move( filepath, - filepath.parent / f'{filepath.stem}_{last_mod_date:%Y%m%d%H%M}{filepath.suffix}' + filepath.parent + / f"{filepath.stem}_{last_mod_date:%Y%m%d%H%M}{filepath.suffix}", ) env = {} env.update(os.environ) - env['PROJECTFILE'] = self.project_filename - env['MY_CONDA_ENV'] = os.environ['CONDA_PREFIX'] - env['OUT_DIR'] = self.output_dir - env['UPLOADONLY'] = str(upload_only) - env['MEMORY'] = str(memory) - env['NPROCS'] = str(n_procs) + env["PROJECTFILE"] = self.project_filename + env["MY_CONDA_ENV"] = os.environ["CONDA_PREFIX"] + env["OUT_DIR"] = self.output_dir + env["UPLOADONLY"] = str(upload_only) + env["MEMORY"] = str(memory) + env["NPROCS"] = str(n_procs) here = os.path.dirname(os.path.abspath(__file__)) - eagle_post_sh = os.path.join(here, 'eagle_postprocessing.sh') + eagle_post_sh = os.path.join(here, "eagle_postprocessing.sh") args = [ - 'sbatch', - '--account={}'.format(account), - '--time={}'.format(walltime), - '--export=PROJECTFILE,MY_CONDA_ENV,OUT_DIR,UPLOADONLY,MEMORY,NPROCS', - '--job-name=bstkpost', - '--output=postprocessing.out', - '--nodes=1', - ':', - '--mem={}'.format(memory), - '--output=dask_workers.out', - '--nodes={}'.format(n_workers), - eagle_post_sh + "sbatch", + "--account={}".format(account), + "--time={}".format(walltime), + "--export=PROJECTFILE,MY_CONDA_ENV,OUT_DIR,UPLOADONLY,MEMORY,NPROCS", + "--job-name=bstkpost", + "--output=postprocessing.out", + "--nodes=1", + ":", + "--mem={}".format(memory), + "--output=dask_workers.out", + "--nodes={}".format(n_workers), + eagle_post_sh, ] if after_jobids: - args.insert(4, '--dependency=afterany:{}'.format(':'.join(after_jobids))) + args.insert(4, "--dependency=afterany:{}".format(":".join(after_jobids))) - if os.environ.get('SLURM_JOB_QOS'): - args.insert(-1, '--qos={}'.format(os.environ.get('SLURM_JOB_QOS'))) + if os.environ.get("SLURM_JOB_QOS"): + args.insert(-1, "--qos={}".format(os.environ.get("SLURM_JOB_QOS"))) elif hipri: - args.insert(-1, '--qos=high') + args.insert(-1, "--qos=high") resp = subprocess.run( args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env, - encoding='utf-8', - cwd=self.output_dir + encoding="utf-8", + cwd=self.output_dir, ) - for line in resp.stdout.split('\n'): - logger.debug('sbatch: {}'.format(line)) + for line in resp.stdout.split("\n"): + logger.debug("sbatch: {}".format(line)) def get_dask_client(self): - if get_bool_env_var('DASKLOCALCLUSTER'): - cluster = LocalCluster(local_directory='/data/dask-tmp') + if get_bool_env_var("DASKLOCALCLUSTER"): + cluster = LocalCluster(local_directory="/data/dask-tmp") return Client(cluster) else: - return Client(scheduler_file=os.path.join(self.output_dir, 'dask_scheduler.json')) + return Client( + scheduler_file=os.path.join(self.output_dir, "dask_scheduler.json") + ) def process_results(self, *args, **kwargs): # Check that all the jobs succeeded before proceeding failed_job_array_ids = self.get_failed_job_array_ids() if failed_job_array_ids: - logger.error("The following simulation jobs failed: {}".format(", ".join(map(str, failed_job_array_ids)))) - logger.error("Please inspect 
those jobs and fix any problems before resubmitting.") + logger.error( + "The following simulation jobs failed: {}".format( + ", ".join(map(str, failed_job_array_ids)) + ) + ) + logger.error( + "Please inspect those jobs and fix any problems before resubmitting." + ) logger.critical("Postprocessing cancelled.") return False @@ -617,11 +705,11 @@ def _get_job_ids_for_file_pattern(self, pat): return job_ids def get_failed_job_array_ids(self): - job_out_files = sorted(pathlib.Path(self.output_dir).glob('job.out-*')) + job_out_files = sorted(pathlib.Path(self.output_dir).glob("job.out-*")) failed_job_ids = set() for filename in job_out_files: - with open(filename, 'r') as f: + with open(filename, "r") as f: if not re.search(r"batch complete", f.read()): job_id = int(re.match(r"job\.out-(\d+)", filename.name).group(1)) logger.debug(f"Array Job ID {job_id} had a failure.") @@ -644,28 +732,29 @@ def rerun_failed_jobs(self, hipri=False): output_path = pathlib.Path(self.output_dir) results_path = pathlib.Path(self.results_dir) - prev_failed_job_out_dir = output_path / 'prev_failed_jobs' + prev_failed_job_out_dir = output_path / "prev_failed_jobs" os.makedirs(prev_failed_job_out_dir, exist_ok=True) for job_array_id in failed_job_array_ids: # Move the failed job.out file so it doesn't get overwritten - filepath = output_path / f'job.out-{job_array_id}' + filepath = output_path / f"job.out-{job_array_id}" if filepath.exists(): last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath)) shutil.move( filepath, - prev_failed_job_out_dir / f'{filepath.name}_{last_mod_date:%Y%m%d%H%M}' + prev_failed_job_out_dir + / f"{filepath.name}_{last_mod_date:%Y%m%d%H%M}", ) # Delete simulation results for jobs we're about to rerun - files_to_delete = [f'simulations_job{job_array_id}.tar.gz', f'results_job{job_array_id}.json.gz'] + files_to_delete = [ + f"simulations_job{job_array_id}.tar.gz", + f"results_job{job_array_id}.json.gz", + ] for filename in files_to_delete: - (results_path / 'simulation_output' / filename).unlink(missing_ok=True) + (results_path / "simulation_output" / filename).unlink(missing_ok=True) # Clear out postprocessed data so we can start from a clean slate - dirs_to_delete = [ - results_path / 'results_csvs', - results_path / 'parquet' - ] + dirs_to_delete = [results_path / "results_csvs", results_path / "parquet"] for x in dirs_to_delete: if x.exists(): shutil.rmtree(x) @@ -675,41 +764,37 @@ def rerun_failed_jobs(self, hipri=False): logging_config = { - 'version': 1, - 'disable_existing_loggers': True, - 'formatters': { - 'defaultfmt': { - 'format': '%(levelname)s:%(asctime)s:%(name)s:%(message)s', - 'datefmt': '%Y-%m-%d %H:%M:%S' - } + "version": 1, + "disable_existing_loggers": True, + "formatters": { + "defaultfmt": { + "format": "%(levelname)s:%(asctime)s:%(name)s:%(message)s", + "datefmt": "%Y-%m-%d %H:%M:%S", + } + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "formatter": "defaultfmt", + "level": "DEBUG", + "stream": "ext://sys.stdout", + } + }, + "loggers": { + "__main__": {"level": "DEBUG", "propagate": True, "handlers": ["console"]}, + "buildstockbatch": { + "level": "DEBUG", + "propagate": True, + "handlers": ["console"], }, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', - 'formatter': 'defaultfmt', - 'level': 'DEBUG', - 'stream': 'ext://sys.stdout', - } - }, - 'loggers': { - '__main__': { - 'level': 'DEBUG', - 'propagate': True, - 'handlers': ['console'] - }, - 'buildstockbatch': { - 'level': 'DEBUG', - 'propagate': True, - 
'handlers': ['console'] - } - }, - } + }, +} def user_cli(argv=sys.argv[1:]): - ''' + """ This is the user entry point for running buildstockbatch on Eagle - ''' + """ # set up logging, currently based on within-this-file hard-coded config logging.config.dictConfig(logging_config) @@ -718,42 +803,39 @@ def user_cli(argv=sys.argv[1:]): # CLI arguments parser = argparse.ArgumentParser() - parser.add_argument('project_filename') + parser.add_argument("project_filename") parser.add_argument( - '--hipri', - action='store_true', - help='Submit this job to the high priority queue. Uses 2x node hours.' + "--hipri", + action="store_true", + help="Submit this job to the high priority queue. Uses 2x node hours.", ) parser.add_argument( - '-m', '--measures_only', - action='store_true', - help='Only apply the measures, but don\'t run simulations. Useful for debugging.' + "-m", + "--measures_only", + action="store_true", + help="Only apply the measures, but don't run simulations. Useful for debugging.", ) group = parser.add_mutually_exclusive_group() group.add_argument( - '--postprocessonly', - help='Only do postprocessing, useful for when the simulations are already done', - action='store_true' + "--postprocessonly", + help="Only do postprocessing, useful for when the simulations are already done", + action="store_true", ) group.add_argument( - '--uploadonly', - help='Only upload to S3, useful when postprocessing is already done. Ignores the upload flag in yaml', - action='store_true' + "--uploadonly", + help="Only upload to S3, useful when postprocessing is already done. Ignores the upload flag in yaml", + action="store_true", ) group.add_argument( - '--validateonly', - help='Only validate the project YAML file and references. Nothing is executed', - action='store_true' + "--validateonly", + help="Only validate the project YAML file and references. Nothing is executed", + action="store_true", ) group.add_argument( - '--samplingonly', - help='Run the sampling only.', - action='store_true' + "--samplingonly", help="Run the sampling only.", action="store_true" ) group.add_argument( - '--rerun_failed', - help='Rerun the failed jobs', - action='store_true' + "--rerun_failed", help="Rerun the failed jobs", action="store_true" ) # parse CLI arguments @@ -762,10 +844,10 @@ def user_cli(argv=sys.argv[1:]): # load the yaml project file if not os.path.isfile(args.project_filename): raise FileNotFoundError( - 'The project file {} doesn\'t exist'.format(args.project_filename) + "The project file {} doesn't exist".format(args.project_filename) ) project_filename = os.path.abspath(args.project_filename) - with open(project_filename, 'r') as f: + with open(project_filename, "r") as f: cfg = yaml.load(f, Loader=yaml.SafeLoader) # validate the project, and in case of the --validateonly flag return True if validation passes @@ -786,35 +868,37 @@ def user_cli(argv=sys.argv[1:]): # otherwise, queue up the whole eagle buildstockbatch process # the main work of the first Eagle job is to run the sampling script ... - eagle_sh = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'eagle.sh') + eagle_sh = os.path.join(os.path.dirname(os.path.abspath(__file__)), "eagle.sh") assert os.path.exists(eagle_sh) - out_dir = cfg['output_directory'] + out_dir = cfg["output_directory"] if os.path.exists(out_dir): raise FileExistsError( - 'The output directory {} already exists. Please delete it or choose another.'.format(out_dir) + "The output directory {} already exists. 
Please delete it or choose another.".format( + out_dir + ) ) - logger.info('Creating output directory {}'.format(out_dir)) + logger.info("Creating output directory {}".format(out_dir)) os.makedirs(out_dir) env = {} env.update(os.environ) - env['PROJECTFILE'] = project_filename - env['MY_CONDA_ENV'] = os.environ['CONDA_PREFIX'] - env['MEASURESONLY'] = str(int(args.measures_only)) - env['SAMPLINGONLY'] = str(int(args.samplingonly)) + env["PROJECTFILE"] = project_filename + env["MY_CONDA_ENV"] = os.environ["CONDA_PREFIX"] + env["MEASURESONLY"] = str(int(args.measures_only)) + env["SAMPLINGONLY"] = str(int(args.samplingonly)) subargs = [ - 'sbatch', - '--time={}'.format(cfg['eagle'].get('sampling', {}).get('time', 60)), - '--account={}'.format(cfg['eagle']['account']), - '--nodes=1', - '--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY', - '--output=sampling.out', - eagle_sh + "sbatch", + "--time={}".format(cfg["eagle"].get("sampling", {}).get("time", 60)), + "--account={}".format(cfg["eagle"]["account"]), + "--nodes=1", + "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY", + "--output=sampling.out", + eagle_sh, ] if args.hipri: - subargs.insert(-1, '--qos=high') - logger.info('Submitting sampling job to task scheduler') + subargs.insert(-1, "--qos=high") + logger.info("Submitting sampling job to task scheduler") subprocess.run(subargs, env=env, cwd=out_dir, check=True) - logger.info('Run squeue -u $USER to monitor the progress of your jobs') + logger.info("Run squeue -u $USER to monitor the progress of your jobs") # eagle.sh calls main() @@ -838,18 +922,18 @@ def main(): # only direct script argument is the project .yml file parser = argparse.ArgumentParser() - parser.add_argument('project_filename') + parser.add_argument("project_filename") args = parser.parse_args() # initialize the EagleBatch object batch = EagleBatch(args.project_filename) # other arguments/cues about which part of the process we are in are # encoded in slurm job environment variables - job_array_number = int(os.environ.get('SLURM_ARRAY_TASK_ID', 0)) - post_process = get_bool_env_var('POSTPROCESS') - upload_only = get_bool_env_var('UPLOADONLY') - measures_only = get_bool_env_var('MEASURESONLY') - sampling_only = get_bool_env_var('SAMPLINGONLY') + job_array_number = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0)) + post_process = get_bool_env_var("POSTPROCESS") + upload_only = get_bool_env_var("UPLOADONLY") + measures_only = get_bool_env_var("MEASURESONLY") + sampling_only = get_bool_env_var("SAMPLINGONLY") if job_array_number: # if job array number is non-zero, run the batch job # Simulation should not be scheduled for sampling only @@ -873,7 +957,7 @@ def main(): batch.run_batch(sampling_only) -if __name__ == '__main__': +if __name__ == "__main__": if get_bool_env_var("BUILDSTOCKBATCH_CLI"): user_cli() else: diff --git a/buildstockbatch/local.py b/buildstockbatch/local.py index eea9d32a..70016b6d 100644 --- a/buildstockbatch/local.py +++ b/buildstockbatch/local.py @@ -47,73 +47,91 @@ def __init__(self, project_filename): self._weather_dir = None # Create simulation_output dir - sim_out_ts_dir = os.path.join(self.results_dir, 'simulation_output', 'timeseries') + sim_out_ts_dir = os.path.join( + self.results_dir, "simulation_output", "timeseries" + ) os.makedirs(sim_out_ts_dir, exist_ok=True) - for i in range(0, len(self.cfg.get('upgrades', [])) + 1): - os.makedirs(os.path.join(sim_out_ts_dir, f'up{i:02d}'), exist_ok=True) + for i in range(0, len(self.cfg.get("upgrades", [])) + 1): + 
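main() dispatches purely on environment variables passed through sbatch --export and eagle.sh; note that MEASURESONLY and SAMPLINGONLY arrive as the strings "0" or "1" set in user_cli. get_bool_env_var itself is defined elsewhere in the package; a plausible sketch of its semantics, assuming the usual string-to-bool convention:

    import os

    def get_bool_env_var(varname):
        # Assumption: unset, "0", or "false" mean False; "1"/"true"/"yes" mean True.
        return os.environ.get(varname, "0").strip().lower() in ("1", "t", "true", "y", "yes")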
os.makedirs(os.path.join(sim_out_ts_dir, f"up{i:02d}"), exist_ok=True) # Install custom gems to a volume that will be used by all workers # FIXME: Get working without docker - if self.cfg.get('baseline', dict()).get('custom_gems', False): + if self.cfg.get("baseline", dict()).get("custom_gems", False): # TODO: Fix this stuff to work without docker - logger.info('Installing custom gems to docker volume: buildstockbatch_custom_gems') + logger.info( + "Installing custom gems to docker volume: buildstockbatch_custom_gems" + ) docker_client = docker.client.from_env() # Create a volume to store the custom gems - docker_client.volumes.create(name='buildstockbatch_custom_gems', driver='local') - simdata_vol = docker_client.volumes.create(name='buildstockbatch_simdata_temp', driver='local') + docker_client.volumes.create( + name="buildstockbatch_custom_gems", driver="local" + ) + simdata_vol = docker_client.volumes.create( + name="buildstockbatch_simdata_temp", driver="local" + ) # Define directories to be mounted in the container - mnt_gem_dir = '/var/oscli/gems' + mnt_gem_dir = "/var/oscli/gems" # Install custom gems to be used in the docker container - local_gemfile_path = os.path.join(self.buildstock_dir, 'resources', 'Gemfile') + local_gemfile_path = os.path.join( + self.buildstock_dir, "resources", "Gemfile" + ) mnt_gemfile_path_orig = "/var/oscli/gemfile/Gemfile" docker_volume_mounts = { - 'buildstockbatch_custom_gems': {'bind': mnt_gem_dir, 'mode': 'rw'}, - local_gemfile_path: {'bind': mnt_gemfile_path_orig, 'mode': 'ro'}, - simdata_vol.name: {'bind': '/var/simdata/openstudio', 'mode': 'rw'}, + "buildstockbatch_custom_gems": {"bind": mnt_gem_dir, "mode": "rw"}, + local_gemfile_path: {"bind": mnt_gemfile_path_orig, "mode": "ro"}, + simdata_vol.name: {"bind": "/var/simdata/openstudio", "mode": "rw"}, } # Check that the Gemfile exists if not os.path.exists(local_gemfile_path): - print(f'local_gemfile_path = {local_gemfile_path}') - raise AttributeError('baseline:custom_gems = True, but did not find Gemfile in /resources directory') + print(f"local_gemfile_path = {local_gemfile_path}") + raise AttributeError( + "baseline:custom_gems = True, but did not find Gemfile in /resources directory" + ) # Make the buildstock/resources/.custom_gems dir to store logs - local_log_dir = os.path.join(self.buildstock_dir, 'resources', '.custom_gems') + local_log_dir = os.path.join( + self.buildstock_dir, "resources", ".custom_gems" + ) if not os.path.exists(local_log_dir): os.makedirs(local_log_dir) # Run bundler to install the custom gems mnt_gemfile_path = f"{mnt_gem_dir}/Gemfile" bundle_install_cmd = f'/bin/bash -c "cp {mnt_gemfile_path_orig} {mnt_gemfile_path} && bundle install --path={mnt_gem_dir} --gemfile={mnt_gemfile_path}"' # noqa: E501 - logger.debug(f'Running {bundle_install_cmd}') + logger.debug(f"Running {bundle_install_cmd}") container_output = docker_client.containers.run( self.docker_image, bundle_install_cmd, remove=True, volumes=docker_volume_mounts, - name='install_custom_gems' + name="install_custom_gems", ) - with open(os.path.join(local_log_dir, 'bundle_install_output.log'), 'wb') as f_out: + with open( + os.path.join(local_log_dir, "bundle_install_output.log"), "wb" + ) as f_out: f_out.write(container_output) # Report out custom gems loaded by OpenStudio CLI - check_active_gems_cmd = f'openstudio --bundle {mnt_gemfile_path} --bundle_path {mnt_gem_dir} ' \ - '--bundle_without native_ext gem_list' + check_active_gems_cmd = ( + f"openstudio --bundle {mnt_gemfile_path} --bundle_path 
{mnt_gem_dir} " + "--bundle_without native_ext gem_list" + ) container_output = docker_client.containers.run( self.docker_image, check_active_gems_cmd, remove=True, volumes=docker_volume_mounts, - name='list_custom_gems' + name="list_custom_gems", ) - gem_list_log = os.path.join(local_log_dir, 'openstudio_gem_list_output.log') - with open(gem_list_log, 'wb') as f_out: + gem_list_log = os.path.join(local_log_dir, "openstudio_gem_list_output.log") + with open(gem_list_log, "wb") as f_out: f_out.write(container_output) simdata_vol.remove() - logger.debug(f'Review custom gems list at: {gem_list_log}') + logger.debug(f"Review custom gems list at: {gem_list_log}") @classmethod def validate_project(cls, project_file): @@ -124,44 +142,62 @@ def validate_project(cls, project_file): @property def weather_dir(self): if self._weather_dir is None: - self._weather_dir = os.path.join(self.buildstock_dir, 'weather') + self._weather_dir = os.path.join(self.buildstock_dir, "weather") self._get_weather_files() return self._weather_dir @classmethod - def run_building(cls, buildstock_dir, weather_dir, results_dir, measures_only, - n_datapoints, cfg, i, upgrade_idx=None): + def run_building( + cls, + buildstock_dir, + weather_dir, + results_dir, + measures_only, + n_datapoints, + cfg, + i, + upgrade_idx=None, + ): upgrade_id = 0 if upgrade_idx is None else upgrade_idx + 1 try: - sim_id, sim_dir = cls.make_sim_dir(i, upgrade_idx, os.path.join(results_dir, 'simulation_output')) + sim_id, sim_dir = cls.make_sim_dir( + i, upgrade_idx, os.path.join(results_dir, "simulation_output") + ) except SimulationExists: return sim_path = pathlib.Path(sim_dir) buildstock_path = pathlib.Path(buildstock_dir) # Make symlinks to project and buildstock stuff - (sim_path / 'measures').symlink_to(buildstock_path / 'measures', target_is_directory=True) - (sim_path / 'lib').symlink_to(buildstock_path / "lib", target_is_directory=True) - (sim_path / 'weather').symlink_to(weather_dir, target_is_directory=True) - hpxml_measures_path = buildstock_path / 'resources' / 'hpxml-measures' + (sim_path / "measures").symlink_to( + buildstock_path / "measures", target_is_directory=True + ) + (sim_path / "lib").symlink_to(buildstock_path / "lib", target_is_directory=True) + (sim_path / "weather").symlink_to(weather_dir, target_is_directory=True) + hpxml_measures_path = buildstock_path / "resources" / "hpxml-measures" if hpxml_measures_path.exists(): - resources_path = sim_path / 'resources' + resources_path = sim_path / "resources" resources_path.mkdir() - (resources_path / 'hpxml-measures').symlink_to(hpxml_measures_path, target_is_directory=True) + (resources_path / "hpxml-measures").symlink_to( + hpxml_measures_path, target_is_directory=True + ) else: resources_path = None - osw = cls.create_osw(cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx) + osw = cls.create_osw( + cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx + ) - with open(sim_path / 'in.osw', 'w') as f: + with open(sim_path / "in.osw", "w") as f: json.dump(osw, f, indent=4) run_cmd = [ cls.openstudio_exe(), - 'run', - '-w', 'in.osw', + "run", + "-w", + "in.osw", ] # FIXME: Custom gems @@ -179,19 +215,19 @@ def run_building(cls, buildstock_dir, weather_dir, results_dir, measures_only, # if cfg.get('baseline', dict()).get('custom_gems', False): # run_cmd.insert(8, '--measures_only') # else: - run_cmd.insert(2, '--measures_only') + run_cmd.insert(2, "--measures_only") env_vars = {} env_vars.update(os.environ) - env_vars['BUILDSTOCKBATCH_VERSION'] = 
bsb_version + env_vars["BUILDSTOCKBATCH_VERSION"] = bsb_version - max_time_min = cfg.get('max_minutes_per_sim') + max_time_min = cfg.get("max_minutes_per_sim") if max_time_min is not None: subprocess_kw = {"timeout": max_time_min * 60} else: subprocess_kw = {} start_time = dt.datetime.now() - with open(sim_path / 'openstudio_output.log', 'w') as f_out: + with open(sim_path / "openstudio_output.log", "w") as f_out: try: subprocess.run( run_cmd, @@ -200,25 +236,25 @@ def run_building(cls, buildstock_dir, weather_dir, results_dir, measures_only, stderr=subprocess.STDOUT, env=env_vars, cwd=sim_dir, - **subprocess_kw + **subprocess_kw, ) except subprocess.TimeoutExpired: end_time = dt.datetime.now() - msg = f'Terminated {sim_id} after reaching max time of {max_time_min} minutes' + msg = f"Terminated {sim_id} after reaching max time of {max_time_min} minutes" logger.warning(msg) f_out.write(msg) - with open(sim_path / 'out.osw', 'w') as out_osw: + with open(sim_path / "out.osw", "w") as out_osw: out_msg = { - 'started_at': start_time.strftime('%Y%m%dT%H%M%SZ'), - 'completed_at': end_time.strftime('%Y%m%dT%H%M%SZ'), - 'completed_status': 'Fail', - 'timeout': msg + "started_at": start_time.strftime("%Y%m%dT%H%M%SZ"), + "completed_at": end_time.strftime("%Y%m%dT%H%M%SZ"), + "completed_status": "Fail", + "timeout": msg, } out_osw.write(json.dumps(out_msg, indent=3)) - (sim_path / 'run').mkdir(exist_ok=True) - with open(sim_path / 'run' / 'run.log', 'a') as run_log: + (sim_path / "run").mkdir(exist_ok=True) + with open(sim_path / "run" / "run.log", "a") as run_log: run_log.write(f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}") - with open(sim_path / 'run' / 'failed.job', 'w') as failed_job: + with open(sim_path / "run" / "failed.job", "w") as failed_job: failed_job.write(f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}") time.sleep(20) # Wait for EnergyPlus to release file locks except subprocess.CalledProcessError: @@ -230,19 +266,21 @@ def run_building(cls, buildstock_dir, weather_dir, results_dir, measures_only, fs, f"{results_dir}/simulation_output/timeseries", upgrade_id, - i + i, ) # Clean up symlinks - for directory in ('measures', 'lib', 'weather'): + for directory in ("measures", "lib", "weather"): (sim_path / directory).unlink() if resources_path: - (resources_path / 'hpxml-measures').unlink() + (resources_path / "hpxml-measures").unlink() resources_path.rmdir() # Read data_point_out.json reporting_measures = cls.get_reporting_measures(cfg) - dpout = postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, i) + dpout = postprocessing.read_simulation_outputs( + fs, reporting_measures, sim_dir, upgrade_id, i + ) return dpout def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): @@ -255,12 +293,15 @@ def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): # FIXME: does this work for comstock? 
buildstock_path = pathlib.Path(self.buildstock_dir) project_path = pathlib.Path(self.project_dir) - lib_path = pathlib.Path(self.buildstock_dir, 'lib') + lib_path = pathlib.Path(self.buildstock_dir, "lib") if lib_path.exists(): shutil.rmtree(lib_path) lib_path.mkdir() shutil.copytree(buildstock_path / "resources", lib_path / "resources") - shutil.copytree(project_path / "housing_characteristics", lib_path / "housing_characteristics") + shutil.copytree( + project_path / "housing_characteristics", + lib_path / "housing_characteristics", + ) df = read_csv(buildstock_csv_filename, index_col=0, dtype=str) self.validate_buildstock_csv(self.project_filename, df) @@ -274,11 +315,13 @@ def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): self.results_dir, measures_only, n_datapoints, - self.cfg + self.cfg, ) upgrade_sims = [] - for i in range(len(self.cfg.get('upgrades', []))): - upgrade_sims.append(map(functools.partial(run_building_d, upgrade_idx=i), building_ids)) + for i in range(len(self.cfg.get("upgrades", []))): + upgrade_sims.append( + map(functools.partial(run_building_d, upgrade_idx=i), building_ids) + ) if not self.skip_baseline_sims: baseline_sims = map(run_building_d, building_ids) all_sims = itertools.chain(baseline_sims, *upgrade_sims) @@ -291,18 +334,18 @@ def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): time.sleep(10) shutil.rmtree(lib_path) - sim_out_path = pathlib.Path(self.results_dir, 'simulation_output') + sim_out_path = pathlib.Path(self.results_dir, "simulation_output") - results_job_json_filename = sim_out_path / 'results_job0.json.gz' - with gzip.open(results_job_json_filename, 'wt', encoding='utf-8') as f: + results_job_json_filename = sim_out_path / "results_job0.json.gz" + with gzip.open(results_job_json_filename, "wt", encoding="utf-8") as f: json.dump(dpouts, f) del dpouts - sim_out_tarfile_name = sim_out_path / 'simulations_job0.tar.gz' - logger.debug(f'Compressing simulation outputs to {sim_out_tarfile_name}') - with tarfile.open(sim_out_tarfile_name, 'w:gz') as tarf: + sim_out_tarfile_name = sim_out_path / "simulations_job0.tar.gz" + logger.debug(f"Compressing simulation outputs to {sim_out_tarfile_name}") + with tarfile.open(sim_out_tarfile_name, "w:gz") as tarf: for dirname in os.listdir(sim_out_path): - if re.match(r'up\d+', dirname) and (sim_out_path / dirname).is_dir(): + if re.match(r"up\d+", dirname) and (sim_out_path / dirname).is_dir(): tarf.add(sim_out_path / dirname, arcname=dirname) shutil.rmtree(sim_out_path / dirname) @@ -313,8 +356,7 @@ def output_dir(self): @property def results_dir(self): results_dir = self.cfg.get( - 'output_directory', - os.path.join(self.project_dir, 'localResults') + "output_directory", os.path.join(self.project_dir, "localResults") ) results_dir = self.path_rel_to_projectfile(results_dir) if not os.path.isdir(results_dir): @@ -322,70 +364,86 @@ def results_dir(self): return results_dir def get_dask_client(self): - cluster = LocalCluster(local_directory=os.path.join(self.results_dir, 'dask-tmp')) + cluster = LocalCluster( + local_directory=os.path.join(self.results_dir, "dask-tmp") + ) return Client(cluster) @log_error_details() def main(): - logging.config.dictConfig({ - 'version': 1, - 'disable_existing_loggers': True, - 'formatters': { - 'defaultfmt': { - 'format': '%(levelname)s:%(asctime)s:%(name)s:%(message)s', - 'datefmt': '%Y-%m-%d %H:%M:%S' - } - }, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', - 'formatter': 'defaultfmt', - 'level': 'DEBUG', - 
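One subtlety in run_batch: map() and itertools.chain() only build lazy iterators, so no simulation actually starts until the executor drains all_sims. A toy demonstration of the same partial/map/chain pattern:

    import functools, itertools

    def run(i, upgrade_idx=None):
        print(f"simulating bldg {i}, upgrade {upgrade_idx}")
        return i

    building_ids = [1, 2]
    baseline = map(run, building_ids)
    upgrades = [map(functools.partial(run, upgrade_idx=0), building_ids)]
    all_sims = itertools.chain(baseline, *upgrades)  # nothing has run yet
    results = list(all_sims)                         # the four simulations execute here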
'stream': 'ext://sys.stdout', - } - }, - 'loggers': { - '__main__': { - 'level': 'DEBUG', - 'propagate': True, - 'handlers': ['console'] + logging.config.dictConfig( + { + "version": 1, + "disable_existing_loggers": True, + "formatters": { + "defaultfmt": { + "format": "%(levelname)s:%(asctime)s:%(name)s:%(message)s", + "datefmt": "%Y-%m-%d %H:%M:%S", + } }, - 'buildstockbatch': { - 'level': 'DEBUG', - 'propagate': True, - 'handlers': ['console'] - } - }, - }) + "handlers": { + "console": { + "class": "logging.StreamHandler", + "formatter": "defaultfmt", + "level": "DEBUG", + "stream": "ext://sys.stdout", + } + }, + "loggers": { + "__main__": { + "level": "DEBUG", + "propagate": True, + "handlers": ["console"], + }, + "buildstockbatch": { + "level": "DEBUG", + "propagate": True, + "handlers": ["console"], + }, + }, + } + ) parser = argparse.ArgumentParser() print(BuildStockBatchBase.LOGO) - parser.add_argument('project_filename') + parser.add_argument("project_filename") parser.add_argument( - '-j', + "-j", type=int, - help='Number of parallel simulations. Default: all cores.', - default=None + help="Number of parallel simulations. Default: all cores.", + default=None, ) parser.add_argument( - '-m', '--measures_only', - action='store_true', - help='Only apply the measures, but don\'t run simulations. Useful for debugging.' + "-m", + "--measures_only", + action="store_true", + help="Only apply the measures, but don't run simulations. Useful for debugging.", ) group = parser.add_mutually_exclusive_group() - group.add_argument('--postprocessonly', - help='Only do postprocessing, useful for when the simulations are already done', - action='store_true') - group.add_argument('--uploadonly', - help='Only upload to S3, useful when postprocessing is already done. Ignores the ' - 'upload flag in yaml', action='store_true') - group.add_argument('--validateonly', help='Only validate the project YAML file and references. Nothing is executed', - action='store_true') - group.add_argument('--samplingonly', help='Run the sampling only.', - action='store_true') + group.add_argument( + "--postprocessonly", + help="Only do postprocessing, useful for when the simulations are already done", + action="store_true", + ) + group.add_argument( + "--uploadonly", + help="Only upload to S3, useful when postprocessing is already done. Ignores the " + "upload flag in yaml", + action="store_true", + ) + group.add_argument( + "--validateonly", + help="Only validate the project YAML file and references. 
Nothing is executed", + action="store_true", + ) + group.add_argument( + "--samplingonly", help="Run the sampling only.", action="store_true" + ) args = parser.parse_args() if not os.path.isfile(args.project_filename): - raise FileNotFoundError(f'The project file {args.project_filename} doesn\'t exist') + raise FileNotFoundError( + f"The project file {args.project_filename} doesn't exist" + ) # Validate the project, and in case of the --validateonly flag return True if validation passes LocalBatch.validate_project(args.project_filename) @@ -393,7 +451,11 @@ def main(): return True batch = LocalBatch(args.project_filename) if not (args.postprocessonly or args.uploadonly or args.validateonly): - batch.run_batch(n_jobs=args.j, measures_only=args.measures_only, sampling_only=args.samplingonly) + batch.run_batch( + n_jobs=args.j, + measures_only=args.measures_only, + sampling_only=args.samplingonly, + ) if args.measures_only or args.samplingonly: return if args.uploadonly: @@ -402,5 +464,5 @@ def main(): batch.process_results() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py index 09e0eda8..df8c2f27 100644 --- a/buildstockbatch/postprocessing.py +++ b/buildstockbatch/postprocessing.py @@ -40,88 +40,79 @@ def read_data_point_out_json(fs, reporting_measures, filename): try: - with fs.open(filename, 'r') as f: + with fs.open(filename, "r") as f: d = json.load(f) except (FileNotFoundError, json.JSONDecodeError): return None else: - sim_out_report = 'SimulationOutputReport' - if 'ReportSimulationOutput' in d: - sim_out_report = 'ReportSimulationOutput' + sim_out_report = "SimulationOutputReport" + if "ReportSimulationOutput" in d: + sim_out_report = "ReportSimulationOutput" if sim_out_report not in d: - d[sim_out_report] = {'applicable': False} + d[sim_out_report] = {"applicable": False} for reporting_measure in reporting_measures: if reporting_measure not in d: - d[reporting_measure] = {'applicable': False} + d[reporting_measure] = {"applicable": False} return d def to_camelcase(x): - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', x) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", x) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() def flatten_datapoint_json(reporting_measures, d): new_d = {} - cols_to_keep = { - 'ApplyUpgrade': [ - 'upgrade_name', - 'applicable' - ] - } + cols_to_keep = {"ApplyUpgrade": ["upgrade_name", "applicable"]} for k1, k2s in cols_to_keep.items(): for k2 in k2s: - new_d[f'{k1}.{k2}'] = d.get(k1, {}).get(k2) + new_d[f"{k1}.{k2}"] = d.get(k1, {}).get(k2) # copy over all the key and values from BuildExistingModel - col1 = 'BuildExistingModel' + col1 = "BuildExistingModel" for k, v in d.get(col1, {}).items(): - new_d[f'{col1}.{k}'] = v + new_d[f"{col1}.{k}"] = v # if there is no units_represented key, default to 1 # TODO @nmerket @rajeee is there a way to not apply this to Commercial jobs? 
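A naming caveat for readers of postprocessing.py: despite its name, to_camelcase converts measure-style CamelCase keys to snake_case, and it is what turns raw result column names into the published ones (via df.rename(columns=to_camelcase) later in this module). For example:

    >>> to_camelcase("BuildExistingModel.building_id")
    'build_existing_model.building_id'
    >>> to_camelcase("ReportSimulationOutput")
    'report_simulation_output'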
It doesn't hurt, but it is weird for us - units = int(new_d.get(f'{col1}.units_represented', 1)) - new_d[f'{col1}.units_represented'] = units - sim_out_report = 'SimulationOutputReport' - if 'ReportSimulationOutput' in d: - sim_out_report = 'ReportSimulationOutput' + units = int(new_d.get(f"{col1}.units_represented", 1)) + new_d[f"{col1}.units_represented"] = units + sim_out_report = "SimulationOutputReport" + if "ReportSimulationOutput" in d: + sim_out_report = "ReportSimulationOutput" col2 = sim_out_report for k, v in d.get(col2, {}).items(): - new_d[f'{col2}.{k}'] = v + new_d[f"{col2}.{k}"] = v # additional reporting measures - if sim_out_report == 'ReportSimulationOutput': - reporting_measures += ['ReportUtilityBills'] - reporting_measures += ['UpgradeCosts'] + if sim_out_report == "ReportSimulationOutput": + reporting_measures += ["ReportUtilityBills"] + reporting_measures += ["UpgradeCosts"] for col in reporting_measures: for k, v in d.get(col, {}).items(): - new_d[f'{col}.{k}'] = v + new_d[f"{col}.{k}"] = v - new_d['building_id'] = new_d['BuildExistingModel.building_id'] - del new_d['BuildExistingModel.building_id'] + new_d["building_id"] = new_d["BuildExistingModel.building_id"] + del new_d["BuildExistingModel.building_id"] return new_d def read_out_osw(fs, filename): try: - with fs.open(filename, 'r') as f: + with fs.open(filename, "r") as f: d = json.load(f) except (FileNotFoundError, json.JSONDecodeError): return None else: out_d = {} - keys_to_copy = [ - 'started_at', - 'completed_at', - 'completed_status' - ] + keys_to_copy = ["started_at", "completed_at", "completed_status"] for key in keys_to_copy: out_d[key] = d.get(key, None) - for step in d.get('steps', []): - if step['measure_dir_name'] == 'BuildExistingModel': - out_d['building_id'] = step['arguments']['building_id'] + for step in d.get("steps", []): + if step["measure_dir_name"] == "BuildExistingModel": + out_d["building_id"] = step["arguments"]["building_id"] return out_d @@ -142,71 +133,98 @@ def read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, buildin """ dpout = read_data_point_out_json( - fs, reporting_measures, f'{sim_dir}/run/data_point_out.json' + fs, reporting_measures, f"{sim_dir}/run/data_point_out.json" ) if dpout is None: dpout = {} else: dpout = flatten_datapoint_json(reporting_measures, dpout) - out_osw = read_out_osw(fs, f'{sim_dir}/out.osw') + out_osw = read_out_osw(fs, f"{sim_dir}/out.osw") if out_osw: dpout.update(out_osw) - dpout['upgrade'] = upgrade_id - dpout['building_id'] = building_id + dpout["upgrade"] = upgrade_id + dpout["building_id"] = building_id return dpout def write_dataframe_as_parquet(df, fs, filename, schema=None): tbl = pa.Table.from_pandas(df, schema=schema, preserve_index=False) - with fs.open(filename, 'wb') as f: + with fs.open(filename, "wb") as f: parquet.write_table(tbl, f) def clean_up_results_df(df, cfg, keep_upgrade_id=False): results_df = df.copy() cols_to_remove = ( - 'build_existing_model.weight', - 'simulation_output_report.weight', - 'build_existing_model.workflow_json', - 'simulation_output_report.upgrade_name' + "build_existing_model.weight", + "simulation_output_report.weight", + "build_existing_model.workflow_json", + "simulation_output_report.upgrade_name", ) for col in cols_to_remove: if col in results_df.columns: del results_df[col] - for col in ('started_at', 'completed_at'): + for col in ("started_at", "completed_at"): if col in results_df.columns: results_df[col] = results_df[col].map( - lambda x: dt.datetime.strptime(x, 
'%Y%m%dT%H%M%SZ') if isinstance(x, str) else x + lambda x: ( + dt.datetime.strptime(x, "%Y%m%dT%H%M%SZ") + if isinstance(x, str) + else x + ) ) - reference_scenarios = dict([(i, x.get('reference_scenario')) for i, x in enumerate(cfg.get('upgrades', []), 1)]) - results_df['apply_upgrade.reference_scenario'] = \ - results_df['upgrade'].map(reference_scenarios).fillna('').astype(str) + reference_scenarios = dict( + [ + (i, x.get("reference_scenario")) + for i, x in enumerate(cfg.get("upgrades", []), 1) + ] + ) + results_df["apply_upgrade.reference_scenario"] = ( + results_df["upgrade"].map(reference_scenarios).fillna("").astype(str) + ) # standardize the column orders first_few_cols = [ - 'building_id', - 'started_at', - 'completed_at', - 'completed_status', - 'apply_upgrade.applicable', - 'apply_upgrade.upgrade_name', - 'apply_upgrade.reference_scenario' + "building_id", + "started_at", + "completed_at", + "completed_status", + "apply_upgrade.applicable", + "apply_upgrade.upgrade_name", + "apply_upgrade.reference_scenario", ] if keep_upgrade_id: - first_few_cols.insert(1, 'upgrade') - if 'job_id' in results_df.columns: - first_few_cols.insert(2, 'job_id') - - build_existing_model_cols = sorted([col for col in results_df.columns if col.startswith('build_existing_model')]) - sim_output_report_cols = sorted([col for col in results_df.columns if col.startswith('simulation_output_report')]) - report_sim_output_cols = sorted([col for col in results_df.columns if col.startswith('report_simulation_output')]) - upgrade_costs_cols = sorted([col for col in results_df.columns if col.startswith('upgrade_costs')]) - sorted_cols = \ - first_few_cols + \ - build_existing_model_cols + \ - sim_output_report_cols + \ - report_sim_output_cols + \ - upgrade_costs_cols + first_few_cols.insert(1, "upgrade") + if "job_id" in results_df.columns: + first_few_cols.insert(2, "job_id") + + build_existing_model_cols = sorted( + [col for col in results_df.columns if col.startswith("build_existing_model")] + ) + sim_output_report_cols = sorted( + [ + col + for col in results_df.columns + if col.startswith("simulation_output_report") + ] + ) + report_sim_output_cols = sorted( + [ + col + for col in results_df.columns + if col.startswith("report_simulation_output") + ] + ) + upgrade_costs_cols = sorted( + [col for col in results_df.columns if col.startswith("upgrade_costs")] + ) + sorted_cols = ( + first_few_cols + + build_existing_model_cols + + sim_output_report_cols + + report_sim_output_cols + + upgrade_costs_cols + ) remaining_cols = sorted(set(results_df.columns.values).difference(sorted_cols)) sorted_cols += remaining_cols @@ -217,17 +235,17 @@ def clean_up_results_df(df, cfg, keep_upgrade_id=False): def get_cols(fs, filepath): - with fs.open(filepath, 'rb') as f: + with fs.open(filepath, "rb") as f: schema = parquet.read_schema(f) return set(schema.names) def read_results_json(fs, filename, all_cols=None): - with fs.open(filename, 'rb') as f1: - with gzip.open(f1, 'rt', encoding='utf-8') as f2: + with fs.open(filename, "rb") as f1: + with gzip.open(f1, "rt", encoding="utf-8") as f2: dpouts = json.load(f2) df = pd.DataFrame(dpouts) - df['job_id'] = int(re.search(r'results_job(\d+)\.json\.gz', filename).group(1)) + df["job_id"] = int(re.search(r"results_job(\d+)\.json\.gz", filename).group(1)) if all_cols is not None: for missing_col in set(all_cols).difference(df.columns.values): df[missing_col] = None @@ -238,7 +256,7 @@ def read_results_json(fs, filename, all_cols=None): def get_schema_dict(fs, filename): df = 
read_results_json(fs, filename) - df = df.replace('', np.nan) # required to make pa correctly infer the dtypes + df = df.replace("", np.nan) # required to make pa correctly infer the dtypes sch = pa.Schema.from_pandas(df) sch_dict = {name: type for name, type in zip(sch.names, sch.types)} return sch_dict @@ -255,17 +273,19 @@ def merge_schema_dicts(dict1, dict2): def read_enduse_timeseries_parquet(fs, all_cols, src_path, bldg_id): src_filename = f"{src_path}/bldg{bldg_id:07}.parquet" - with fs.open(src_filename, 'rb') as f: - df = pd.read_parquet(f, engine='pyarrow') - df['building_id'] = bldg_id + with fs.open(src_filename, "rb") as f: + df = pd.read_parquet(f, engine="pyarrow") + df["building_id"] = bldg_id for col in set(all_cols).difference(df.columns.values): df[col] = np.nan df = df[all_cols] - df.set_index('building_id', inplace=True) + df.set_index("building_id", inplace=True) return df -def concat_and_normalize(fs, all_cols, src_path, dst_path, partition_columns, indx, bldg_ids, partition_vals): +def concat_and_normalize( + fs, all_cols, src_path, dst_path, partition_columns, indx, bldg_ids, partition_vals +): dfs = [] for bldg_id in sorted(bldg_ids): df = read_enduse_timeseries_parquet(fs, all_cols, src_path, bldg_id) @@ -280,7 +300,7 @@ def concat_and_normalize(fs, all_cols, src_path, dst_path, partition_columns, in fs.makedirs(dst_filepath, exist_ok=True) dst_filename = f"{dst_filepath}/group{indx}.parquet" - with fs.open(dst_filename, 'wb') as f: + with fs.open(dst_filename, "wb") as f: df.to_parquet(f, index=True) return len(bldg_ids) @@ -327,24 +347,34 @@ def split_into_groups(total_size, max_group_size): def get_partitioned_bldg_groups(partition_df, partition_columns, files_per_partition): """ - Returns intelligent grouping of building_ids by partition columns. - 1. Group the building_ids by partition columns. For each group, say (CO, Jefferson), we have a list of building - ids. The total number of such groups is ngroups - 2. Concatenate those list to get bldg_id_list, which will have all the bldg_ids but ordered such that that - buildings belonging to the same group are close together. - 3. Split the list of building in each group in 1 to multiple subgroups so that total number of buildings - in each subgroup is less than or equal to files_per_partition. This will give the bldg_id_groups (list of - list) used to read the dataframe. The buildings within the inner list will be concatenated. - len(bldg_id_groups) is equal to number of such concatenation, and eventually, number of output parquet files. + Returns intelligent grouping of building_ids by partition columns. + 1. Group the building_ids by partition columns. For each group, say (CO, Jefferson), we have a list of building + ids. The total number of such groups is ngroups + 2. Concatenate those list to get bldg_id_list, which will have all the bldg_ids but ordered such that that + buildings belonging to the same group are close together. + 3. Split the list of building in each group in 1 to multiple subgroups so that total number of buildings + in each subgroup is less than or equal to files_per_partition. This will give the bldg_id_groups (list of + list) used to read the dataframe. The buildings within the inner list will be concatenated. + len(bldg_id_groups) is equal to number of such concatenation, and eventually, number of output parquet files. 
""" total_building = len(partition_df) if partition_columns: - bldg_id_list_df = partition_df.reset_index().groupby(partition_columns)['building_id'].apply(list) + bldg_id_list_df = ( + partition_df.reset_index() + .groupby(partition_columns)["building_id"] + .apply(list) + ) ngroups = len(bldg_id_list_df) bldg_id_list = bldg_id_list_df.sum() - nfiles_in_each_group = [nfiles for nfiles in bldg_id_list_df.map(lambda x: len(x))] - files_groups = [split_into_groups(n, files_per_partition) for n in nfiles_in_each_group] - flat_groups = [n for group in files_groups for n in group] # flatten list of list into a list (maintain order) + nfiles_in_each_group = [ + nfiles for nfiles in bldg_id_list_df.map(lambda x: len(x)) + ] + files_groups = [ + split_into_groups(n, files_per_partition) for n in nfiles_in_each_group + ] + flat_groups = [ + n for group in files_groups for n in group + ] # flatten list of list into a list (maintain order) else: # no partitioning by a column. Just put buildings into groups of files_per_partition ngroups = 1 @@ -363,8 +393,8 @@ def get_partitioned_bldg_groups(partition_df, partition_columns, files_per_parti def get_upgrade_list(cfg): - upgrade_start = 1 if cfg['baseline'].get('skip_sims', False) else 0 - upgrade_end = len(cfg.get('upgrades', [])) + 1 + upgrade_start = 1 if cfg["baseline"].get("skip_sims", False) else 0 + upgrade_end = len(cfg.get("upgrades", [])) + 1 return list(range(upgrade_start, upgrade_end)) @@ -375,7 +405,7 @@ def write_metadata_files(fs, parquet_root_dir, partition_columns): logger.info(f"Written _common_metadata to {parquet_root_dir}") if partition_columns: - partition_glob = "/".join([f'{c}*' for c in partition_columns]) + partition_glob = "/".join([f"{c}*" for c in partition_columns]) glob_str = f"{parquet_root_dir}/up*/{partition_glob}/*.parquet" else: glob_str = f"{parquet_root_dir}/up*/*.parquet" @@ -384,7 +414,9 @@ def write_metadata_files(fs, parquet_root_dir, partition_columns): concat_files = fs.glob(glob_str) logger.info(f"Gathered {len(concat_files)} files. 
Now writing _metadata") parquet_root_dir = Path(parquet_root_dir).as_posix() - create_metadata_file(concat_files, root_dir=parquet_root_dir, engine='pyarrow', fs=fs) + create_metadata_file( + concat_files, root_dir=parquet_root_dir, engine="pyarrow", fs=fs + ) logger.info(f"_metadata file written to {parquet_root_dir}") @@ -400,11 +432,11 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): :param do_timeseries: process timeseries results, defaults to True :type do_timeseries: bool, optional """ - sim_output_dir = f'{results_dir}/simulation_output' - ts_in_dir = f'{sim_output_dir}/timeseries' - results_csvs_dir = f'{results_dir}/results_csvs' - parquet_dir = f'{results_dir}/parquet' - ts_dir = f'{results_dir}/parquet/timeseries' + sim_output_dir = f"{results_dir}/simulation_output" + ts_in_dir = f"{sim_output_dir}/timeseries" + results_csvs_dir = f"{results_dir}/results_csvs" + parquet_dir = f"{results_dir}/parquet" + ts_dir = f"{results_dir}/parquet/timeseries" dirs = [results_csvs_dir, parquet_dir] if do_timeseries: dirs.append(ts_dir) @@ -414,42 +446,61 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): fs.makedirs(dr) # Results "CSV" - results_json_files = fs.glob(f'{sim_output_dir}/results_job*.json.gz') + results_json_files = fs.glob(f"{sim_output_dir}/results_job*.json.gz") if not results_json_files: raise ValueError("No simulation results found to post-process.") - logger.info("Collecting all the columns and datatypes in results_job*.json.gz parquet files.") - all_schema_dict = db.from_sequence(results_json_files).map(partial(get_schema_dict, fs)).\ - fold(lambda x, y: merge_schema_dicts(x, y)).compute() + logger.info( + "Collecting all the columns and datatypes in results_job*.json.gz parquet files." + ) + all_schema_dict = ( + db.from_sequence(results_json_files) + .map(partial(get_schema_dict, fs)) + .fold(lambda x, y: merge_schema_dicts(x, y)) + .compute() + ) logger.info(f"Got {len(all_schema_dict)} columns") all_results_cols = list(all_schema_dict.keys()) - all_schema_dict = {to_camelcase(key): value for key, value in all_schema_dict.items()} + all_schema_dict = { + to_camelcase(key): value for key, value in all_schema_dict.items() + } logger.info(f"Got this schema: {all_schema_dict}\n") - delayed_results_dfs = [dask.delayed(partial(read_results_json, fs, all_cols=all_results_cols))(x) - for x in results_json_files] - results_df = dd.from_delayed(delayed_results_dfs, verify_meta=False) + delayed_results_dfs = [ + dask.delayed(partial(read_results_json, fs, all_cols=all_results_cols))(x) + for x in results_json_files + ] + results_df = dd.from_delayed(delayed_results_dfs, verify_meta=False) if do_timeseries: # Look at all the parquet files to see what columns are in all of them. logger.info("Collecting all the columns in timeseries parquet files.") do_timeseries = False all_ts_cols = set() - for upgrade_folder in fs.glob(f'{ts_in_dir}/up*'): + for upgrade_folder in fs.glob(f"{ts_in_dir}/up*"): ts_filenames = fs.ls(upgrade_folder) if ts_filenames: do_timeseries = True - logger.info(f"Found {len(ts_filenames)} files for upgrade {Path(upgrade_folder).name}.") + logger.info( + f"Found {len(ts_filenames)} files for upgrade {Path(upgrade_folder).name}." 
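Both dask.bag folds used in combine_results (merging per-file schema dicts, and unioning timeseries column sets) follow the same map-then-parallel-reduce shape. The pattern in isolation, with toy data in place of the schemas:

    import dask.bag as db

    nums = db.from_sequence([1, 2, 3, 4])
    total = nums.map(lambda x: x * x).fold(lambda a, b: a + b).compute()  # -> 30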
+ ) files_bag = db.from_sequence(ts_filenames, partition_size=100) - all_ts_cols |= files_bag.map(partial(get_cols, fs)).\ - fold(lambda x, y: x.union(y)).compute() + all_ts_cols |= ( + files_bag.map(partial(get_cols, fs)) + .fold(lambda x, y: x.union(y)) + .compute() + ) logger.info("Collected all the columns") else: - logger.info(f"There are no timeseries files for upgrade {Path(upgrade_folder).name}.") + logger.info( + f"There are no timeseries files for upgrade {Path(upgrade_folder).name}." + ) # Sort the columns - all_ts_cols_sorted = ['building_id'] + sorted(x for x in all_ts_cols if x.startswith('time')) + all_ts_cols_sorted = ["building_id"] + sorted( + x for x in all_ts_cols if x.startswith("time") + ) all_ts_cols.difference_update(all_ts_cols_sorted) - all_ts_cols_sorted.extend(sorted(x for x in all_ts_cols if not x.endswith(']'))) + all_ts_cols_sorted.extend(sorted(x for x in all_ts_cols if not x.endswith("]"))) all_ts_cols.difference_update(all_ts_cols_sorted) all_ts_cols_sorted.extend(sorted(all_ts_cols)) logger.info(f"Got {len(all_ts_cols_sorted)} columns in total") @@ -457,14 +508,16 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): else: logger.warning("There are no timeseries files for any upgrades.") - results_df_groups = results_df.groupby('upgrade') + results_df_groups = results_df.groupby("upgrade") upgrade_list = get_upgrade_list(cfg) - partition_columns = cfg.get('postprocessing', {}).get('partition_columns', []) + partition_columns = cfg.get("postprocessing", {}).get("partition_columns", []) partition_columns = [c.lower() for c in partition_columns] - df_partition_columns = [f'build_existing_model.{c}' for c in partition_columns] + df_partition_columns = [f"build_existing_model.{c}" for c in partition_columns] missing_cols = set(df_partition_columns) - set(all_schema_dict.keys()) if missing_cols: - raise ValueError(f"The following partitioning columns are not found in results.json: {missing_cols}") + raise ValueError( + f"The following partitioning columns are not found in results.json: {missing_cols}" + ) if partition_columns: logger.info(f"The timeseries files will be partitioned by {partition_columns}.") @@ -475,17 +528,21 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): logger.info(f"Obtained results_df for {upgrade_id} with {len(df)} datapoints. ") df.rename(columns=to_camelcase, inplace=True) df = clean_up_results_df(df, cfg, keep_upgrade_id=True) - del df['upgrade'] - df.set_index('building_id', inplace=True) + del df["upgrade"] + df.set_index("building_id", inplace=True) df.sort_index(inplace=True) schema = None partition_df = df[df_partition_columns].copy() - partition_df.rename(columns={df_c: c for df_c, c in zip(df_partition_columns, partition_columns)}, - inplace=True) + partition_df.rename( + columns={ + df_c: c for df_c, c in zip(df_partition_columns, partition_columns) + }, + inplace=True, + ) if upgrade_id > 0: # Remove building characteristics for upgrade scenarios. 
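combine_results orders the timeseries columns in four passes: building_id first, then time* columns, then plain columns, then unit-suffixed columns whose names end in ']'. A small worked example of the resulting order:

    all_ts_cols = {"energy_use [kWh]", "time", "building_id", "status"}
    # -> all_ts_cols_sorted == ["building_id", "time", "status", "energy_use [kWh]"]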
cols_to_keep = list( - filter(lambda x: not x.startswith('build_existing_model.'), df.columns) + filter(lambda x: not x.startswith("build_existing_model."), df.columns) ) df = df[cols_to_keep] null_cols = get_null_cols(df) @@ -495,15 +552,19 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): logger.info(f"Upgrade {upgrade_id} has null cols: {null_cols}") schema, unresolved = correct_schema(all_schema_dict, df) if unresolved: - logger.info(f"The types for {unresolved} columns couldn't be determined.") + logger.info( + f"The types for {unresolved} columns couldn't be determined." + ) else: - logger.info("All columns were successfully assigned a datatype based on other upgrades.") + logger.info( + "All columns were successfully assigned a datatype based on other upgrades." + ) # Write CSV csv_filename = f"{results_csvs_dir}/results_up{upgrade_id:02d}.csv.gz" - logger.info(f'Writing {csv_filename}') - with fs.open(csv_filename, 'wb') as f: - with gzip.open(f, 'wt', encoding='utf-8') as gf: - df.to_csv(gf, index=True, lineterminator='\n') + logger.info(f"Writing {csv_filename}") + with fs.open(csv_filename, "wb") as f: + with gzip.open(f, "wt", encoding="utf-8") as gf: + df.to_csv(gf, index=True, lineterminator="\n") # Write Parquet if upgrade_id == 0: @@ -513,43 +574,64 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): fs.makedirs(results_parquet_dir) parquet_filename = f"{results_parquet_dir}/results_up{upgrade_id:02d}.parquet" - logger.info(f'Writing {parquet_filename}') + logger.info(f"Writing {parquet_filename}") write_dataframe_as_parquet( - df.reset_index(), - fs, - parquet_filename, - schema=schema + df.reset_index(), fs, parquet_filename, schema=schema ) if do_timeseries: # Get the names of the timeseries file for each simulation in this upgrade - ts_upgrade_path = f'{ts_in_dir}/up{upgrade_id:02d}' - ts_filenames = [ts_upgrade_path + ts_filename for ts_filename in fs.ls(ts_upgrade_path)] - ts_bldg_ids = [int(re.search(r'bldg(\d+).parquet', flname).group(1)) for flname in ts_filenames] + ts_upgrade_path = f"{ts_in_dir}/up{upgrade_id:02d}" + ts_filenames = [ + ts_upgrade_path + ts_filename for ts_filename in fs.ls(ts_upgrade_path) + ] + ts_bldg_ids = [ + int(re.search(r"bldg(\d+).parquet", flname).group(1)) + for flname in ts_filenames + ] if not ts_filenames: - logger.warning(f"There are no timeseries files for upgrade{upgrade_id}.") + logger.warning( + f"There are no timeseries files for upgrade{upgrade_id}." + ) continue - logger.info(f"There are {len(ts_filenames)} timeseries files for upgrade{upgrade_id}.") + logger.info( + f"There are {len(ts_filenames)} timeseries files for upgrade{upgrade_id}." 
+ ) # Calculate the mean and estimate the total memory usage - read_ts_parquet = partial(read_enduse_timeseries_parquet, fs, all_ts_cols_sorted, ts_upgrade_path) - get_ts_mem_usage_d = dask.delayed(lambda x: read_ts_parquet(x).memory_usage(deep=True).sum()) + read_ts_parquet = partial( + read_enduse_timeseries_parquet, fs, all_ts_cols_sorted, ts_upgrade_path + ) + get_ts_mem_usage_d = dask.delayed( + lambda x: read_ts_parquet(x).memory_usage(deep=True).sum() + ) sample_size = min(len(ts_bldg_ids), 36 * 3) - mean_mem = np.mean(dask.compute(map(get_ts_mem_usage_d, random.sample(ts_bldg_ids, sample_size)))[0]) + mean_mem = np.mean( + dask.compute( + map(get_ts_mem_usage_d, random.sample(ts_bldg_ids, sample_size)) + )[0] + ) # Determine how many files should be in each partition and group the files - parquet_memory = int(cfg.get('eagle', {}).get('postprocessing', {} - ).get('parquet_memory_mb', MAX_PARQUET_MEMORY)) + parquet_memory = int( + cfg.get("eagle", {}) + .get("postprocessing", {}) + .get("parquet_memory_mb", MAX_PARQUET_MEMORY) + ) logger.info(f"Max parquet memory: {parquet_memory} MB") - max_files_per_partition = max(1, math.floor(parquet_memory / (mean_mem / 1e6))) + max_files_per_partition = max( + 1, math.floor(parquet_memory / (mean_mem / 1e6)) + ) partition_df = partition_df.loc[ts_bldg_ids].copy() logger.info(f"partition_df for the upgrade has {len(partition_df)} rows.") - bldg_id_groups, bldg_id_list, ngroup = get_partitioned_bldg_groups(partition_df, - partition_columns, - max_files_per_partition) - logger.info(f"Processing {len(bldg_id_list)} building timeseries by combining max of " - f"{max_files_per_partition} parquets together. This will create {len(bldg_id_groups)} parquet " - f"partitions which go into {ngroup} column group(s) of {partition_columns}") + bldg_id_groups, bldg_id_list, ngroup = get_partitioned_bldg_groups( + partition_df, partition_columns, max_files_per_partition + ) + logger.info( + f"Processing {len(bldg_id_list)} building timeseries by combining max of " + f"{max_files_per_partition} parquets together. This will create {len(bldg_id_groups)} parquet " + f"partitions which go into {ngroup} column group(s) of {partition_columns}" + ) if isinstance(fs, LocalFileSystem): ts_out_loc = f"{ts_dir}/upgrade={upgrade_id}/" @@ -558,22 +640,49 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ts_out_loc = f"s3://{ts_dir}/upgrade={upgrade_id}" fs.makedirs(ts_out_loc) - logger.info(f'Created directory {ts_out_loc} for writing. Now concatenating ...') + logger.info( + f"Created directory {ts_out_loc} for writing. Now concatenating ..." 
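The files-per-partition arithmetic above is worth making concrete. With illustrative numbers (both values are assumptions for the example, not defaults):

    import math

    mean_mem = 50e6        # bytes; say sampled timeseries dataframes average ~50 MB each
    parquet_memory = 1000  # MB, from eagle:postprocessing:parquet_memory_mb
    max_files_per_partition = max(1, math.floor(parquet_memory / (mean_mem / 1e6)))  # -> 20

Each output parquet file would then concatenate at most 20 building files, targeting roughly 1 GB of in-memory data per partition.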
+ ) - src_path = f'{ts_in_dir}/up{upgrade_id:02d}' - concat_partial = dask.delayed(partial(concat_and_normalize, - fs, all_ts_cols_sorted, src_path, ts_out_loc, partition_columns)) - partition_vals_list = [list(partition_df.loc[bldg_id_list[0]].values) if partition_columns else [] - for bldg_id_list in bldg_id_groups] + src_path = f"{ts_in_dir}/up{upgrade_id:02d}" + concat_partial = dask.delayed( + partial( + concat_and_normalize, + fs, + all_ts_cols_sorted, + src_path, + ts_out_loc, + partition_columns, + ) + ) + partition_vals_list = [ + ( + list(partition_df.loc[bldg_id_list[0]].values) + if partition_columns + else [] + ) + for bldg_id_list in bldg_id_groups + ] with tempfile.TemporaryDirectory() as tmpdir: - tmpfilepath = Path(tmpdir, 'dask-report.html') + tmpfilepath = Path(tmpdir, "dask-report.html") with performance_report(filename=str(tmpfilepath)): - dask.compute(map(concat_partial, *zip(*enumerate(bldg_id_groups)), partition_vals_list)) + dask.compute( + map( + concat_partial, + *zip(*enumerate(bldg_id_groups)), + partition_vals_list, + ) + ) if tmpfilepath.exists(): - fs.put_file(str(tmpfilepath), f'{results_dir}/dask_combine_report{upgrade_id}.html') + fs.put_file( + str(tmpfilepath), + f"{results_dir}/dask_combine_report{upgrade_id}.html", + ) - logger.info(f"Finished combining and saving timeseries for upgrade{upgrade_id}.") + logger.info( + f"Finished combining and saving timeseries for upgrade{upgrade_id}." + ) logger.info("All aggregation completed. ") if do_timeseries: logger.info("Writing timeseries metadata files") @@ -582,13 +691,13 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): def remove_intermediate_files(fs, results_dir, keep_individual_timeseries=False): # Remove aggregated files to save space - sim_output_dir = f'{results_dir}/simulation_output' - results_job_json_glob = f'{sim_output_dir}/results_job*.json.gz' - logger.info('Removing results_job*.json.gz') + sim_output_dir = f"{results_dir}/simulation_output" + results_job_json_glob = f"{sim_output_dir}/results_job*.json.gz" + logger.info("Removing results_job*.json.gz") for filename in fs.glob(results_job_json_glob): fs.rm(filename) if not keep_individual_timeseries: - ts_in_dir = f'{sim_output_dir}/timeseries' + ts_in_dir = f"{sim_output_dir}/timeseries" fs.rm(ts_in_dir, recursive=True) @@ -596,35 +705,43 @@ def upload_results(aws_conf, output_dir, results_dir, buildstock_csv_filename): logger.info("Uploading the parquet files to s3") output_folder_name = Path(output_dir).name - parquet_dir = Path(results_dir).joinpath('parquet') - ts_dir = parquet_dir / 'timeseries' + parquet_dir = Path(results_dir).joinpath("parquet") + ts_dir = parquet_dir / "timeseries" if not parquet_dir.is_dir(): - logger.error(f"{parquet_dir} does not exist. Please make sure postprocessing has been done.") + logger.error( + f"{parquet_dir} does not exist. Please make sure postprocessing has been done." 
+ ) raise FileNotFoundError(parquet_dir) all_files = [] - for file in parquet_dir.rglob('*.parquet'): + for file in parquet_dir.rglob("*.parquet"): all_files.append(file.relative_to(parquet_dir)) - for file in [*ts_dir.glob('_common_metadata'), *ts_dir.glob('_metadata')]: + for file in [*ts_dir.glob("_common_metadata"), *ts_dir.glob("_metadata")]: all_files.append(file.relative_to(parquet_dir)) - s3_prefix = aws_conf.get('s3', {}).get('prefix', '').rstrip('/') - s3_bucket = aws_conf.get('s3', {}).get('bucket', None) + s3_prefix = aws_conf.get("s3", {}).get("prefix", "").rstrip("/") + s3_bucket = aws_conf.get("s3", {}).get("bucket", None) if not (s3_prefix and s3_bucket): - logger.error("YAML file missing postprocessing:aws:s3:prefix and/or bucket entry.") + logger.error( + "YAML file missing postprocessing:aws:s3:prefix and/or bucket entry." + ) return - s3_prefix_output = s3_prefix + '/' + output_folder_name + '/' + s3_prefix_output = s3_prefix + "/" + output_folder_name + "/" - s3 = boto3.resource('s3') + s3 = boto3.resource("s3") bucket = s3.Bucket(s3_bucket) n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix_output))) if n_existing_files > 0: - logger.error(f"There are already {n_existing_files} files in the s3 folder {s3_bucket}/{s3_prefix_output}.") + logger.error( + f"There are already {n_existing_files} files in the s3 folder {s3_bucket}/{s3_prefix_output}." + ) raise FileExistsError(f"s3://{s3_bucket}/{s3_prefix_output}") def upload_file(filepath, s3key=None): - full_path = filepath if filepath.is_absolute() else parquet_dir.joinpath(filepath) - s3 = boto3.resource('s3') + full_path = ( + filepath if filepath.is_absolute() else parquet_dir.joinpath(filepath) + ) + s3 = boto3.resource("s3") bucket = s3.Bucket(s3_bucket) if s3key is None: s3key = Path(s3_prefix_output).joinpath(filepath).as_posix() @@ -634,95 +751,120 @@ def upload_file(filepath, s3key=None): if buildstock_csv_filename is not None: buildstock_csv_filepath = Path(buildstock_csv_filename) if buildstock_csv_filepath.exists(): - tasks.append(dask.delayed(upload_file)( - buildstock_csv_filepath, - f"{s3_prefix_output}buildstock_csv/{buildstock_csv_filepath.name}" - )) + tasks.append( + dask.delayed(upload_file)( + buildstock_csv_filepath, + f"{s3_prefix_output}buildstock_csv/{buildstock_csv_filepath.name}", + ) + ) else: logger.warning(f"{buildstock_csv_filename} doesn't exist, can't upload.") dask.compute(tasks) - logger.info(f"Upload to S3 completed. The files are uploaded to: {s3_bucket}/{s3_prefix_output}") + logger.info( + f"Upload to S3 completed. 
The files are uploaded to: {s3_bucket}/{s3_prefix_output}" + ) return s3_bucket, s3_prefix_output def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix): logger.info("Creating Athena tables using glue crawler") - region_name = aws_conf.get('region_name', 'us-west-2') - db_name = aws_conf.get('athena', {}).get('database_name', None) - role = aws_conf.get('athena', {}).get('glue_service_role', 'service-role/AWSGlueServiceRole-default') - max_crawling_time = aws_conf.get('athena', {}).get('max_crawling_time', 600) + region_name = aws_conf.get("region_name", "us-west-2") + db_name = aws_conf.get("athena", {}).get("database_name", None) + role = aws_conf.get("athena", {}).get( + "glue_service_role", "service-role/AWSGlueServiceRole-default" + ) + max_crawling_time = aws_conf.get("athena", {}).get("max_crawling_time", 600) assert db_name, "athena:database_name not supplied" # Check that there are files in the s3 bucket before creating and running glue crawler - s3 = boto3.resource('s3') + s3 = boto3.resource("s3") bucket = s3.Bucket(s3_bucket) - s3_path = f's3://{s3_bucket}/{s3_prefix}' + s3_path = f"s3://{s3_bucket}/{s3_prefix}" n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix))) if n_existing_files == 0: - logger.warning(f"There are no files in {s3_path}, Athena tables will not be created as intended") + logger.warning( + f"There are no files in {s3_path}, Athena tables will not be created as intended" + ) return - glueClient = boto3.client('glue', region_name=region_name) + glueClient = boto3.client("glue", region_name=region_name) crawlTarget = { - 'S3Targets': [{ - 'Path': s3_path, - 'Exclusions': ['**_metadata', '**_common_metadata'] - }] + "S3Targets": [ + {"Path": s3_path, "Exclusions": ["**_metadata", "**_common_metadata"]} + ] } - crawler_name = db_name + '_' + tbl_prefix - tbl_prefix = tbl_prefix + '_' + crawler_name = db_name + "_" + tbl_prefix + tbl_prefix = tbl_prefix + "_" def create_crawler(): - glueClient.create_crawler(Name=crawler_name, - Role=role, - Targets=crawlTarget, - DatabaseName=db_name, - TablePrefix=tbl_prefix) + glueClient.create_crawler( + Name=crawler_name, + Role=role, + Targets=crawlTarget, + DatabaseName=db_name, + TablePrefix=tbl_prefix, + ) try: create_crawler() except glueClient.exceptions.AlreadyExistsException: logger.info(f"Deleting existing crawler: {crawler_name}. And creating new one.") glueClient.delete_crawler(Name=crawler_name) - time.sleep(1) # A small delay after deleting is required to prevent AlreadyExistsException again + time.sleep( + 1 + ) # A small delay after deleting is required to prevent AlreadyExistsException again create_crawler() try: - existing_tables = [x['Name'] for x in glueClient.get_tables(DatabaseName=db_name)['TableList']] + existing_tables = [ + x["Name"] for x in glueClient.get_tables(DatabaseName=db_name)["TableList"] + ] except glueClient.exceptions.EntityNotFoundException: existing_tables = [] to_be_deleted_tables = [x for x in existing_tables if x.startswith(tbl_prefix)] if to_be_deleted_tables: - logger.info(f"Deleting existing tables in db {db_name}: {to_be_deleted_tables}. And creating new ones.") - glueClient.batch_delete_table(DatabaseName=db_name, TablesToDelete=to_be_deleted_tables) + logger.info( + f"Deleting existing tables in db {db_name}: {to_be_deleted_tables}. And creating new ones." 
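The polling loop that follows exists because boto3's Glue client does not, to our knowledge, ship a waiter for crawlers, so completion has to be checked by hand. Stripped to its core, the poll is:

    import time

    # glueClient and crawler_name as defined above
    while glueClient.get_crawler(Name=crawler_name)["Crawler"]["State"] == "RUNNING":
        time.sleep(30)

The full loop layers timeouts on top: abort the crawler after max_crawling_time, warn that it could not be stopped after 2x, and give up entirely after 3x.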
+ ) + glueClient.batch_delete_table( + DatabaseName=db_name, TablesToDelete=to_be_deleted_tables + ) glueClient.start_crawler(Name=crawler_name) logger.info("Crawler started") is_crawler_running = True t = time.time() while time.time() - t < (3 * max_crawling_time): - crawler_state = glueClient.get_crawler(Name=crawler_name)['Crawler']['State'] - metrics = glueClient.get_crawler_metrics(CrawlerNameList=[crawler_name])['CrawlerMetricsList'][0] - if is_crawler_running and crawler_state != 'RUNNING': + crawler_state = glueClient.get_crawler(Name=crawler_name)["Crawler"]["State"] + metrics = glueClient.get_crawler_metrics(CrawlerNameList=[crawler_name])[ + "CrawlerMetricsList" + ][0] + if is_crawler_running and crawler_state != "RUNNING": is_crawler_running = False logger.info(f"Crawler has completed running. It is {crawler_state}.") - logger.info(f"TablesCreated: {metrics['TablesCreated']} " - f"TablesUpdated: {metrics['TablesUpdated']} " - f"TablesDeleted: {metrics['TablesDeleted']} ") - if crawler_state == 'READY': + logger.info( + f"TablesCreated: {metrics['TablesCreated']} " + f"TablesUpdated: {metrics['TablesUpdated']} " + f"TablesDeleted: {metrics['TablesDeleted']} " + ) + if crawler_state == "READY": logger.info("Crawler stopped. Deleting it now.") glueClient.delete_crawler(Name=crawler_name) break elif time.time() - t > max_crawling_time: logger.info("Crawler is taking too long. Aborting ...") - logger.info(f"TablesCreated: {metrics['TablesCreated']} " - f"TablesUpdated: {metrics['TablesUpdated']} " - f"TablesDeleted: {metrics['TablesDeleted']} ") + logger.info( + f"TablesCreated: {metrics['TablesCreated']} " + f"TablesUpdated: {metrics['TablesUpdated']} " + f"TablesDeleted: {metrics['TablesDeleted']} " + ) glueClient.stop_crawler(Name=crawler_name) elif time.time() - t > 2 * max_crawling_time: - logger.warning(f"Crawler could not be stopped and deleted. Please delete the crawler {crawler_name} " - f"manually from the AWS console") + logger.warning( + f"Crawler could not be stopped and deleted. Please delete the crawler {crawler_name} " + f"manually from the AWS console" + ) break time.sleep(30) diff --git a/buildstockbatch/sampler/__init__.py b/buildstockbatch/sampler/__init__.py index 1cb55992..f821ad37 100644 --- a/buildstockbatch/sampler/__init__.py +++ b/buildstockbatch/sampler/__init__.py @@ -1,5 +1,8 @@ # -*- coding: utf-8 -*- -from .residential_quota import ResidentialQuotaSampler, ResidentialQuotaDownselectSampler # noqa F041 +from .residential_quota import ( + ResidentialQuotaSampler, + ResidentialQuotaDownselectSampler, +) # noqa F041 from .commercial_sobol import CommercialSobolSampler # noqa F041 from .precomputed import PrecomputedSampler # noqa F041 diff --git a/buildstockbatch/sampler/base.py b/buildstockbatch/sampler/base.py index 8ae55bed..7d460246 100644 --- a/buildstockbatch/sampler/base.py +++ b/buildstockbatch/sampler/base.py @@ -47,11 +47,20 @@ def __init__(self, parent): :param parent: The BuildStockBatchBase object that owns this sampler. """ - self.parent = weakref.ref(parent) # This removes circular references and allows garbage collection to work. - if self.container_runtime in (ContainerRuntime.DOCKER, ContainerRuntime.LOCAL_OPENSTUDIO): - self.csv_path = os.path.join(self.project_dir, 'housing_characteristics', 'buildstock.csv') + self.parent = weakref.ref( + parent + ) # This removes circular references and allows garbage collection to work. 
+ if self.container_runtime in ( + ContainerRuntime.DOCKER, + ContainerRuntime.LOCAL_OPENSTUDIO, + ): + self.csv_path = os.path.join( + self.project_dir, "housing_characteristics", "buildstock.csv" + ) elif self.container_runtime == ContainerRuntime.SINGULARITY: - self.csv_path = os.path.join(self.parent().output_dir, 'housing_characteristics', 'buildstock.csv') + self.csv_path = os.path.join( + self.parent().output_dir, "housing_characteristics", "buildstock.csv" + ) else: self.csv_path = None diff --git a/buildstockbatch/sampler/commercial_sobol.py b/buildstockbatch/sampler/commercial_sobol.py index 7e1ac4b8..2cdfbb98 100644 --- a/buildstockbatch/sampler/commercial_sobol.py +++ b/buildstockbatch/sampler/commercial_sobol.py @@ -41,26 +41,32 @@ def __init__(self, parent, n_datapoints): super().__init__(parent) self.validate_args(self.parent().project_filename, n_datapoints=n_datapoints) if self.container_runtime == ContainerRuntime.SINGULARITY: - self.csv_path = os.path.join(self.output_dir, 'buildstock.csv') + self.csv_path = os.path.join(self.output_dir, "buildstock.csv") else: - assert self.container_runtime in (ContainerRuntime.DOCKER, ContainerRuntime.LOCAL_OPENSTUDIO) - self.csv_path = os.path.join(self.project_dir, 'buildstock.csv') + assert self.container_runtime in ( + ContainerRuntime.DOCKER, + ContainerRuntime.LOCAL_OPENSTUDIO, + ) + self.csv_path = os.path.join(self.project_dir, "buildstock.csv") self.n_datapoints = n_datapoints @classmethod def validate_args(cls, project_filename, **kw): - expected_args = set(['n_datapoints']) + expected_args = set(["n_datapoints"]) for k, v in kw.items(): expected_args.discard(k) - if k == 'n_datapoints': + if k == "n_datapoints": if not isinstance(v, int): - raise ValidationError('n_datapoints needs to be an integer') + raise ValidationError("n_datapoints needs to be an integer") if v <= 0: - raise ValidationError('n_datapoints need to be >= 1') + raise ValidationError("n_datapoints need to be >= 1") else: - raise ValidationError(f'Unknown argument for sampler: {k}') + raise ValidationError(f"Unknown argument for sampler: {k}") if len(expected_args) > 0: - raise ValidationError('The following sampler arguments are required: ' + ', '.join(expected_args)) + raise ValidationError( + "The following sampler arguments are required: " + + ", ".join(expected_args) + ) return True def run_sampling(self): @@ -74,33 +80,44 @@ def run_sampling(self): :param n_datapoints: Number of datapoints to sample from the distributions. 
:return: Absolute path to the output buildstock.csv file """ - sample_number = self.cfg['baseline'].get('n_datapoints', 350000) + sample_number = self.cfg["baseline"].get("n_datapoints", 350000) if isinstance(self.n_datapoints, int): sample_number = self.n_datapoints - logging.debug(f'Sampling, number of data points is {sample_number}') + logging.debug(f"Sampling, number of data points is {sample_number}") tsv_hash = {} for tsv_file in os.listdir(self.buildstock_dir): - if '.tsv' in tsv_file: - tsv_df = read_csv(os.path.join(self.buildstock_dir, tsv_file), sep='\t') - dependency_columns = [item for item in list(tsv_df) if 'Dependency=' in item] - tsv_df[dependency_columns] = tsv_df[dependency_columns].astype('str') - tsv_hash[tsv_file.replace('.tsv', '')] = tsv_df + if ".tsv" in tsv_file: + tsv_df = read_csv(os.path.join(self.buildstock_dir, tsv_file), sep="\t") + dependency_columns = [ + item for item in list(tsv_df) if "Dependency=" in item + ] + tsv_df[dependency_columns] = tsv_df[dependency_columns].astype("str") + tsv_hash[tsv_file.replace(".tsv", "")] = tsv_df dependency_hash, attr_order = self._com_order_tsvs(tsv_hash) - sample_matrix = self._com_execute_sobol_sampling(attr_order.__len__(), sample_number) + sample_matrix = self._com_execute_sobol_sampling( + attr_order.__len__(), sample_number + ) csv_path = self.csv_path - header = 'Building,' + header = "Building," for item in attr_order: - header += str(item) + ',' - header = header[0:-1] + '\n' - with open(csv_path, 'w') as fd: + header += str(item) + "," + header = header[0:-1] + "\n" + with open(csv_path, "w") as fd: fd.write(header) manager = Manager() lock = manager.Lock() - logger.info('Beginning sampling process') + logger.info("Beginning sampling process") n_jobs = cpu_count() * 2 Parallel(n_jobs=n_jobs, verbose=5)( - delayed(self._com_execute_sample)(tsv_hash, dependency_hash, attr_order, sample_matrix, index, csv_path, - lock) + delayed(self._com_execute_sample)( + tsv_hash, + dependency_hash, + attr_order, + sample_matrix, + index, + csv_path, + lock, + ) for index in range(sample_number) ) return csv_path @@ -115,7 +132,9 @@ def _com_execute_sobol_sampling(n_dims, n_samples): :param n_samples: Number of samples to calculate :return: Pandas DataFrame object which contains the low discrepancy result of the sobol algorithm """ - return pd.DataFrame(i4_sobol_generate(n_dims, n_samples, 0)).replace(1.0, 0.999999) + return pd.DataFrame(i4_sobol_generate(n_dims, n_samples, 0)).replace( + 1.0, 0.999999 + ) @staticmethod def _com_order_tsvs(tsv_hash): @@ -127,8 +146,11 @@ def _com_order_tsvs(tsv_hash): """ dependency_hash = {} for attr in tsv_hash.keys(): - dependency_hash[attr] = [item.replace('Dependency=', '') for item in list(tsv_hash[attr]) if - 'Dependency=' in item] + dependency_hash[attr] = [ + item.replace("Dependency=", "") + for item in list(tsv_hash[attr]) + if "Dependency=" in item + ] attr_order = [] for attr in dependency_hash.keys(): if dependency_hash[attr]: @@ -149,11 +171,21 @@ def _com_order_tsvs(tsv_hash): elif max_iterations > 0: max_iterations -= 1 else: - raise RuntimeError('Unable to resolve the dependency tree within the set iteration limit') + raise RuntimeError( + "Unable to resolve the dependency tree within the set iteration limit" + ) return dependency_hash, attr_order @staticmethod - def _com_execute_sample(tsv_hash, dependency_hash, attr_order, sample_matrix, sample_index, csv_path, lock): + def _com_execute_sample( + tsv_hash, + dependency_hash, + attr_order, + sample_matrix, + sample_index, + 
csv_path, + lock, + ): """ This function evaluates a single point in the sample matrix with the provided TSV files & persists the result\ of the sample to the CSV file specified. The provided lock ensures the file is not corrupted by multiple\ @@ -174,27 +206,40 @@ def _com_execute_sample(tsv_hash, dependency_hash, attr_order, sample_matrix, sa tsv_lkup = tsv_hash[attr] tsv_dist_val = sample_vector[attr_index] for dependency in sample_dependency_hash[attr]: - tsv_lkup = tsv_lkup.loc[tsv_lkup.loc[:, 'Dependency=' + dependency] == - sample_dependency_hash[dependency]] - tsv_lkup = tsv_lkup.drop('Dependency=' + dependency, axis=1) + tsv_lkup = tsv_lkup.loc[ + tsv_lkup.loc[:, "Dependency=" + dependency] + == sample_dependency_hash[dependency] + ] + tsv_lkup = tsv_lkup.drop("Dependency=" + dependency, axis=1) if tsv_lkup.shape[0] == 0: - warn('TSV lookup reduced to 0 for {}, index {}, dep hash {}'.format(attr, sample_index, - sample_dependency_hash)) + warn( + "TSV lookup reduced to 0 for {}, index {}, dep hash {}".format( + attr, sample_index, sample_dependency_hash + ) + ) return if tsv_lkup.shape[0] != 1: - raise RuntimeError('Unable to reduce tsv for {} to 1 row, index {}'.format(attr, sample_index)) + raise RuntimeError( + "Unable to reduce tsv for {} to 1 row, index {}".format( + attr, sample_index + ) + ) tsv_lkup_cdf = tsv_lkup.values.cumsum() > tsv_dist_val - option_values = [item.replace('Option=', '') for item in list(tsv_lkup) if 'Option=' in item] + option_values = [ + item.replace("Option=", "") + for item in list(tsv_lkup) + if "Option=" in item + ] attr_result = list(compress(option_values, tsv_lkup_cdf))[0] sample_dependency_hash[attr] = attr_result result_vector.append(attr_result) - csv_row = str(sample_index + 1) + ',' + csv_row = str(sample_index + 1) + "," for item in result_vector: - csv_row += str(item) + ',' - csv_row = csv_row[0:-1] + '\n' + csv_row += str(item) + "," + csv_row = csv_row[0:-1] + "\n" lock.acquire() try: - with open(csv_path, 'a') as fd: + with open(csv_path, "a") as fd: fd.write(csv_row) finally: lock.release() diff --git a/buildstockbatch/sampler/downselect.py b/buildstockbatch/sampler/downselect.py index e71578c1..9390d7a4 100644 --- a/buildstockbatch/sampler/downselect.py +++ b/buildstockbatch/sampler/downselect.py @@ -7,6 +7,7 @@ :copyright: (c) 2020 by The Alliance for Sustainable Energy :license: BSD-3 """ + import gzip import logging import math @@ -58,39 +59,42 @@ def __init__(self, parent, n_datapoints, logic, resample=True, **kw): @classmethod def validate_args(cls, project_filename, **kw): - expected_args = set(['logic']) + expected_args = set(["logic"]) extra_kw = {} for k, v in kw.items(): expected_args.discard(k) - if k == 'logic': + if k == "logic": # TODO: do some validation of the logic here. 
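# For orientation, the `logic` argument whose presence is checked here mirrors
# the grammar consumed by downselect_logic() below: a "Column|Value" string is
# an equality test, a list is an implicit AND, and single-key dicts nest with
# 'and', 'or', and 'not'.  A hand-checked sketch (column names invented; the
# DownselectSamplerBase class name is assumed from context):
#
#     import pandas as pd
#     df = pd.DataFrame({"State": ["CO", "CO"], "Vintage": ["<1950", "1980s"]})
#     logic = {"and": ["State|CO", {"not": "Vintage|1980s"}]}
#     mask = DownselectSamplerBase.downselect_logic(df, logic)
#     assert mask.tolist() == [True, False]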
pass - elif k == 'resample': + elif k == "resample": pass else: extra_kw[k] = v if len(expected_args) > 0: - raise ValidationError('The following sampler arguments are required: ' + ', '.join(expected_args)) + raise ValidationError( + "The following sampler arguments are required: " + + ", ".join(expected_args) + ) cls.SUB_SAMPLER_CLASS.validate_args(project_filename, **extra_kw) return True @classmethod def downselect_logic(cls, df, logic): if isinstance(logic, dict): - assert (len(logic) == 1) + assert len(logic) == 1 key = list(logic.keys())[0] values = logic[key] - if key == 'and': + if key == "and": retval = cls.downselect_logic(df, values[0]) for value in values[1:]: retval &= cls.downselect_logic(df, value) return retval - elif key == 'or': + elif key == "or": retval = cls.downselect_logic(df, values[0]) for value in values[1:]: retval |= cls.downselect_logic(df, value) return retval - elif key == 'not': + elif key == "not": return ~cls.downselect_logic(df, values) elif isinstance(logic, list): retval = cls.downselect_logic(df, logic[0]) @@ -98,32 +102,42 @@ def downselect_logic(cls, df, logic): retval &= cls.downselect_logic(df, value) return retval elif isinstance(logic, str): - key, value = logic.split('|') + key, value = logic.split("|") return df[key] == value def run_sampling(self): if self.resample: - logger.debug('Performing initial sampling to figure out number of samples for downselect') + logger.debug( + "Performing initial sampling to figure out number of samples for downselect" + ) n_samples_init = 350000 - init_sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples_init, **self.sub_kw) + init_sampler = self.SUB_SAMPLER_CLASS( + self.parent(), n_datapoints=n_samples_init, **self.sub_kw + ) buildstock_csv_filename = init_sampler.run_sampling() df = read_csv(buildstock_csv_filename, index_col=0, dtype=str) df_new = df[self.downselect_logic(df, self.logic)] downselected_n_samples_init = df_new.shape[0] - n_samples = math.ceil(self.n_datapoints * n_samples_init / downselected_n_samples_init) + n_samples = math.ceil( + self.n_datapoints * n_samples_init / downselected_n_samples_init + ) os.remove(buildstock_csv_filename) del init_sampler else: n_samples = self.n_datapoints - sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples, **self.sub_kw) + sampler = self.SUB_SAMPLER_CLASS( + self.parent(), n_datapoints=n_samples, **self.sub_kw + ) buildstock_csv_filename = sampler.run_sampling() - with gzip.open(os.path.splitext(buildstock_csv_filename)[0] + '_orig.csv.gz', 'wb') as f_out: - with open(buildstock_csv_filename, 'rb') as f_in: + with gzip.open( + os.path.splitext(buildstock_csv_filename)[0] + "_orig.csv.gz", "wb" + ) as f_out: + with open(buildstock_csv_filename, "rb") as f_in: shutil.copyfileobj(f_in, f_out) - df = read_csv(buildstock_csv_filename, index_col=0, dtype='str') + df = read_csv(buildstock_csv_filename, index_col=0, dtype="str") df_new = df[self.downselect_logic(df, self.logic)] if len(df_new.index) == 0: - raise RuntimeError('There are no buildings left after the down select!') + raise RuntimeError("There are no buildings left after the down select!") if self.resample: old_index_name = df_new.index.name df_new.index = np.arange(len(df_new)) + 1 diff --git a/buildstockbatch/sampler/precomputed.py b/buildstockbatch/sampler/precomputed.py index f42ca8ad..f1ae7a00 100644 --- a/buildstockbatch/sampler/precomputed.py +++ b/buildstockbatch/sampler/precomputed.py @@ -38,16 +38,16 @@ def __init__(self, parent, sample_file): 
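# Stepping back to the downselect run_sampling() hunk above: the resampling
# arithmetic scales the second sampling pass so that roughly n_datapoints rows
# survive the filter.  A worked example with invented round numbers: if an
# initial 350,000-sample pass keeps 70,000 rows after downselection (20%
# survive), a request for 8,000 final datapoints resamples at

import math

assert math.ceil(8000 * 350000 / 70000) == 40000  # samples for the second pass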
@classmethod def validate_args(cls, project_filename, **kw): - expected_args = set(['sample_file']) + expected_args = set(["sample_file"]) for k, v in kw.items(): expected_args.discard(k) - if k == 'sample_file': + if k == "sample_file": if not isinstance(v, str): - raise ValidationError('sample_file should be a path string') + raise ValidationError("sample_file should be a path string") if not os.path.exists(path_rel_to_file(project_filename, v)): - raise ValidationError(f'sample_file doesn\'t exist: {v}') + raise ValidationError(f"sample_file doesn't exist: {v}") else: - raise ValidationError(f'Unknown argument for sampler: {k}') + raise ValidationError(f"Unknown argument for sampler: {k}") return True def run_sampling(self): diff --git a/buildstockbatch/sampler/residential_quota.py b/buildstockbatch/sampler/residential_quota.py index 9789327a..c94f264a 100644 --- a/buildstockbatch/sampler/residential_quota.py +++ b/buildstockbatch/sampler/residential_quota.py @@ -7,6 +7,7 @@ :copyright: (c) 2020 by The Alliance for Sustainable Energy :license: BSD-3 """ + import docker import logging import os @@ -39,68 +40,79 @@ def __init__(self, parent, n_datapoints): @classmethod def validate_args(cls, project_filename, **kw): - expected_args = set(['n_datapoints']) + expected_args = set(["n_datapoints"]) for k, v in kw.items(): expected_args.discard(k) - if k == 'n_datapoints': + if k == "n_datapoints": if not isinstance(v, int): - raise ValidationError('n_datapoints needs to be an integer') + raise ValidationError("n_datapoints needs to be an integer") if v <= 0: - raise ValidationError('n_datapoints need to be >= 1') + raise ValidationError("n_datapoints need to be >= 1") else: - raise ValidationError(f'Unknown argument for sampler: {k}') + raise ValidationError(f"Unknown argument for sampler: {k}") if len(expected_args) > 0: - raise ValidationError('The following sampler arguments are required: ' + ', '.join(expected_args)) + raise ValidationError( + "The following sampler arguments are required: " + + ", ".join(expected_args) + ) return True def _run_sampling_docker(self): docker_client = docker.DockerClient.from_env() tick = time.time() extra_kws = {} - if sys.platform.startswith('linux'): - extra_kws['user'] = f'{os.getuid()}:{os.getgid()}' + if sys.platform.startswith("linux"): + extra_kws["user"] = f"{os.getuid()}:{os.getgid()}" container_output = docker_client.containers.run( self.parent().docker_image, [ - 'ruby', - 'resources/run_sampling.rb', - '-p', self.cfg['project_directory'], - '-n', str(self.n_datapoints), - '-o', 'buildstock.csv' + "ruby", + "resources/run_sampling.rb", + "-p", + self.cfg["project_directory"], + "-n", + str(self.n_datapoints), + "-o", + "buildstock.csv", ], remove=True, volumes={ - self.buildstock_dir: {'bind': '/var/simdata/openstudio', 'mode': 'rw'} + self.buildstock_dir: {"bind": "/var/simdata/openstudio", "mode": "rw"} }, - name='buildstock_sampling', - **extra_kws + name="buildstock_sampling", + **extra_kws, ) tick = time.time() - tick - for line in container_output.decode('utf-8').split('\n'): + for line in container_output.decode("utf-8").split("\n"): logger.debug(line) - logger.debug('Sampling took {:.1f} seconds'.format(tick)) + logger.debug("Sampling took {:.1f} seconds".format(tick)) destination_filename = self.csv_path if os.path.exists(destination_filename): os.remove(destination_filename) shutil.move( - os.path.join(self.buildstock_dir, 'resources', 'buildstock.csv'), - destination_filename + os.path.join(self.buildstock_dir, "resources", 
"buildstock.csv"), + destination_filename, ) return destination_filename def _run_sampling_singularity(self): args = [ - 'singularity', - 'exec', - '--contain', - '--home', '{}:/buildstock'.format(self.buildstock_dir), - '--bind', '{}:/outbind'.format(os.path.dirname(self.csv_path)), + "singularity", + "exec", + "--contain", + "--home", + "{}:/buildstock".format(self.buildstock_dir), + "--bind", + "{}:/outbind".format(os.path.dirname(self.csv_path)), self.parent().singularity_image, - 'ruby', - 'resources/run_sampling.rb', - '-p', self.cfg['project_directory'], - '-n', str(self.n_datapoints), - '-o', '../../outbind/{}'.format(os.path.basename(self.csv_path)) + "ruby", + "resources/run_sampling.rb", + "-p", + self.cfg["project_directory"], + "-n", + str(self.n_datapoints), + "-o", + "../../outbind/{}".format(os.path.basename(self.csv_path)), ] logger.debug(f"Starting singularity sampling with command: {' '.join(args)}") subprocess.run(args, check=True, env=os.environ, cwd=self.parent().output_dir) @@ -111,20 +123,23 @@ def _run_sampling_local_openstudio(self): subprocess.run( [ self.parent().openstudio_exe(), - str(pathlib.Path('resources', 'run_sampling.rb')), - '-p', self.cfg['project_directory'], - '-n', str(self.n_datapoints), - '-o', 'buildstock.csv' + str(pathlib.Path("resources", "run_sampling.rb")), + "-p", + self.cfg["project_directory"], + "-n", + str(self.n_datapoints), + "-o", + "buildstock.csv", ], cwd=self.buildstock_dir, - check=True + check=True, ) destination_filename = pathlib.Path(self.csv_path) if destination_filename.exists(): os.remove(destination_filename) shutil.move( - pathlib.Path(self.buildstock_dir, 'resources', 'buildstock.csv'), - destination_filename + pathlib.Path(self.buildstock_dir, "resources", "buildstock.csv"), + destination_filename, ) return destination_filename diff --git a/buildstockbatch/sampler/sobol_lib.py b/buildstockbatch/sampler/sobol_lib.py index 8a015c3b..01dd21fb 100644 --- a/buildstockbatch/sampler/sobol_lib.py +++ b/buildstockbatch/sampler/sobol_lib.py @@ -58,11 +58,11 @@ def i4_bit_hi1(n): # i = int(n) bit = 0 - while (True): - if (i <= 0): + while True: + if i <= 0: break bit += 1 - i = (i // 2) + i = i // 2 return bit @@ -119,10 +119,10 @@ def i4_bit_lo0(n): # bit = 0 i = int(n) - while (1): + while 1: bit = bit + 1 - i2 = (i // 2) - if (i == 2 * i2): + i2 = i // 2 + if i == 2 * i2: break i = i2 @@ -242,162 +242,424 @@ def i4_sobol(dim_num, seed): global seed_save global v - if (not 'initialized' in globals().keys()): + if not "initialized" in globals().keys(): initialized = 0 dim_num_save = -1 - if (not initialized or dim_num != dim_num_save): + if not initialized or dim_num != dim_num_save: initialized = 1 dim_max = 40 dim_num_save = -1 log_max = 30 seed_save = -1 # - # Initialize (part of) V. + # Initialize (part of) V. 
# v = zeros((dim_max, log_max)) - v[0:40, 0] = transpose([ \ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) - - v[2:40, 1] = transpose([ \ - 1, 3, 1, 3, 1, 3, 3, 1, \ - 3, 1, 3, 1, 3, 1, 1, 3, 1, 3, \ - 1, 3, 1, 3, 3, 1, 3, 1, 3, 1, \ - 3, 1, 1, 3, 1, 3, 1, 3, 1, 3]) - - v[3:40, 2] = transpose([ \ - 7, 5, 1, 3, 3, 7, 5, \ - 5, 7, 7, 1, 3, 3, 7, 5, 1, 1, \ - 5, 3, 3, 1, 7, 5, 1, 3, 3, 7, \ - 5, 1, 1, 5, 7, 7, 5, 1, 3, 3]) - - v[5:40, 3] = transpose([ \ - 1, 7, 9, 13, 11, \ - 1, 3, 7, 9, 5, 13, 13, 11, 3, 15, \ - 5, 3, 15, 7, 9, 13, 9, 1, 11, 7, \ - 5, 15, 1, 15, 11, 5, 3, 1, 7, 9]) - - v[7:40, 4] = transpose([ \ - 9, 3, 27, \ - 15, 29, 21, 23, 19, 11, 25, 7, 13, 17, \ - 1, 25, 29, 3, 31, 11, 5, 23, 27, 19, \ - 21, 5, 1, 17, 13, 7, 15, 9, 31, 9]) - - v[13:40, 5] = transpose([ \ - 37, 33, 7, 5, 11, 39, 63, \ - 27, 17, 15, 23, 29, 3, 21, 13, 31, 25, \ - 9, 49, 33, 19, 29, 11, 19, 27, 15, 25]) - - v[19:40, 6] = transpose([ \ - 13, \ - 33, 115, 41, 79, 17, 29, 119, 75, 73, 105, \ - 7, 59, 65, 21, 3, 113, 61, 89, 45, 107]) - - v[37:40, 7] = transpose([ \ - 7, 23, 39]) + v[0:40, 0] = transpose( + [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + ] + ) + + v[2:40, 1] = transpose( + [ + 1, + 3, + 1, + 3, + 1, + 3, + 3, + 1, + 3, + 1, + 3, + 1, + 3, + 1, + 1, + 3, + 1, + 3, + 1, + 3, + 1, + 3, + 3, + 1, + 3, + 1, + 3, + 1, + 3, + 1, + 1, + 3, + 1, + 3, + 1, + 3, + 1, + 3, + ] + ) + + v[3:40, 2] = transpose( + [ + 7, + 5, + 1, + 3, + 3, + 7, + 5, + 5, + 7, + 7, + 1, + 3, + 3, + 7, + 5, + 1, + 1, + 5, + 3, + 3, + 1, + 7, + 5, + 1, + 3, + 3, + 7, + 5, + 1, + 1, + 5, + 7, + 7, + 5, + 1, + 3, + 3, + ] + ) + + v[5:40, 3] = transpose( + [ + 1, + 7, + 9, + 13, + 11, + 1, + 3, + 7, + 9, + 5, + 13, + 13, + 11, + 3, + 15, + 5, + 3, + 15, + 7, + 9, + 13, + 9, + 1, + 11, + 7, + 5, + 15, + 1, + 15, + 11, + 5, + 3, + 1, + 7, + 9, + ] + ) + + v[7:40, 4] = transpose( + [ + 9, + 3, + 27, + 15, + 29, + 21, + 23, + 19, + 11, + 25, + 7, + 13, + 17, + 1, + 25, + 29, + 3, + 31, + 11, + 5, + 23, + 27, + 19, + 21, + 5, + 1, + 17, + 13, + 7, + 15, + 9, + 31, + 9, + ] + ) + + v[13:40, 5] = transpose( + [ + 37, + 33, + 7, + 5, + 11, + 39, + 63, + 27, + 17, + 15, + 23, + 29, + 3, + 21, + 13, + 31, + 25, + 9, + 49, + 33, + 19, + 29, + 11, + 19, + 27, + 15, + 25, + ] + ) + + v[19:40, 6] = transpose( + [ + 13, + 33, + 115, + 41, + 79, + 17, + 29, + 119, + 75, + 73, + 105, + 7, + 59, + 65, + 21, + 3, + 113, + 61, + 89, + 45, + 107, + ] + ) + + v[37:40, 7] = transpose([7, 23, 39]) # - # Set POLY. + # Set POLY. # - poly = [ \ - 1, 3, 7, 11, 13, 19, 25, 37, 59, 47, \ - 61, 55, 41, 67, 97, 91, 109, 103, 115, 131, \ - 193, 137, 145, 143, 241, 157, 185, 167, 229, 171, \ - 213, 191, 253, 203, 211, 239, 247, 285, 369, 299] - - atmost = 2 ** log_max - 1 + poly = [ + 1, + 3, + 7, + 11, + 13, + 19, + 25, + 37, + 59, + 47, + 61, + 55, + 41, + 67, + 97, + 91, + 109, + 103, + 115, + 131, + 193, + 137, + 145, + 143, + 241, + 157, + 185, + 167, + 229, + 171, + 213, + 191, + 253, + 203, + 211, + 239, + 247, + 285, + 369, + 299, + ] + + atmost = 2**log_max - 1 # - # Find the number of bits in ATMOST. + # Find the number of bits in ATMOST. # maxcol = i4_bit_hi1(atmost) # - # Initialize row 1 of V. + # Initialize row 1 of V. # v[0, 0:maxcol] = 1 # - # Things to do only if the dimension changed. 
+ # Things to do only if the dimension changed. # - if (dim_num != dim_num_save): + if dim_num != dim_num_save: # - # Check parameters. + # Check parameters. # - if (dim_num < 1 or dim_max < dim_num): - print('I4_SOBOL - Fatal error!') - print(' The spatial dimension DIM_NUM should satisfy:') - print(' 1 <= DIM_NUM <= %d' % dim_max) - print(' But this input value is DIM_NUM = %d' % dim_num) + if dim_num < 1 or dim_max < dim_num: + print("I4_SOBOL - Fatal error!") + print(" The spatial dimension DIM_NUM should satisfy:") + print(" 1 <= DIM_NUM <= %d" % dim_max) + print(" But this input value is DIM_NUM = %d" % dim_num) return dim_num_save = dim_num # - # Initialize the remaining rows of V. + # Initialize the remaining rows of V. # for i in range(2, dim_num + 1): # - # The bits of the integer POLY(I) gives the form of polynomial I. + # The bits of the integer POLY(I) gives the form of polynomial I. # - # Find the degree of polynomial I from binary encoding. + # Find the degree of polynomial I from binary encoding. # j = poly[i - 1] m = 0 - while (1): - j = math.floor(j / 2.) - if (j <= 0): + while 1: + j = math.floor(j / 2.0) + if j <= 0: break m = m + 1 # - # Expand this bit pattern to separate components of the logical array INCLUD. + # Expand this bit pattern to separate components of the logical array INCLUD. # j = poly[i - 1] includ = zeros(m) for k in range(m, 0, -1): - j2 = math.floor(j / 2.) - includ[k - 1] = (j != 2 * j2) + j2 = math.floor(j / 2.0) + includ[k - 1] = j != 2 * j2 j = j2 # - # Calculate the remaining elements of row I as explained - # in Bratley and Fox, section 2. + # Calculate the remaining elements of row I as explained + # in Bratley and Fox, section 2. # for j in range(m + 1, maxcol + 1): newv = v[i - 1, j - m - 1] l = 1 for k in range(1, m + 1): l = 2 * l - if (includ[k - 1]): + if includ[k - 1]: newv = bitwise_xor(int(newv), int(l * v[i - 1, j - k - 1])) v[i - 1, j - 1] = newv # - # Multiply columns of V by appropriate power of 2. + # Multiply columns of V by appropriate power of 2. # l = 1 for j in range(maxcol - 1, 0, -1): l = 2 * l v[0:dim_num, j - 1] = v[0:dim_num, j - 1] * l # - # RECIPD is 1/(common denominator of the elements in V). + # RECIPD is 1/(common denominator of the elements in V). # recipd = 1.0 / (2 * l) lastq = zeros(dim_num) seed = int(math.floor(seed)) - if (seed < 0): + if seed < 0: seed = 0 - if (seed == 0): + if seed == 0: l = 1 lastq = zeros(dim_num) - elif (seed == seed_save + 1): + elif seed == seed_save + 1: # - # Find the position of the right-hand zero in SEED. + # Find the position of the right-hand zero in SEED. # l = i4_bit_lo0(seed) - elif (seed <= seed_save): + elif seed <= seed_save: seed_save = 0 l = 1 @@ -410,7 +672,7 @@ def i4_sobol(dim_num, seed): l = i4_bit_lo0(seed) - elif (seed_save + 1 < seed): + elif seed_save + 1 < seed: for seed_temp in range(int(seed_save + 1), int(seed)): l = i4_bit_lo0(seed_temp) @@ -419,16 +681,16 @@ def i4_sobol(dim_num, seed): l = i4_bit_lo0(seed) # - # Check that the user is not calling too many times! + # Check that the user is not calling too many times! # - if (maxcol < l): - print('I4_SOBOL - Fatal error!') - print(' Too many calls!') - print(' MAXCOL = %d\n' % maxcol) - print(' L = %d\n' % l) + if maxcol < l: + print("I4_SOBOL - Fatal error!") + print(" Too many calls!") + print(" MAXCOL = %d\n" % maxcol) + print(" L = %d\n" % l) return # - # Calculate the new components of QUASI. + # Calculate the new components of QUASI. 
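# The lines that follow implement the Antonov-Saleev (Gray-code) step: each
# dimension's integer state is XORed with one direction number, selected by
# i4_bit_lo0 of the previous seed (the 1-based position of its lowest zero
# bit).  A one-dimensional, self-contained sketch of the recurrence;
# [8, 4, 2, 1] are the scaled dimension-one direction numbers for maxcol = 4:

lastq, denom = 0, 2**4
directions = [8, 4, 2, 1]
points = []
for seed in range(4):
    low0 = 1
    while (seed >> (low0 - 1)) & 1:  # equivalent to i4_bit_lo0(seed)
        low0 += 1
    lastq ^= directions[low0 - 1]    # Gray-code XOR update
    points.append(lastq / denom)
assert points == [0.5, 0.75, 0.25, 0.375]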
# quasi = zeros(dim_num) for i in range(1, dim_num + 1): @@ -498,9 +760,9 @@ def i4_uniform(a, b, seed): # # Output, integer SEED, the updated seed. # - if (seed == 0): - print('I4_UNIFORM - Fatal error!') - print(' Input SEED = 0!') + if seed == 0: + print("I4_UNIFORM - Fatal error!") + print(" Input SEED = 0!") seed = math.floor(seed) a = round(a) @@ -508,23 +770,23 @@ def i4_uniform(a, b, seed): seed = mod(seed, 2147483647) - if (seed < 0): + if seed < 0: seed = seed + 2147483647 k = math.floor(seed / 127773) seed = 16807 * (seed - k * 127773) - k * 2836 - if (seed < 0): + if seed < 0: seed = seed + 2147483647 - r = seed * 4.656612875E-10 + r = seed * 4.656612875e-10 # - # Scale R to lie between A-0.5 and B+0.5. + # Scale R to lie between A-0.5 and B+0.5. # r = (1.0 - r) * (min(a, b) - 0.5) + r * (max(a, b) + 0.5) # - # Use rounding to convert R to an integer between A and B. + # Use rounding to convert R to an integer between A and B. # value = round(r) @@ -578,7 +840,7 @@ def prime_ge(n): # than or equal to N. # p = max(math.ceil(n), 2) - while (not isprime(p)): + while not isprime(p): p = p + 1 return p diff --git a/buildstockbatch/test/conftest.py b/buildstockbatch/test/conftest.py index c1e9e460..54a50d37 100644 --- a/buildstockbatch/test/conftest.py +++ b/buildstockbatch/test/conftest.py @@ -5,68 +5,78 @@ import yaml from pathlib import Path -OUTPUT_FOLDER_NAME = 'output' +OUTPUT_FOLDER_NAME = "output" @pytest.fixture def basic_residential_project_file(): with tempfile.TemporaryDirectory() as test_directory: + def _basic_residential_project_file(update_args={}, raw=False): output_dir = "simulations_job0" if raw else "simulation_output" - buildstock_directory = os.path.join(test_directory, 'openstudio_buildstock') + buildstock_directory = os.path.join(test_directory, "openstudio_buildstock") shutil.copytree( - os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_inputs', 'test_openstudio_buildstock'), - buildstock_directory + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "test_inputs", + "test_openstudio_buildstock", + ), + buildstock_directory, ) - project_directory = 'project_resstock_national' + project_directory = "project_resstock_national" os.makedirs(os.path.join(buildstock_directory, project_directory)) output_directory = os.path.join(test_directory, OUTPUT_FOLDER_NAME) shutil.copytree( - os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_results', output_dir), - os.path.join(output_directory, 'simulation_output') + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "test_results", + output_dir, + ), + os.path.join(output_directory, "simulation_output"), ) # move the job*.json file to appropriate location - if os.path.exists(os.path.join(output_directory, 'simulation_output', 'job0.json')): - shutil.move(os.path.join(output_directory, 'simulation_output', 'job0.json'), - os.path.join(output_directory, 'simulation_output', '..', '..', 'job0.json')) + if os.path.exists( + os.path.join(output_directory, "simulation_output", "job0.json") + ): + shutil.move( + os.path.join(output_directory, "simulation_output", "job0.json"), + os.path.join( + output_directory, "simulation_output", "..", "..", "job0.json" + ), + ) - os.mkdir(os.path.join(output_directory, 'housing_characteristics')) - os.mkdir(os.path.join(buildstock_directory, project_directory, 'housing_characteristics')) + os.mkdir(os.path.join(output_directory, "housing_characteristics")) + os.mkdir( + os.path.join( + buildstock_directory, project_directory, 
"housing_characteristics" + ) + ) cfg = { - 'buildstock_directory': buildstock_directory, - 'project_directory': project_directory, - 'output_directory': output_directory, - 'weather_files_url': 'https://s3.amazonaws.com/epwweatherfiles/project_resstock_national.zip', - 'sampler': { - 'type': 'residential_quota', - 'args': { - 'n_datapoints': 8 - } - }, - 'workflow_generator': { - 'type': 'residential_hpxml', - 'args': { + "buildstock_directory": buildstock_directory, + "project_directory": project_directory, + "output_directory": output_directory, + "weather_files_url": "https://s3.amazonaws.com/epwweatherfiles/project_resstock_national.zip", + "sampler": {"type": "residential_quota", "args": {"n_datapoints": 8}}, + "workflow_generator": { + "type": "residential_hpxml", + "args": { "build_existing_model": { "simulation_control_timestep": 60, "simulation_control_run_period_begin_month": 1, "simulation_control_run_period_begin_day_of_month": 1, "simulation_control_run_period_end_month": 12, "simulation_control_run_period_end_day_of_month": 31, - "simulation_control_run_period_calendar_year": 2007 + "simulation_control_run_period_calendar_year": 2007, }, "emissions": [ { "scenario_name": "LRMER_MidCase_15", "type": "CO2e", - "elec_folder": "data/cambium/LRMER_MidCase_15" - } - ], - "utility_bills": [ - { - "scenario_name": "Bills" + "elec_folder": "data/cambium/LRMER_MidCase_15", } ], + "utility_bills": [{"scenario_name": "Bills"}], "simulation_output_report": { "timeseries_frequency": "hourly", "include_timeseries_total_consumptions": True, @@ -81,36 +91,34 @@ def _basic_residential_project_file(update_args={}, raw=False): "include_timeseries_unmet_hours": True, "include_timeseries_zone_temperatures": True, "include_timeseries_airflows": True, - "include_timeseries_weather": True + "include_timeseries_weather": True, }, "reporting_measures": [], "server_directory_cleanup": { "retain_in_idf": False, - "retain_schedules_csv": False - } - } + "retain_schedules_csv": False, + }, + }, }, - 'baseline': { - 'n_buildings_represented': 80000000, + "baseline": { + "n_buildings_represented": 80000000, }, - 'upgrades': [{ - 'upgrade_name': 'Upgrade1', - 'options': [ - {'option': 'Infiltration|11.25 ACH50'} - ] - }], - 'eagle': { - 'sampling': { - 'time': 20 - }, - 'account': 'testaccount', - 'minutes_per_sim': 1 + "upgrades": [ + { + "upgrade_name": "Upgrade1", + "options": [{"option": "Infiltration|11.25 ACH50"}], + } + ], + "eagle": { + "sampling": {"time": 20}, + "account": "testaccount", + "minutes_per_sim": 1, }, - 'schema_version': '0.3' + "schema_version": "0.3", } cfg.update(update_args) - project_filename = os.path.join(test_directory, 'project.yml') - with open(project_filename, 'w') as f: + project_filename = os.path.join(test_directory, "project.yml") + with open(project_filename, "w") as f: yaml.dump(cfg, f) return project_filename, output_directory diff --git a/buildstockbatch/test/shared_testing_stuff.py b/buildstockbatch/test/shared_testing_stuff.py index e988f6f9..4e33ac43 100644 --- a/buildstockbatch/test/shared_testing_stuff.py +++ b/buildstockbatch/test/shared_testing_stuff.py @@ -4,9 +4,11 @@ resstock_directory = pathlib.Path( - os.environ.get("RESSTOCK_DIR", pathlib.Path(__file__).resolve().parent.parent.parent.parent / "resstock") + os.environ.get( + "RESSTOCK_DIR", + pathlib.Path(__file__).resolve().parent.parent.parent.parent / "resstock", + ) ) resstock_required = pytest.mark.skipif( - not resstock_directory.exists(), - reason="ResStock checkout is not found" + not 
resstock_directory.exists(), reason="ResStock checkout is not found" ) diff --git a/buildstockbatch/test/test_aws.py b/buildstockbatch/test/test_aws.py index e7f4c24d..322885e0 100644 --- a/buildstockbatch/test/test_aws.py +++ b/buildstockbatch/test/test_aws.py @@ -5,7 +5,7 @@ from buildstockbatch.aws.aws import AwsBatch here = os.path.dirname(os.path.abspath(__file__)) -logging.basicConfig(level='DEBUG') # Use DEBUG, INFO, or WARNING +logging.basicConfig(level="DEBUG") # Use DEBUG, INFO, or WARNING logger = logging.getLogger(__name__) @@ -13,40 +13,42 @@ def test_custom_gem_install(basic_residential_project_file): project_filename, results_dir = basic_residential_project_file() # Add aws and custom_gems to the project file - with open(project_filename, 'r') as f: + with open(project_filename, "r") as f: cfg = yaml.safe_load(f) # custom_gems - cfg['baseline']['custom_gems'] = True + cfg["baseline"]["custom_gems"] = True # AWS - cfg['aws'] = {} - cfg['aws']['job_identifier'] = 'testaws' - cfg['aws']['s3'] = {} - cfg['aws']['s3']['bucket'] = 'resbldg-datasets' - cfg['aws']['s3']['prefix'] = 'testing/external_demo_project' - cfg['aws']['emr'] = {} - cfg['aws']['emr']['manager_instance_type'] = 'm5.xlarge' - cfg['aws']['emr']['worker_instance_type'] = 'r5.4xlarge' - cfg['aws']['emr']['worker_instance_count'] = 1 - cfg['aws']['region'] = 'us-west-2' - cfg['aws']['use_spot'] = True - cfg['aws']['batch_array_size'] = 100 - cfg['aws']['notifications_email'] = 'user@example.com' - with open(project_filename, 'w') as f: + cfg["aws"] = {} + cfg["aws"]["job_identifier"] = "testaws" + cfg["aws"]["s3"] = {} + cfg["aws"]["s3"]["bucket"] = "resbldg-datasets" + cfg["aws"]["s3"]["prefix"] = "testing/external_demo_project" + cfg["aws"]["emr"] = {} + cfg["aws"]["emr"]["manager_instance_type"] = "m5.xlarge" + cfg["aws"]["emr"]["worker_instance_type"] = "r5.4xlarge" + cfg["aws"]["emr"]["worker_instance_count"] = 1 + cfg["aws"]["region"] = "us-west-2" + cfg["aws"]["use_spot"] = True + cfg["aws"]["batch_array_size"] = 100 + cfg["aws"]["notifications_email"] = "user@example.com" + with open(project_filename, "w") as f: yaml.dump(cfg, f) - buildstock_directory = cfg['buildstock_directory'] + buildstock_directory = cfg["buildstock_directory"] batch = AwsBatch(project_filename) batch.build_image() - gem_list_log_log_path = os.path.join(buildstock_directory, - 'resources', - '.aws_docker_image', - 'openstudio_gem_list_output.log') + gem_list_log_log_path = os.path.join( + buildstock_directory, + "resources", + ".aws_docker_image", + "openstudio_gem_list_output.log", + ) assert os.path.exists(gem_list_log_log_path) - with open(gem_list_log_log_path, 'r') as gem_list: + with open(gem_list_log_log_path, "r") as gem_list: contents = gem_list.read() - custom_gem = '/var/oscli/gems/ruby/2.7.0/gems/openstudio-standards-0.2.0' + custom_gem = "/var/oscli/gems/ruby/2.7.0/gems/openstudio-standards-0.2.0" assert custom_gem in contents @@ -54,36 +56,38 @@ def test_no_custom_gem_install(basic_residential_project_file): project_filename, results_dir = basic_residential_project_file() # Add aws to the project file - with open(project_filename, 'r') as f: + with open(project_filename, "r") as f: cfg = yaml.safe_load(f) # AWS - cfg['aws'] = {} - cfg['aws']['job_identifier'] = 'testaws' - cfg['aws']['s3'] = {} - cfg['aws']['s3']['bucket'] = 'resbldg-datasets' - cfg['aws']['s3']['prefix'] = 'testing/external_demo_project' - cfg['aws']['emr'] = {} - cfg['aws']['emr']['manager_instance_type'] = 'm5.xlarge' - 
cfg['aws']['emr']['worker_instance_type'] = 'r5.4xlarge' - cfg['aws']['emr']['worker_instance_count'] = 1 - cfg['aws']['region'] = 'us-west-2' - cfg['aws']['use_spot'] = True - cfg['aws']['batch_array_size'] = 100 - cfg['aws']['notifications_email'] = 'user@example.com' - with open(project_filename, 'w') as f: + cfg["aws"] = {} + cfg["aws"]["job_identifier"] = "testaws" + cfg["aws"]["s3"] = {} + cfg["aws"]["s3"]["bucket"] = "resbldg-datasets" + cfg["aws"]["s3"]["prefix"] = "testing/external_demo_project" + cfg["aws"]["emr"] = {} + cfg["aws"]["emr"]["manager_instance_type"] = "m5.xlarge" + cfg["aws"]["emr"]["worker_instance_type"] = "r5.4xlarge" + cfg["aws"]["emr"]["worker_instance_count"] = 1 + cfg["aws"]["region"] = "us-west-2" + cfg["aws"]["use_spot"] = True + cfg["aws"]["batch_array_size"] = 100 + cfg["aws"]["notifications_email"] = "user@example.com" + with open(project_filename, "w") as f: yaml.dump(cfg, f) - buildstock_directory = cfg['buildstock_directory'] + buildstock_directory = cfg["buildstock_directory"] batch = AwsBatch(project_filename) batch.build_image() - gem_list_log_log_path = os.path.join(buildstock_directory, - 'resources', - '.aws_docker_image', - 'openstudio_gem_list_output.log') + gem_list_log_log_path = os.path.join( + buildstock_directory, + "resources", + ".aws_docker_image", + "openstudio_gem_list_output.log", + ) assert os.path.exists(gem_list_log_log_path) - with open(gem_list_log_log_path, 'r') as gem_list: + with open(gem_list_log_log_path, "r") as gem_list: contents = gem_list.read() - custom_gem = '/var/oscli/gems/ruby/2.7.0/gems/openstudio-standards-0.2.0' + custom_gem = "/var/oscli/gems/ruby/2.7.0/gems/openstudio-standards-0.2.0" assert custom_gem not in contents diff --git a/buildstockbatch/test/test_base.py b/buildstockbatch/test/test_base.py index 5c9c0dfe..7ae03173 100644 --- a/buildstockbatch/test/test_base.py +++ b/buildstockbatch/test/test_base.py @@ -22,10 +22,10 @@ from buildstockbatch.postprocessing import write_dataframe_as_parquet from buildstockbatch.utils import read_csv, ContainerRuntime -dask.config.set(scheduler='synchronous') +dask.config.set(scheduler="synchronous") here = os.path.dirname(os.path.abspath(__file__)) -OUTPUT_FOLDER_NAME = 'output' +OUTPUT_FOLDER_NAME = "output" buildstockbatch.postprocessing.performance_report = MagicMock() @@ -34,59 +34,75 @@ def test_reference_scenario(basic_residential_project_file): # verify that the reference_scenario get's added to the upgrade file upgrade_config = { - 'upgrades': [ + "upgrades": [ { - 'upgrade_name': 'Triple-Pane Windows', - 'reference_scenario': 'example_reference_scenario' + "upgrade_name": "Triple-Pane Windows", + "reference_scenario": "example_reference_scenario", } ] } project_filename, results_dir = basic_residential_project_file(upgrade_config) - with patch.object(BuildStockBatchBase, 'weather_dir', None), \ - patch.object(BuildStockBatchBase, 'get_dask_client') as get_dask_client_mock, \ - patch.object(BuildStockBatchBase, 'results_dir', results_dir): + with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( + BuildStockBatchBase, "get_dask_client" + ) as get_dask_client_mock, patch.object( + BuildStockBatchBase, "results_dir", results_dir + ): bsb = BuildStockBatchBase(project_filename) bsb.process_results() get_dask_client_mock.assert_called_once() # test results.csv files - test_path = os.path.join(results_dir, 'results_csvs') - test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).set_index('building_id').sort_index() - assert 
len(test_csv['apply_upgrade.reference_scenario'].unique()) == 1 - assert test_csv['apply_upgrade.reference_scenario'].iloc[0] == 'example_reference_scenario' + test_path = os.path.join(results_dir, "results_csvs") + test_csv = ( + read_csv(os.path.join(test_path, "results_up01.csv.gz")) + .set_index("building_id") + .sort_index() + ) + assert len(test_csv["apply_upgrade.reference_scenario"].unique()) == 1 + assert ( + test_csv["apply_upgrade.reference_scenario"].iloc[0] + == "example_reference_scenario" + ) def test_downselect_integer_options(basic_residential_project_file, mocker): with tempfile.TemporaryDirectory() as buildstock_csv_dir: - buildstock_csv = os.path.join(buildstock_csv_dir, 'buildstock.csv') + buildstock_csv = os.path.join(buildstock_csv_dir, "buildstock.csv") valid_option_values = set() - with open(os.path.join(here, 'buildstock.csv'), 'r', newline='') as f_in, \ - open(buildstock_csv, 'w', newline='') as f_out: + with open(os.path.join(here, "buildstock.csv"), "r", newline="") as f_in, open( + buildstock_csv, "w", newline="" + ) as f_out: cf_in = csv.reader(f_in) cf_out = csv.writer(f_out) for i, row in enumerate(cf_in): if i == 0: - col_idx = row.index('Days Shifted') + col_idx = row.index("Days Shifted") else: # Convert values from "Day1" to "1.10" so we hit the bug - row[col_idx] = '{0}.{0}0'.format(re.search(r'Day(\d+)', row[col_idx]).group(1)) + row[col_idx] = "{0}.{0}0".format( + re.search(r"Day(\d+)", row[col_idx]).group(1) + ) valid_option_values.add(row[col_idx]) cf_out.writerow(row) - project_filename, results_dir = basic_residential_project_file({ - 'sampler': { - 'type': 'residential_quota_downselect', - 'args': { - 'n_datapoints': 8, - 'resample': False, - 'logic': 'Geometry House Size|1500-2499' + project_filename, results_dir = basic_residential_project_file( + { + "sampler": { + "type": "residential_quota_downselect", + "args": { + "n_datapoints": 8, + "resample": False, + "logic": "Geometry House Size|1500-2499", + }, } } - }) - mocker.patch.object(BuildStockBatchBase, 'weather_dir', None) - mocker.patch.object(BuildStockBatchBase, 'results_dir', results_dir) - sampler_property_mock = mocker.patch.object(BuildStockBatchBase, 'sampler', new_callable=PropertyMock) + ) + mocker.patch.object(BuildStockBatchBase, "weather_dir", None) + mocker.patch.object(BuildStockBatchBase, "results_dir", results_dir) + sampler_property_mock = mocker.patch.object( + BuildStockBatchBase, "sampler", new_callable=PropertyMock + ) sampler_mock = mocker.MagicMock() sampler_property_mock.return_value = sampler_mock sampler_mock.run_sampling = MagicMock(return_value=buildstock_csv) @@ -94,51 +110,66 @@ def test_downselect_integer_options(basic_residential_project_file, mocker): bsb = BuildStockBatchBase(project_filename) bsb.sampler.run_sampling() sampler_mock.run_sampling.assert_called_once() - with open(buildstock_csv, 'r', newline='') as f: + with open(buildstock_csv, "r", newline="") as f: cf = csv.DictReader(f) for row in cf: - assert row['Days Shifted'] in valid_option_values + assert row["Days Shifted"] in valid_option_values -@patch('buildstockbatch.postprocessing.boto3') +@patch("buildstockbatch.postprocessing.boto3") def test_upload_files(mocked_boto3, basic_residential_project_file): - s3_bucket = 'test_bucket' - s3_prefix = 'test_prefix' - db_name = 'test_db_name' - role = 'test_role' - region = 'test_region' + s3_bucket = "test_bucket" + s3_prefix = "test_prefix" + db_name = "test_db_name" + role = "test_role" + region = "test_region" upload_config = { - 
'postprocessing': { - 'aws': { - 'region_name': region, - 's3': { - 'bucket': s3_bucket, - 'prefix': s3_prefix, - }, - 'athena': { - 'glue_service_role': role, - 'database_name': db_name, - 'max_crawling_time': 250 - } - } - } - } + "postprocessing": { + "aws": { + "region_name": region, + "s3": { + "bucket": s3_bucket, + "prefix": s3_prefix, + }, + "athena": { + "glue_service_role": role, + "database_name": db_name, + "max_crawling_time": 250, + }, + } + } + } mocked_glueclient = MagicMock() - mocked_glueclient.get_crawler = MagicMock(return_value={'Crawler': {'State': 'READY'}}) + mocked_glueclient.get_crawler = MagicMock( + return_value={"Crawler": {"State": "READY"}} + ) mocked_boto3.client = MagicMock(return_value=mocked_glueclient) - mocked_boto3.resource().Bucket().objects.filter.side_effect = [[], ['a', 'b', 'c']] + mocked_boto3.resource().Bucket().objects.filter.side_effect = [[], ["a", "b", "c"]] project_filename, results_dir = basic_residential_project_file(upload_config) - buildstock_csv_path = Path(results_dir).parent / 'openstudio_buildstock' / 'project_resstock_national' / 'housing_characteristics' / 'buildstock.csv' # noqa: E501 + buildstock_csv_path = ( + Path(results_dir).parent + / "openstudio_buildstock" + / "project_resstock_national" + / "housing_characteristics" + / "buildstock.csv" + ) # noqa: E501 shutil.copy2( - Path(__file__).parent / 'test_results' / 'housing_characteristics' / 'buildstock.csv', - buildstock_csv_path + Path(__file__).parent + / "test_results" + / "housing_characteristics" + / "buildstock.csv", + buildstock_csv_path, ) - with patch.object(BuildStockBatchBase, 'weather_dir', None), \ - patch.object(BuildStockBatchBase, 'output_dir', results_dir), \ - patch.object(BuildStockBatchBase, 'get_dask_client') as get_dask_client_mock, \ - patch.object(BuildStockBatchBase, 'results_dir', results_dir), \ - patch.object(BuildStockBatchBase, 'CONTAINER_RUNTIME', ContainerRuntime.LOCAL_OPENSTUDIO): + with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( + BuildStockBatchBase, "output_dir", results_dir + ), patch.object( + BuildStockBatchBase, "get_dask_client" + ) as get_dask_client_mock, patch.object( + BuildStockBatchBase, "results_dir", results_dir + ), patch.object( + BuildStockBatchBase, "CONTAINER_RUNTIME", ContainerRuntime.LOCAL_OPENSTUDIO + ): bsb = BuildStockBatchBase(project_filename) bsb.process_results() get_dask_client_mock.assert_called_once() @@ -147,158 +178,182 @@ def test_upload_files(mocked_boto3, basic_residential_project_file): crawler_created = False crawler_started = False for call in mocked_boto3.mock_calls[2:] + mocked_boto3.client().mock_calls: - call_function = call[0].split('.')[-1] # 0 is for the function name - if call_function == 'resource': - assert call[1][0] in ['s3'] # call[1] is for the positional arguments - if call_function == 'Bucket': + call_function = call[0].split(".")[-1] # 0 is for the function name + if call_function == "resource": + assert call[1][0] in ["s3"] # call[1] is for the positional arguments + if call_function == "Bucket": assert call[1][0] == s3_bucket - if call_function == 'upload_file': + if call_function == "upload_file": source_file_path = call[1][0] destination_path = call[1][1] files_uploaded.append((source_file_path, destination_path)) - if call_function == 'create_crawler': + if call_function == "create_crawler": crawler_para = call[2] # 2 is for the keyword arguments crawler_created = True - assert crawler_para['DatabaseName'] == 
upload_config['postprocessing']['aws']['athena']['database_name'] - assert crawler_para['Role'] == upload_config['postprocessing']['aws']['athena']['glue_service_role'] - assert crawler_para['TablePrefix'] == OUTPUT_FOLDER_NAME + '_' - assert crawler_para['Name'] == db_name + '_' + OUTPUT_FOLDER_NAME - assert crawler_para['Targets']['S3Targets'][0]['Path'] == 's3://' + s3_bucket + '/' + s3_prefix + '/' + \ - OUTPUT_FOLDER_NAME + '/' - if call_function == 'start_crawler': + assert ( + crawler_para["DatabaseName"] + == upload_config["postprocessing"]["aws"]["athena"]["database_name"] + ) + assert ( + crawler_para["Role"] + == upload_config["postprocessing"]["aws"]["athena"]["glue_service_role"] + ) + assert crawler_para["TablePrefix"] == OUTPUT_FOLDER_NAME + "_" + assert crawler_para["Name"] == db_name + "_" + OUTPUT_FOLDER_NAME + assert ( + crawler_para["Targets"]["S3Targets"][0]["Path"] + == "s3://" + + s3_bucket + + "/" + + s3_prefix + + "/" + + OUTPUT_FOLDER_NAME + + "/" + ) + if call_function == "start_crawler": assert crawler_created, "crawler attempted to start before creating" crawler_started = True crawler_para = call[2] # 2 is for keyboard arguments. - assert crawler_para['Name'] == db_name + '_' + OUTPUT_FOLDER_NAME + assert crawler_para["Name"] == db_name + "_" + OUTPUT_FOLDER_NAME assert crawler_started, "Crawler never started" # check if all the files are properly uploaded - source_path = os.path.join(results_dir, 'parquet') - s3_path = s3_prefix + '/' + OUTPUT_FOLDER_NAME + '/' + source_path = os.path.join(results_dir, "parquet") + s3_path = s3_prefix + "/" + OUTPUT_FOLDER_NAME + "/" - s3_file_path = s3_path + 'baseline/results_up00.parquet' - source_file_path = os.path.join(source_path, 'baseline', 'results_up00.parquet') + s3_file_path = s3_path + "baseline/results_up00.parquet" + source_file_path = os.path.join(source_path, "baseline", "results_up00.parquet") assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) - s3_file_path = s3_path + 'upgrades/upgrade=1/results_up01.parquet' - source_file_path = os.path.join(source_path, 'upgrades', 'upgrade=1', 'results_up01.parquet') + s3_file_path = s3_path + "upgrades/upgrade=1/results_up01.parquet" + source_file_path = os.path.join( + source_path, "upgrades", "upgrade=1", "results_up01.parquet" + ) assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) - s3_file_path = s3_path + 'timeseries/upgrade=0/group0.parquet' - source_file_path = os.path.join(source_path, 'timeseries', 'upgrade=0', 'group0.parquet') + s3_file_path = s3_path + "timeseries/upgrade=0/group0.parquet" + source_file_path = os.path.join( + source_path, "timeseries", "upgrade=0", "group0.parquet" + ) assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) - s3_file_path = s3_path + 'timeseries/upgrade=1/group0.parquet' - source_file_path = os.path.join(source_path, 'timeseries', 'upgrade=1', 'group0.parquet') + s3_file_path = s3_path + "timeseries/upgrade=1/group0.parquet" + source_file_path = os.path.join( + source_path, "timeseries", "upgrade=1", "group0.parquet" + ) assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) - s3_file_path = s3_path + 'timeseries/_common_metadata' - source_file_path = os.path.join(source_path, 'timeseries', '_common_metadata') + s3_file_path = s3_path + "timeseries/_common_metadata" + source_file_path 
= os.path.join(source_path, "timeseries", "_common_metadata") assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) - s3_file_path = s3_path + 'timeseries/_metadata' - source_file_path = os.path.join(source_path, 'timeseries', '_metadata') + s3_file_path = s3_path + "timeseries/_metadata" + source_file_path = os.path.join(source_path, "timeseries", "_metadata") assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) - s3_file_path = s3_path + 'buildstock_csv/buildstock.csv' + s3_file_path = s3_path + "buildstock_csv/buildstock.csv" source_file_path = str(buildstock_csv_path) assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) - assert len(files_uploaded) == 0, f"These files shouldn't have been uploaded: {files_uploaded}" + assert ( + len(files_uploaded) == 0 + ), f"These files shouldn't have been uploaded: {files_uploaded}" def test_write_parquet_no_index(): - df = pd.DataFrame(np.random.randn(6, 4), columns=list('abcd'), index=np.arange(6)) + df = pd.DataFrame(np.random.randn(6, 4), columns=list("abcd"), index=np.arange(6)) with tempfile.TemporaryDirectory() as tmpdir: fs = LocalFileSystem() - filename = os.path.join(tmpdir, 'df.parquet') + filename = os.path.join(tmpdir, "df.parquet") write_dataframe_as_parquet(df, fs, filename) schema = parquet.read_schema(os.path.join(tmpdir, filename)) - assert '__index_level_0__' not in schema.names + assert "__index_level_0__" not in schema.names assert df.columns.values.tolist() == schema.names def test_skipping_baseline(basic_residential_project_file): - project_filename, results_dir = basic_residential_project_file({ - 'baseline': { - 'skip_sims': True, - 'sampling_algorithm': 'quota' - } - }) + project_filename, results_dir = basic_residential_project_file( + {"baseline": {"skip_sims": True, "sampling_algorithm": "quota"}} + ) - sim_output_path = os.path.join(results_dir, 'simulation_output') - shutil.rmtree(os.path.join(sim_output_path, 'timeseries', 'up00')) # remove timeseries results for baseline + sim_output_path = os.path.join(results_dir, "simulation_output") + shutil.rmtree( + os.path.join(sim_output_path, "timeseries", "up00") + ) # remove timeseries results for baseline # remove results.csv data for baseline from results_jobx.json.gz - results_json_filename = os.path.join(sim_output_path, 'results_job0.json.gz') - with gzip.open(results_json_filename, 'rt', encoding='utf-8') as f: + results_json_filename = os.path.join(sim_output_path, "results_job0.json.gz") + with gzip.open(results_json_filename, "rt", encoding="utf-8") as f: dpouts = json.load(f) - dpouts2 = list(filter(lambda x: x['upgrade'] > 0, dpouts)) - with gzip.open(results_json_filename, 'wt', encoding='utf-8') as f: + dpouts2 = list(filter(lambda x: x["upgrade"] > 0, dpouts)) + with gzip.open(results_json_filename, "wt", encoding="utf-8") as f: json.dump(dpouts2, f) # remove jobs for baseline from jobx.json - with open(os.path.join(results_dir, '..', 'job0.json'), 'rt') as f: + with open(os.path.join(results_dir, "..", "job0.json"), "rt") as f: job_json = json.load(f) - job_json['batch'] = list(filter(lambda job: job[1] is not None, job_json['batch'])) - with open(os.path.join(results_dir, '..', 'job0.json'), 'wt') as f: + job_json["batch"] = list(filter(lambda job: job[1] is not None, job_json["batch"])) + with open(os.path.join(results_dir, "..", "job0.json"), "wt") as f: json.dump(job_json, f) 
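# The stacked patch.object() context managers reflowed throughout this file
# follow the standard unittest.mock pattern; a compact sketch with a
# hypothetical target class (not from this test suite):

from unittest.mock import MagicMock, patch

class Runner:
    results_dir = "/real/path"

with patch.object(Runner, "results_dir", "/tmp/fake"), \
        patch.object(Runner, "get_client", create=True) as client_mock:
    client_mock.return_value = MagicMock()
    assert Runner.results_dir == "/tmp/fake"

assert Runner.results_dir == "/real/path"  # patches are undone on exit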
# run postprocessing - with patch.object(BuildStockBatchBase, 'weather_dir', None), \ - patch.object(BuildStockBatchBase, 'get_dask_client') as get_dask_client_mock, \ - patch.object(BuildStockBatchBase, 'results_dir', results_dir): + with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( + BuildStockBatchBase, "get_dask_client" + ) as get_dask_client_mock, patch.object( + BuildStockBatchBase, "results_dir", results_dir + ): bsb = BuildStockBatchBase(project_filename) bsb.process_results() get_dask_client_mock.assert_called_once() - up00_parquet = os.path.join(results_dir, 'parquet', 'baseline', 'results_up00.parquet') + up00_parquet = os.path.join( + results_dir, "parquet", "baseline", "results_up00.parquet" + ) assert not os.path.exists(up00_parquet) - up01_parquet = os.path.join(results_dir, 'parquet', 'upgrades', 'upgrade=1', 'results_up01.parquet') + up01_parquet = os.path.join( + results_dir, "parquet", "upgrades", "upgrade=1", "results_up01.parquet" + ) assert os.path.exists(up01_parquet) - up00_csv_gz = os.path.join(results_dir, 'results_csvs', 'results_up00.csv.gz') + up00_csv_gz = os.path.join(results_dir, "results_csvs", "results_up00.csv.gz") assert not os.path.exists(up00_csv_gz) - up01_csv_gz = os.path.join(results_dir, 'results_csvs', 'results_up01.csv.gz') + up01_csv_gz = os.path.join(results_dir, "results_csvs", "results_up01.csv.gz") assert os.path.exists(up01_csv_gz) def test_provide_buildstock_csv(basic_residential_project_file, mocker): - buildstock_csv = os.path.join(here, 'buildstock.csv') + buildstock_csv = os.path.join(here, "buildstock.csv") df = read_csv(buildstock_csv, dtype=str) - project_filename, results_dir = basic_residential_project_file({ - 'sampler': { - 'type': 'precomputed', - 'args': { - 'sample_file': buildstock_csv - } - } - }) - mocker.patch.object(LocalBatch, 'weather_dir', None) - mocker.patch.object(LocalBatch, 'results_dir', results_dir) + project_filename, results_dir = basic_residential_project_file( + {"sampler": {"type": "precomputed", "args": {"sample_file": buildstock_csv}}} + ) + mocker.patch.object(LocalBatch, "weather_dir", None) + mocker.patch.object(LocalBatch, "results_dir", results_dir) bsb = LocalBatch(project_filename) sampling_output_csv = bsb.sampler.run_sampling() df2 = read_csv(sampling_output_csv, dtype=str) pd.testing.assert_frame_equal(df, df2) - assert (df['Geometry Shared Walls'] == "None").all() # Verify None is being read properly + assert ( + df["Geometry Shared Walls"] == "None" + ).all() # Verify None is being read properly # Test file missing - with open(project_filename, 'r') as f: + with open(project_filename, "r") as f: cfg = yaml.safe_load(f) - cfg['sampler']['args']['sample_file'] = os.path.join(here, 'non_existant_file.csv') - with open(project_filename, 'w') as f: + cfg["sampler"]["args"]["sample_file"] = os.path.join(here, "non_existant_file.csv") + with open(project_filename, "w") as f: yaml.dump(cfg, f) with pytest.raises(ValidationError, match=r"sample_file doesn't exist"): diff --git a/buildstockbatch/test/test_docker.py b/buildstockbatch/test/test_docker.py index 36f110eb..2a18b605 100644 --- a/buildstockbatch/test/test_docker.py +++ b/buildstockbatch/test/test_docker.py @@ -12,26 +12,27 @@ def test_custom_gem_install(basic_residential_project_file): project_filename, results_dir = basic_residential_project_file() # Add custom_gems to the project file - with open(project_filename, 'r') as f: + with open(project_filename, "r") as f: cfg = yaml.safe_load(f) - 
cfg['baseline']['custom_gems'] = True - with open(project_filename, 'w') as f: + cfg["baseline"]["custom_gems"] = True + with open(project_filename, "w") as f: yaml.dump(cfg, f) - buildstock_directory = cfg['buildstock_directory'] + buildstock_directory = cfg["buildstock_directory"] LocalBatch(project_filename) - bundle_install_log_path = os.path.join(buildstock_directory, - 'resources', - '.custom_gems', - 'bundle_install_output.log') + bundle_install_log_path = os.path.join( + buildstock_directory, "resources", ".custom_gems", "bundle_install_output.log" + ) assert os.path.exists(bundle_install_log_path) os.remove(bundle_install_log_path) - gem_list_log_log_path = os.path.join(buildstock_directory, - 'resources', - '.custom_gems', - 'openstudio_gem_list_output.log') + gem_list_log_log_path = os.path.join( + buildstock_directory, + "resources", + ".custom_gems", + "openstudio_gem_list_output.log", + ) assert os.path.exists(gem_list_log_log_path) os.remove(gem_list_log_log_path) diff --git a/buildstockbatch/test/test_eagle.py b/buildstockbatch/test/test_eagle.py index 91a16da3..c33d8f1e 100644 --- a/buildstockbatch/test/test_eagle.py +++ b/buildstockbatch/test/test_eagle.py @@ -15,258 +15,314 @@ here = os.path.dirname(os.path.abspath(__file__)) -@patch('buildstockbatch.eagle.subprocess') +@patch("buildstockbatch.eagle.subprocess") def test_hpc_run_building(mock_subprocess, monkeypatch, basic_residential_project_file): - tar_filename = pathlib.Path(__file__).resolve().parent / 'test_results' / 'simulation_output' / 'simulations_job0.tar.gz' # noqa E501 - with tarfile.open(tar_filename, 'r') as tarf: - osw_dict = json.loads(tarf.extractfile('up00/bldg0000001/in.osw').read().decode('utf-8')) + tar_filename = ( + pathlib.Path(__file__).resolve().parent + / "test_results" + / "simulation_output" + / "simulations_job0.tar.gz" + ) # noqa E501 + with tarfile.open(tar_filename, "r") as tarf: + osw_dict = json.loads( + tarf.extractfile("up00/bldg0000001/in.osw").read().decode("utf-8") + ) project_filename, results_dir = basic_residential_project_file() tmp_path = pathlib.Path(results_dir).parent - sim_path = tmp_path / 'output' / 'simulation_output' / 'up00' / 'bldg0000001' + sim_path = tmp_path / "output" / "simulation_output" / "up00" / "bldg0000001" os.makedirs(sim_path) cfg = get_project_configuration(project_filename) - with patch.object(EagleBatch, 'weather_dir', None), \ - patch.object(EagleBatch, 'create_osw', return_value=osw_dict), \ - patch.object(EagleBatch, 'make_sim_dir', return_value=('bldg0000001up00', sim_path)), \ - patch.object(EagleBatch, 'local_scratch', tmp_path): + with patch.object(EagleBatch, "weather_dir", None), patch.object( + EagleBatch, "create_osw", return_value=osw_dict + ), patch.object( + EagleBatch, "make_sim_dir", return_value=("bldg0000001up00", sim_path) + ), patch.object( + EagleBatch, "local_scratch", tmp_path + ): # Normal run - run_bldg_args = [ - results_dir, - cfg, - 1, - None - ] + run_bldg_args = [results_dir, cfg, 1, None] EagleBatch.run_building(*run_bldg_args) expected_singularity_args = [ - 'singularity', - 'exec', - '--contain', - '-e', - '--pwd', - '/var/simdata/openstudio', + "singularity", + "exec", + "--contain", + "-e", + "--pwd", + "/var/simdata/openstudio", ] end_expected_singularity_args = [ - str(pathlib.Path('/tmp/scratch/openstudio.simg')), - 'bash', '-x' + str(pathlib.Path("/tmp/scratch/openstudio.simg")), + "bash", + "-x", ] mock_subprocess.run.assert_called_once() args = mock_subprocess.run.call_args[0][0] - for a, b in [args[i:i+2] 
for i in range(6, len(args) - 3, 2)]: - assert a == '-B' + for a, b in [args[i : i + 2] for i in range(6, len(args) - 3, 2)]: + assert a == "-B" drive, tail = os.path.splitdrive(b) - assert tail.split(':')[1] in ( - '/var/simdata/openstudio', - '/lib/resources', - '/lib/housing_characteristics', - '/measures', - '/weather', - '/tmp', + assert tail.split(":")[1] in ( + "/var/simdata/openstudio", + "/lib/resources", + "/lib/housing_characteristics", + "/measures", + "/weather", + "/tmp", ) assert mock_subprocess.run.call_args[0][0][0:6] == expected_singularity_args assert mock_subprocess.run.call_args[0][0][-3:] == end_expected_singularity_args called_kw = mock_subprocess.run.call_args[1] - assert called_kw.get('check') is True - assert 'input' in called_kw - assert 'stdout' in called_kw - assert 'stderr' in called_kw - assert str(called_kw.get('cwd')) == str(pathlib.Path('/tmp/scratch/output')) - assert called_kw['input'].decode('utf-8').find(' --measures_only') == -1 + assert called_kw.get("check") is True + assert "input" in called_kw + assert "stdout" in called_kw + assert "stderr" in called_kw + assert str(called_kw.get("cwd")) == str(pathlib.Path("/tmp/scratch/output")) + assert called_kw["input"].decode("utf-8").find(" --measures_only") == -1 # Measures only run mock_subprocess.reset_mock() shutil.rmtree(sim_path) os.makedirs(sim_path) - monkeypatch.setenv('MEASURESONLY', '1') + monkeypatch.setenv("MEASURESONLY", "1") EagleBatch.run_building(*run_bldg_args) mock_subprocess.run.assert_called_once() assert mock_subprocess.run.call_args[0][0][0:6] == expected_singularity_args assert mock_subprocess.run.call_args[0][0][-3:] == end_expected_singularity_args called_kw = mock_subprocess.run.call_args[1] - assert called_kw.get('check') is True - assert 'input' in called_kw - assert 'stdout' in called_kw - assert 'stderr' in called_kw - assert str(called_kw.get('cwd')) == str(pathlib.Path('/tmp/scratch/output')) - assert called_kw['input'].decode('utf-8').find(' --measures_only') > -1 - - -@patch('buildstockbatch.base.BuildStockBatchBase.validate_options_lookup') -@patch('buildstockbatch.eagle.EagleBatch.validate_output_directory_eagle') -@patch('buildstockbatch.eagle.EagleBatch.validate_singularity_image_eagle') -@patch('buildstockbatch.eagle.subprocess') -def test_user_cli(mock_subprocess, mock_validate_singularity_image_eagle, mock_validate_output_directory_eagle, - mock_validate_options, basic_residential_project_file, monkeypatch): + assert called_kw.get("check") is True + assert "input" in called_kw + assert "stdout" in called_kw + assert "stderr" in called_kw + assert str(called_kw.get("cwd")) == str(pathlib.Path("/tmp/scratch/output")) + assert called_kw["input"].decode("utf-8").find(" --measures_only") > -1 + + +@patch("buildstockbatch.base.BuildStockBatchBase.validate_options_lookup") +@patch("buildstockbatch.eagle.EagleBatch.validate_output_directory_eagle") +@patch("buildstockbatch.eagle.EagleBatch.validate_singularity_image_eagle") +@patch("buildstockbatch.eagle.subprocess") +def test_user_cli( + mock_subprocess, + mock_validate_singularity_image_eagle, + mock_validate_output_directory_eagle, + mock_validate_options, + basic_residential_project_file, + monkeypatch, +): mock_validate_options.return_value = True mock_validate_output_directory_eagle.return_value = True mock_validate_singularity_image_eagle.return_value = True project_filename, results_dir = basic_residential_project_file() shutil.rmtree(results_dir) - monkeypatch.setenv('CONDA_PREFIX', 'something') + 
monkeypatch.setenv("CONDA_PREFIX", "something") argv = [project_filename] user_cli(argv) mock_subprocess.run.assert_called_once() - eagle_sh = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'eagle.sh')) + eagle_sh = os.path.abspath( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "eagle.sh") + ) assert mock_subprocess.run.call_args[0][0][-1] == eagle_sh - assert '--time=20' in mock_subprocess.run.call_args[0][0] - assert '--account=testaccount' in mock_subprocess.run.call_args[0][0] - assert '--nodes=1' in mock_subprocess.run.call_args[0][0] - assert '--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY' in mock_subprocess.run.call_args[0][0] - assert '--output=sampling.out' in mock_subprocess.run.call_args[0][0] - assert '--qos=high' not in mock_subprocess.run.call_args[0][0] - assert '0' == mock_subprocess.run.call_args[1]['env']['MEASURESONLY'] + assert "--time=20" in mock_subprocess.run.call_args[0][0] + assert "--account=testaccount" in mock_subprocess.run.call_args[0][0] + assert "--nodes=1" in mock_subprocess.run.call_args[0][0] + assert ( + "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" + in mock_subprocess.run.call_args[0][0] + ) + assert "--output=sampling.out" in mock_subprocess.run.call_args[0][0] + assert "--qos=high" not in mock_subprocess.run.call_args[0][0] + assert "0" == mock_subprocess.run.call_args[1]["env"]["MEASURESONLY"] mock_subprocess.reset_mock() shutil.rmtree(results_dir) - argv = ['--hipri', project_filename] + argv = ["--hipri", project_filename] user_cli(argv) mock_subprocess.run.assert_called_once() - assert '--time=20' in mock_subprocess.run.call_args[0][0] - assert '--account=testaccount' in mock_subprocess.run.call_args[0][0] - assert '--nodes=1' in mock_subprocess.run.call_args[0][0] - assert '--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY' in mock_subprocess.run.call_args[0][0] - assert '--output=sampling.out' in mock_subprocess.run.call_args[0][0] - assert '--qos=high' in mock_subprocess.run.call_args[0][0] - assert '0' == mock_subprocess.run.call_args[1]['env']['MEASURESONLY'] - assert '0' == mock_subprocess.run.call_args[1]['env']['SAMPLINGONLY'] + assert "--time=20" in mock_subprocess.run.call_args[0][0] + assert "--account=testaccount" in mock_subprocess.run.call_args[0][0] + assert "--nodes=1" in mock_subprocess.run.call_args[0][0] + assert ( + "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" + in mock_subprocess.run.call_args[0][0] + ) + assert "--output=sampling.out" in mock_subprocess.run.call_args[0][0] + assert "--qos=high" in mock_subprocess.run.call_args[0][0] + assert "0" == mock_subprocess.run.call_args[1]["env"]["MEASURESONLY"] + assert "0" == mock_subprocess.run.call_args[1]["env"]["SAMPLINGONLY"] mock_subprocess.reset_mock() shutil.rmtree(results_dir) - argv = ['--measures_only', project_filename] + argv = ["--measures_only", project_filename] user_cli(argv) mock_subprocess.run.assert_called_once() - assert '--time=20' in mock_subprocess.run.call_args[0][0] - assert '--account=testaccount' in mock_subprocess.run.call_args[0][0] - assert '--nodes=1' in mock_subprocess.run.call_args[0][0] - assert '--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY' in mock_subprocess.run.call_args[0][0] - assert '--output=sampling.out' in mock_subprocess.run.call_args[0][0] - assert '--qos=high' not in mock_subprocess.run.call_args[0][0] - assert '1' == mock_subprocess.run.call_args[1]['env']['MEASURESONLY'] - assert '0' == 
mock_subprocess.run.call_args[1]['env']['SAMPLINGONLY'] + assert "--time=20" in mock_subprocess.run.call_args[0][0] + assert "--account=testaccount" in mock_subprocess.run.call_args[0][0] + assert "--nodes=1" in mock_subprocess.run.call_args[0][0] + assert ( + "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" + in mock_subprocess.run.call_args[0][0] + ) + assert "--output=sampling.out" in mock_subprocess.run.call_args[0][0] + assert "--qos=high" not in mock_subprocess.run.call_args[0][0] + assert "1" == mock_subprocess.run.call_args[1]["env"]["MEASURESONLY"] + assert "0" == mock_subprocess.run.call_args[1]["env"]["SAMPLINGONLY"] mock_subprocess.reset_mock() shutil.rmtree(results_dir) - argv = ['--samplingonly', project_filename] + argv = ["--samplingonly", project_filename] user_cli(argv) mock_subprocess.run.assert_called_once() - assert '--time=20' in mock_subprocess.run.call_args[0][0] - assert '--account=testaccount' in mock_subprocess.run.call_args[0][0] - assert '--nodes=1' in mock_subprocess.run.call_args[0][0] - assert '--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY' in mock_subprocess.run.call_args[0][0] - assert '--output=sampling.out' in mock_subprocess.run.call_args[0][0] - assert '--qos=high' not in mock_subprocess.run.call_args[0][0] - assert '1' == mock_subprocess.run.call_args[1]['env']['SAMPLINGONLY'] - assert '0' == mock_subprocess.run.call_args[1]['env']['MEASURESONLY'] - - -@patch('buildstockbatch.eagle.subprocess') -def test_qos_high_job_submit(mock_subprocess, basic_residential_project_file, monkeypatch): - mock_subprocess.run.return_value.stdout = 'Submitted batch job 1\n' + assert "--time=20" in mock_subprocess.run.call_args[0][0] + assert "--account=testaccount" in mock_subprocess.run.call_args[0][0] + assert "--nodes=1" in mock_subprocess.run.call_args[0][0] + assert ( + "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" + in mock_subprocess.run.call_args[0][0] + ) + assert "--output=sampling.out" in mock_subprocess.run.call_args[0][0] + assert "--qos=high" not in mock_subprocess.run.call_args[0][0] + assert "1" == mock_subprocess.run.call_args[1]["env"]["SAMPLINGONLY"] + assert "0" == mock_subprocess.run.call_args[1]["env"]["MEASURESONLY"] + + +@patch("buildstockbatch.eagle.subprocess") +def test_qos_high_job_submit( + mock_subprocess, basic_residential_project_file, monkeypatch +): + mock_subprocess.run.return_value.stdout = "Submitted batch job 1\n" mock_subprocess.PIPE = None project_filename, results_dir = basic_residential_project_file() shutil.rmtree(results_dir) - monkeypatch.setenv('CONDA_PREFIX', 'something') - monkeypatch.setenv('SLURM_JOB_QOS', 'high') + monkeypatch.setenv("CONDA_PREFIX", "something") + monkeypatch.setenv("SLURM_JOB_QOS", "high") - with patch.object(EagleBatch, 'weather_dir', None): + with patch.object(EagleBatch, "weather_dir", None): batch = EagleBatch(project_filename) for i in range(1, 11): - pathlib.Path(results_dir, 'job{:03d}.json'.format(i)).touch() - with open(os.path.join(results_dir, 'job001.json'), 'w') as f: - json.dump({'batch': list(range(100))}, f) + pathlib.Path(results_dir, "job{:03d}.json".format(i)).touch() + with open(os.path.join(results_dir, "job001.json"), "w") as f: + json.dump({"batch": list(range(100))}, f) batch.queue_jobs() mock_subprocess.run.assert_called_once() - assert '--qos=high' in mock_subprocess.run.call_args[0][0] + assert "--qos=high" in mock_subprocess.run.call_args[0][0] mock_subprocess.reset_mock() - mock_subprocess.run.return_value.stdout = 'Submitted batch 
job 1\n' + mock_subprocess.run.return_value.stdout = "Submitted batch job 1\n" mock_subprocess.PIPE = None - with patch.object(EagleBatch, 'weather_dir', None): + with patch.object(EagleBatch, "weather_dir", None): batch = EagleBatch(project_filename) batch.queue_post_processing() mock_subprocess.run.assert_called_once() - assert '--qos=high' in mock_subprocess.run.call_args[0][0] + assert "--qos=high" in mock_subprocess.run.call_args[0][0] -def test_queue_jobs_minutes_per_sim(mocker, basic_residential_project_file, monkeypatch): - mock_subprocess = mocker.patch('buildstockbatch.eagle.subprocess') - mocker.patch.object(EagleBatch, 'weather_dir', None) - mock_subprocess.run.return_value.stdout = 'Submitted batch job 1\n' +def test_queue_jobs_minutes_per_sim( + mocker, basic_residential_project_file, monkeypatch +): + mock_subprocess = mocker.patch("buildstockbatch.eagle.subprocess") + mocker.patch.object(EagleBatch, "weather_dir", None) + mock_subprocess.run.return_value.stdout = "Submitted batch job 1\n" mock_subprocess.PIPE = None - project_filename, results_dir = basic_residential_project_file(update_args={ - 'eagle': { - 'sampling': { - 'time': 20 - }, - 'account': 'testaccount', - 'minutes_per_sim': 0.5 + project_filename, results_dir = basic_residential_project_file( + update_args={ + "eagle": { + "sampling": {"time": 20}, + "account": "testaccount", + "minutes_per_sim": 0.5, + } } - }) + ) shutil.rmtree(results_dir) - monkeypatch.setenv('CONDA_PREFIX', 'something') + monkeypatch.setenv("CONDA_PREFIX", "something") batch = EagleBatch(project_filename) for i in range(1, 11): - pathlib.Path(results_dir, 'job{:03d}.json'.format(i)).touch() - with open(os.path.join(results_dir, 'job001.json'), 'w') as f: - json.dump({'batch': list(range(1000))}, f) + pathlib.Path(results_dir, "job{:03d}.json".format(i)).touch() + with open(os.path.join(results_dir, "job001.json"), "w") as f: + json.dump({"batch": list(range(1000))}, f) batch.queue_jobs() mock_subprocess.run.assert_called_once() - assert '--time=14' in mock_subprocess.run.call_args[0][0] + assert "--time=14" in mock_subprocess.run.call_args[0][0] -def test_run_building_process(mocker, basic_residential_project_file): +def test_run_building_process(mocker, basic_residential_project_file): project_filename, results_dir = basic_residential_project_file(raw=True) results_dir = pathlib.Path(results_dir) job_json = { - 'job_num': 1, - 'batch': [(1, 0), (2, 0), (3, 0), (4, 0), (1, None), (2, None), (3, None), (4, None)], - 'n_datapoints': 8 + "job_num": 1, + "batch": [ + (1, 0), + (2, 0), + (3, 0), + (4, 0), + (1, None), + (2, None), + (3, None), + (4, None), + ], + "n_datapoints": 8, } - with open(results_dir / 'job001.json', 'w') as f: + with open(results_dir / "job001.json", "w") as f: json.dump(job_json, f) - sample_buildstock_csv = pd.DataFrame.from_records([{'Building': i, 'Dummy Column': i*i} for i in range(10)]) - os.makedirs(results_dir / 'housing_characteristics', exist_ok=True) - os.makedirs(results_dir / 'weather', exist_ok=True) - sample_buildstock_csv.to_csv(results_dir / 'housing_characteristics' / 'buildstock.csv', index=False) + sample_buildstock_csv = pd.DataFrame.from_records( + [{"Building": i, "Dummy Column": i * i} for i in range(10)] + ) + os.makedirs(results_dir / "housing_characteristics", exist_ok=True) + os.makedirs(results_dir / "weather", exist_ok=True) + sample_buildstock_csv.to_csv( + results_dir / "housing_characteristics" / "buildstock.csv", index=False + ) def sequential_parallel(**kwargs): kw2 = kwargs.copy() - 
kw2['n_jobs'] = 1 + kw2["n_jobs"] = 1 return joblib.Parallel(**kw2) - mocker.patch('buildstockbatch.eagle.shutil.copy2') - mocker.patch('buildstockbatch.eagle.Parallel', sequential_parallel) - mocker.patch('buildstockbatch.eagle.subprocess') - - mocker.patch.object(EagleBatch, 'local_buildstock_dir', results_dir / 'local_buildstock_dir') - mocker.patch.object(EagleBatch, 'local_weather_dir', results_dir / 'local_weather_dir') - mocker.patch.object(EagleBatch, 'local_output_dir', results_dir) - mocker.patch.object(EagleBatch, 'local_housing_characteristics_dir', - results_dir / 'local_housing_characteristics_dir') - mocker.patch.object(EagleBatch, 'results_dir', results_dir) - mocker.patch.object(EagleBatch, 'local_scratch', results_dir.parent) + mocker.patch("buildstockbatch.eagle.shutil.copy2") + mocker.patch("buildstockbatch.eagle.Parallel", sequential_parallel) + mocker.patch("buildstockbatch.eagle.subprocess") + + mocker.patch.object( + EagleBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir" + ) + mocker.patch.object( + EagleBatch, "local_weather_dir", results_dir / "local_weather_dir" + ) + mocker.patch.object(EagleBatch, "local_output_dir", results_dir) + mocker.patch.object( + EagleBatch, + "local_housing_characteristics_dir", + results_dir / "local_housing_characteristics_dir", + ) + mocker.patch.object(EagleBatch, "results_dir", results_dir) + mocker.patch.object(EagleBatch, "local_scratch", results_dir.parent) def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=False): real_upgrade_idx = 0 if upgrade_idx is None else upgrade_idx + 1 - sim_id = f'bldg{building_id:07d}up{real_upgrade_idx:02d}' - sim_dir = os.path.join(base_dir, f'up{real_upgrade_idx:02d}', f'bldg{building_id:07d}') + sim_id = f"bldg{building_id:07d}up{real_upgrade_idx:02d}" + sim_dir = os.path.join( + base_dir, f"up{real_upgrade_idx:02d}", f"bldg{building_id:07d}" + ) return sim_id, sim_dir - mocker.patch.object(EagleBatch, 'make_sim_dir', make_sim_dir_mock) - sampler_prop_mock = mocker.patch.object(EagleBatch, 'sampler', new_callable=mocker.PropertyMock) + mocker.patch.object(EagleBatch, "make_sim_dir", make_sim_dir_mock) + sampler_prop_mock = mocker.patch.object( + EagleBatch, "sampler", new_callable=mocker.PropertyMock + ) sampler_mock = mocker.MagicMock() sampler_prop_mock.return_value = sampler_mock - sampler_mock.csv_path = results_dir.parent / 'housing_characteristic2' / 'buildstock.csv' - sampler_mock.run_sampling = mocker.MagicMock(return_value='buildstock.csv') + sampler_mock.csv_path = ( + results_dir.parent / "housing_characteristic2" / "buildstock.csv" + ) + sampler_mock.run_sampling = mocker.MagicMock(return_value="buildstock.csv") b = EagleBatch(project_filename) b.run_batch(sampling_only=True) # so the directories can be created @@ -274,33 +330,60 @@ def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=Fal b.run_job_batch(1) # check results job-json - refrence_path = pathlib.Path(__file__).resolve().parent / 'test_results' / 'reference_files' + refrence_path = ( + pathlib.Path(__file__).resolve().parent / "test_results" / "reference_files" + ) - refrence_list = json.loads(gzip.open(refrence_path / 'results_job1.json.gz', 'r').read()) + refrence_list = json.loads( + gzip.open(refrence_path / "results_job1.json.gz", "r").read() + ) - output_list = json.loads(gzip.open(results_dir / 'simulation_output' / 'results_job1.json.gz', 'r').read()) + output_list = json.loads( + gzip.open( + results_dir / "simulation_output" / 
"results_job1.json.gz", "r" + ).read() + ) refrence_list = [json.dumps(d) for d in refrence_list] output_list = [json.dumps(d) for d in output_list] assert sorted(refrence_list) == sorted(output_list) - ts_files = list(refrence_path.glob('**/*.parquet')) + ts_files = list(refrence_path.glob("**/*.parquet")) def compare_ts_parquets(source, dst): - test_pq = pd.read_parquet(source).reset_index().drop(columns=['index']).rename(columns=str.lower) - reference_pq = pd.read_parquet(dst).reset_index().drop(columns=['index']).rename(columns=str.lower) + test_pq = ( + pd.read_parquet(source) + .reset_index() + .drop(columns=["index"]) + .rename(columns=str.lower) + ) + reference_pq = ( + pd.read_parquet(dst) + .reset_index() + .drop(columns=["index"]) + .rename(columns=str.lower) + ) pd.testing.assert_frame_equal(test_pq, reference_pq) for file in ts_files: - results_file = results_dir / 'results' / 'simulation_output' / 'timeseries' / file.parent.name / file.name + results_file = ( + results_dir + / "results" + / "simulation_output" + / "timeseries" + / file.parent.name + / file.name + ) compare_ts_parquets(file, results_file) # Check that buildstock.csv was trimmed properly - local_buildstock_df = read_csv(results_dir / 'local_housing_characteristics_dir' / 'buildstock.csv', dtype=str) - unique_buildings = {str(x[0]) for x in job_json['batch']} + local_buildstock_df = read_csv( + results_dir / "local_housing_characteristics_dir" / "buildstock.csv", dtype=str + ) + unique_buildings = {str(x[0]) for x in job_json["batch"]} assert len(unique_buildings) == len(local_buildstock_df) - assert unique_buildings == set(local_buildstock_df['Building']) + assert unique_buildings == set(local_buildstock_df["Building"]) def test_run_building_error_caught(mocker, basic_residential_project_file): @@ -308,76 +391,85 @@ def test_run_building_error_caught(mocker, basic_residential_project_file): project_filename, results_dir = basic_residential_project_file() results_dir = pathlib.Path(results_dir) - job_json = { - 'job_num': 1, - 'batch': [(1, 0)], - 'n_datapoints': 1 - } - with open(results_dir / 'job001.json', 'w') as f: + job_json = {"job_num": 1, "batch": [(1, 0)], "n_datapoints": 1} + with open(results_dir / "job001.json", "w") as f: json.dump(job_json, f) - sample_buildstock_csv = pd.DataFrame.from_records([{'Building': i, 'Dummy Column': i * i} for i in range(10)]) - os.makedirs(results_dir / 'housing_characteristics', exist_ok=True) - os.makedirs(results_dir / 'local_housing_characteristics', exist_ok=True) - os.makedirs(results_dir / 'weather', exist_ok=True) - sample_buildstock_csv.to_csv(results_dir / 'housing_characteristics' / 'buildstock.csv', index=False) + sample_buildstock_csv = pd.DataFrame.from_records( + [{"Building": i, "Dummy Column": i * i} for i in range(10)] + ) + os.makedirs(results_dir / "housing_characteristics", exist_ok=True) + os.makedirs(results_dir / "local_housing_characteristics", exist_ok=True) + os.makedirs(results_dir / "weather", exist_ok=True) + sample_buildstock_csv.to_csv( + results_dir / "housing_characteristics" / "buildstock.csv", index=False + ) def raise_error(*args, **kwargs): - raise RuntimeError('A problem happened') + raise RuntimeError("A problem happened") def sequential_parallel(**kwargs): kw2 = kwargs.copy() - kw2['n_jobs'] = 1 + kw2["n_jobs"] = 1 return joblib.Parallel(**kw2) - mocker.patch('buildstockbatch.eagle.shutil.copy2') - mocker.patch('buildstockbatch.eagle.Parallel', sequential_parallel) - mocker.patch('buildstockbatch.eagle.subprocess') - - 
mocker.patch.object(EagleBatch, 'run_building', raise_error) - mocker.patch.object(EagleBatch, 'local_output_dir', results_dir) - mocker.patch.object(EagleBatch, 'results_dir', results_dir) - mocker.patch.object(EagleBatch, 'local_buildstock_dir', results_dir / 'local_buildstock_dir') - mocker.patch.object(EagleBatch, 'local_weather_dir', results_dir / 'local_weather_dir') - mocker.patch.object(EagleBatch, 'local_housing_characteristics_dir', - results_dir / 'local_housing_characteristics_dir') + mocker.patch("buildstockbatch.eagle.shutil.copy2") + mocker.patch("buildstockbatch.eagle.Parallel", sequential_parallel) + mocker.patch("buildstockbatch.eagle.subprocess") + + mocker.patch.object(EagleBatch, "run_building", raise_error) + mocker.patch.object(EagleBatch, "local_output_dir", results_dir) + mocker.patch.object(EagleBatch, "results_dir", results_dir) + mocker.patch.object( + EagleBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir" + ) + mocker.patch.object( + EagleBatch, "local_weather_dir", results_dir / "local_weather_dir" + ) + mocker.patch.object( + EagleBatch, + "local_housing_characteristics_dir", + results_dir / "local_housing_characteristics_dir", + ) b = EagleBatch(project_filename) b.run_job_batch(1) - traceback_file = results_dir / 'simulation_output' / 'traceback1.out' + traceback_file = results_dir / "simulation_output" / "traceback1.out" assert traceback_file.exists() - with open(traceback_file, 'r') as f: - assert f.read().find('RuntimeError') > -1 + with open(traceback_file, "r") as f: + assert f.read().find("RuntimeError") > -1 def test_rerun_failed_jobs(mocker, basic_residential_project_file): project_filename, results_dir = basic_residential_project_file() - os.makedirs(os.path.join(results_dir, 'results_csvs')) - os.makedirs(os.path.join(results_dir, 'parquet')) - mocker.patch.object(EagleBatch, 'weather_dir', None) - mocker.patch.object(EagleBatch, 'results_dir', results_dir) - process_results_mocker = mocker.patch.object(BuildStockBatchBase, 'process_results') - queue_jobs_mocker = mocker.patch.object(EagleBatch, 'queue_jobs', return_value=[42]) - queue_post_processing_mocker = mocker.patch.object(EagleBatch, 'queue_post_processing') + os.makedirs(os.path.join(results_dir, "results_csvs")) + os.makedirs(os.path.join(results_dir, "parquet")) + mocker.patch.object(EagleBatch, "weather_dir", None) + mocker.patch.object(EagleBatch, "results_dir", results_dir) + process_results_mocker = mocker.patch.object(BuildStockBatchBase, "process_results") + queue_jobs_mocker = mocker.patch.object(EagleBatch, "queue_jobs", return_value=[42]) + queue_post_processing_mocker = mocker.patch.object( + EagleBatch, "queue_post_processing" + ) b = EagleBatch(project_filename) for job_id in range(1, 6): json_filename = os.path.join(b.output_dir, f"job{job_id:03d}.json") - with open(json_filename, 'w') as f: + with open(json_filename, "w") as f: json.dump({}, f) if job_id == 5: continue out_filename = os.path.join(b.output_dir, f"job.out-{job_id}") with open(out_filename, "w") as f: - f.write('lots of output\ngoes\nhere\n') + f.write("lots of output\ngoes\nhere\n") if job_id % 2 == 0: f.write("Traceback") else: f.write("batch complete") - f.write('\n') + f.write("\n") failed_array_ids = b.get_failed_job_array_ids() assert sorted(failed_array_ids) == [2, 4, 5] @@ -391,16 +483,16 @@ def test_rerun_failed_jobs(mocker, basic_residential_project_file): queue_jobs_mocker.reset_mock() queue_post_processing_mocker.assert_called_once_with([42], hipri=False) 
queue_post_processing_mocker.reset_mock() - assert not os.path.exists(os.path.join(results_dir, 'results_csvs')) - assert not os.path.exists(os.path.join(results_dir, 'parquet')) + assert not os.path.exists(os.path.join(results_dir, "results_csvs")) + assert not os.path.exists(os.path.join(results_dir, "parquet")) for job_id in range(1, 6): json_filename = os.path.join(b.output_dir, f"job{job_id:03d}.json") - with open(json_filename, 'w') as f: + with open(json_filename, "w") as f: json.dump({}, f) out_filename = os.path.join(b.output_dir, f"job.out-{job_id}") with open(out_filename, "w") as f: - f.write('lots of output\ngoes\nhere\n') + f.write("lots of output\ngoes\nhere\n") f.write("batch complete\n") b.process_results() diff --git a/buildstockbatch/test/test_local.py b/buildstockbatch/test/test_local.py index 7778fe6b..5bc7d5ce 100644 --- a/buildstockbatch/test/test_local.py +++ b/buildstockbatch/test/test_local.py @@ -9,15 +9,22 @@ from buildstockbatch.local import LocalBatch from buildstockbatch.utils import get_project_configuration -from buildstockbatch.test.shared_testing_stuff import resstock_directory, resstock_required - - -@pytest.mark.parametrize("project_filename", [ - resstock_directory / "project_national" / "national_baseline.yml", - resstock_directory / "project_national" / "national_upgrades.yml", - resstock_directory / "project_testing" / "testing_baseline.yml", - resstock_directory / "project_testing" / "testing_upgrades.yml", -], ids=lambda x: x.stem) +from buildstockbatch.test.shared_testing_stuff import ( + resstock_directory, + resstock_required, +) + + +@pytest.mark.parametrize( + "project_filename", + [ + resstock_directory / "project_national" / "national_baseline.yml", + resstock_directory / "project_national" / "national_upgrades.yml", + resstock_directory / "project_testing" / "testing_baseline.yml", + resstock_directory / "project_testing" / "testing_upgrades.yml", + ], + ids=lambda x: x.stem, +) @resstock_required def test_resstock_local_batch(project_filename): LocalBatch.validate_project(str(project_filename)) @@ -37,7 +44,11 @@ def test_resstock_local_batch(project_filename): n_datapoints = 2 batch.cfg["sampler"]["args"]["n_datapoints"] = n_datapoints - local_weather_file = resstock_directory.parent / "weather" / batch.cfg["weather_files_url"].split("/")[-1] + local_weather_file = ( + resstock_directory.parent + / "weather" + / batch.cfg["weather_files_url"].split("/")[-1] + ) if local_weather_file.exists(): del batch.cfg["weather_files_url"] batch.cfg["weather_files_path"] = str(local_weather_file) @@ -52,7 +63,12 @@ def test_resstock_local_batch(project_filename): for upgrade_id in range(0, n_upgrades + 1): for bldg_id in range(1, n_datapoints + 1): - assert (simout_path / "timeseries" / f"up{upgrade_id:02d}" / f"bldg{bldg_id:07d}.parquet").exists() + assert ( + simout_path + / "timeseries" + / f"up{upgrade_id:02d}" + / f"bldg{bldg_id:07d}.parquet" + ).exists() batch.process_results() @@ -67,9 +83,17 @@ def test_resstock_local_batch(project_filename): ts_pq_path = out_path / "parquet" / "timeseries" for upgrade_id in range(0, n_upgrades + 1): assert (ts_pq_path / f"upgrade={upgrade_id}" / "group0.parquet").exists() - assert (out_path / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz").exists() + assert ( + out_path / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz" + ).exists() if upgrade_id >= 1: - upg_pq = out_path / "parquet" / "upgrades" / f"upgrade={upgrade_id}" / f"results_up{upgrade_id:02d}.parquet" + upg_pq = ( + out_path + / 
"parquet" + / "upgrades" + / f"upgrade={upgrade_id}" + / f"results_up{upgrade_id:02d}.parquet" + ) assert upg_pq.exists() upg = pd.read_parquet(upg_pq, columns=["completed_status"]) assert (upg["completed_status"] == "Success").all() @@ -87,11 +111,13 @@ def mocked_subprocess_run(run_cmd, **kwargs): assert "timeout" in kwargs.keys() raise subprocess.TimeoutExpired(run_cmd, kwargs["timeout"]) - mocker.patch('buildstockbatch.local.subprocess.run', mocked_subprocess_run) - sleep_mock = mocker.patch('buildstockbatch.local.time.sleep') + mocker.patch("buildstockbatch.local.subprocess.run", mocked_subprocess_run) + sleep_mock = mocker.patch("buildstockbatch.local.time.sleep") - cfg = get_project_configuration(resstock_directory / "project_national" / "national_baseline.yml") - cfg['max_minutes_per_sim'] = 5 + cfg = get_project_configuration( + resstock_directory / "project_national" / "national_baseline.yml" + ) + cfg["max_minutes_per_sim"] = 5 with tempfile.TemporaryDirectory() as tmpdir: LocalBatch.run_building( @@ -99,16 +125,16 @@ def mocked_subprocess_run(run_cmd, **kwargs): str(resstock_directory / "weather"), tmpdir, measures_only=False, - n_datapoints=cfg['sampler']['args']['n_datapoints'], + n_datapoints=cfg["sampler"]["args"]["n_datapoints"], cfg=cfg, - i=1 + i=1, ) - sim_path = pathlib.Path(tmpdir, 'simulation_output', 'up00', 'bldg0000001') + sim_path = pathlib.Path(tmpdir, "simulation_output", "up00", "bldg0000001") assert sim_path.is_dir() msg_re = re.compile(r"Terminated \w+ after reaching max time") - with open(sim_path / 'openstudio_output.log', 'r') as f: + with open(sim_path / "openstudio_output.log", "r") as f: os_output = f.read() assert msg_re.search(os_output) @@ -119,10 +145,12 @@ def mocked_subprocess_run(run_cmd, **kwargs): assert out_osw["completed_status"] == "Fail" assert msg_re.search(out_osw["timeout"]) - err_log_re = re.compile(r"\[\d\d:\d\d:\d\d ERROR\] Terminated \w+ after reaching max time") - with open(sim_path / 'run' / 'run.log', 'r') as run_log: + err_log_re = re.compile( + r"\[\d\d:\d\d:\d\d ERROR\] Terminated \w+ after reaching max time" + ) + with open(sim_path / "run" / "run.log", "r") as run_log: err_log_re.search(run_log.read()) - with open(sim_path / 'run' / 'failed.job', 'r') as failed_job: + with open(sim_path / "run" / "failed.job", "r") as failed_job: err_log_re.search(failed_job.read()) sleep_mock.assert_called_once_with(20) diff --git a/buildstockbatch/test/test_postprocessing.py b/buildstockbatch/test/test_postprocessing.py index 7e6e2b6e..d11fdb86 100644 --- a/buildstockbatch/test/test_postprocessing.py +++ b/buildstockbatch/test/test_postprocessing.py @@ -18,38 +18,37 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): - reporting_measures = [ - 'ReportingMeasure1', - 'ReportingMeasure2' - ] - project_filename, results_dir = basic_residential_project_file({ - 'reporting_measures': reporting_measures - }) + reporting_measures = ["ReportingMeasure1", "ReportingMeasure2"] + project_filename, results_dir = basic_residential_project_file( + {"reporting_measures": reporting_measures} + ) fs = LocalFileSystem() results_dir = pathlib.Path(results_dir) - sim_out_dir = results_dir / 'simulation_output' - with tarfile.open(sim_out_dir / 'simulations_job0.tar.gz', 'r') as tarf: + sim_out_dir = results_dir / "simulation_output" + with tarfile.open(sim_out_dir / "simulations_job0.tar.gz", "r") as tarf: tarf.extractall(sim_out_dir) dpouts2 = [] - for filename in sim_out_dir.rglob('data_point_out.json'): - with 
filename.open('rt', encoding='utf-8') as f: + for filename in sim_out_dir.rglob("data_point_out.json"): + with filename.open("rt", encoding="utf-8") as f: dpout = json.load(f) - dpout['ReportingMeasure1'] = {'column_1': 1, 'column_2': 2} - dpout['ReportingMeasure2'] = {'column_3': 3, 'column_4': 4} - with filename.open('wt', encoding='utf-8') as f: + dpout["ReportingMeasure1"] = {"column_1": 1, "column_2": 2} + dpout["ReportingMeasure2"] = {"column_3": 3, "column_4": 4} + with filename.open("wt", encoding="utf-8") as f: json.dump(dpout, f) sim_dir = str(filename.parent.parent) - upgrade_id = int(re.search(r'up(\d+)', sim_dir).group(1)) - building_id = int(re.search(r'bldg(\d+)', sim_dir).group(1)) + upgrade_id = int(re.search(r"up(\d+)", sim_dir).group(1)) + building_id = int(re.search(r"bldg(\d+)", sim_dir).group(1)) dpouts2.append( - postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, building_id) + postprocessing.read_simulation_outputs( + fs, reporting_measures, sim_dir, upgrade_id, building_id + ) ) - with gzip.open(sim_out_dir / 'results_job0.json.gz', 'wt', encoding='utf-8') as f: + with gzip.open(sim_out_dir / "results_job0.json.gz", "wt", encoding="utf-8") as f: json.dump(dpouts2, f) cfg = get_project_configuration(project_filename) @@ -57,11 +56,13 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False) for upgrade_id in (0, 1): - df = read_csv(str(results_dir / 'results_csvs' / f'results_up{upgrade_id:02d}.csv.gz')) - assert (df['reporting_measure1.column_1'] == 1).all() - assert (df['reporting_measure1.column_2'] == 2).all() - assert (df['reporting_measure2.column_3'] == 3).all() - assert (df['reporting_measure2.column_4'] == 4).all() + df = read_csv( + str(results_dir / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz") + ) + assert (df["reporting_measure1.column_1"] == 1).all() + assert (df["reporting_measure1.column_2"] == 2).all() + assert (df["reporting_measure2.column_3"] == 3).all() + assert (df["reporting_measure2.column_4"] == 4).all() def test_empty_results_assertion(basic_residential_project_file, capsys): @@ -69,11 +70,13 @@ def test_empty_results_assertion(basic_residential_project_file, capsys): fs = LocalFileSystem() results_dir = pathlib.Path(results_dir) - sim_out_dir = results_dir / 'simulation_output' + sim_out_dir = results_dir / "simulation_output" shutil.rmtree(sim_out_dir) # no results cfg = get_project_configuration(project_filename) - with pytest.raises(ValueError, match=r'No simulation results found to post-process'): + with pytest.raises( + ValueError, match=r"No simulation results found to post-process" + ): assert postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False) @@ -83,51 +86,54 @@ def test_large_parquet_combine(basic_residential_project_file): project_filename, results_dir = basic_residential_project_file() - with patch.object(BuildStockBatchBase, 'weather_dir', None), \ - patch.object(BuildStockBatchBase, 'get_dask_client'), \ - patch.object(BuildStockBatchBase, 'results_dir', results_dir),\ - patch.object(postprocessing, 'MAX_PARQUET_MEMORY', 1): # set the max memory to just 1MB + with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( + BuildStockBatchBase, "get_dask_client" + ), patch.object(BuildStockBatchBase, "results_dir", results_dir), patch.object( + postprocessing, "MAX_PARQUET_MEMORY", 1 + ): # set the max memory to just 1MB bsb = 
BuildStockBatchBase(project_filename) bsb.process_results() # this would raise exception if the postprocessing could not handle the situation -@pytest.mark.parametrize('keep_individual_timeseries', [True, False]) -def test_keep_individual_timeseries(keep_individual_timeseries, basic_residential_project_file, mocker): - project_filename, results_dir = basic_residential_project_file({ - 'postprocessing': { - 'keep_individual_timeseries': keep_individual_timeseries - } - }) +@pytest.mark.parametrize("keep_individual_timeseries", [True, False]) +def test_keep_individual_timeseries( + keep_individual_timeseries, basic_residential_project_file, mocker +): + project_filename, results_dir = basic_residential_project_file( + {"postprocessing": {"keep_individual_timeseries": keep_individual_timeseries}} + ) - mocker.patch.object(BuildStockBatchBase, 'weather_dir', None) - mocker.patch.object(BuildStockBatchBase, 'get_dask_client') - mocker.patch.object(BuildStockBatchBase, 'results_dir', results_dir) + mocker.patch.object(BuildStockBatchBase, "weather_dir", None) + mocker.patch.object(BuildStockBatchBase, "get_dask_client") + mocker.patch.object(BuildStockBatchBase, "results_dir", results_dir) bsb = BuildStockBatchBase(project_filename) bsb.process_results() results_path = pathlib.Path(results_dir) - simout_path = results_path / 'simulation_output' - assert len(list(simout_path.glob('results_job*.json.gz'))) == 0 + simout_path = results_path / "simulation_output" + assert len(list(simout_path.glob("results_job*.json.gz"))) == 0 - ts_path = simout_path / 'timeseries' + ts_path = simout_path / "timeseries" assert ts_path.exists() == keep_individual_timeseries def test_upgrade_missing_ts(basic_residential_project_file, mocker, caplog): - caplog.set_level(logging.WARNING, logger='buildstockbatch.postprocessing') + caplog.set_level(logging.WARNING, logger="buildstockbatch.postprocessing") project_filename, results_dir = basic_residential_project_file() results_path = pathlib.Path(results_dir) - for filename in (results_path / 'simulation_output' / 'timeseries' / 'up01').glob('*.parquet'): + for filename in (results_path / "simulation_output" / "timeseries" / "up01").glob( + "*.parquet" + ): os.remove(filename) - mocker.patch.object(BuildStockBatchBase, 'weather_dir', None) - mocker.patch.object(BuildStockBatchBase, 'get_dask_client') - mocker.patch.object(BuildStockBatchBase, 'results_dir', results_dir) + mocker.patch.object(BuildStockBatchBase, "weather_dir", None) + mocker.patch.object(BuildStockBatchBase, "get_dask_client") + mocker.patch.object(BuildStockBatchBase, "results_dir", results_dir) bsb = BuildStockBatchBase(project_filename) bsb.process_results() assert len(caplog.records) == 1 record = caplog.records[0] - assert record.levelname == 'WARNING' - assert record.message == 'There are no timeseries files for upgrade1.' + assert record.levelname == "WARNING" + assert record.message == "There are no timeseries files for upgrade1." 
diff --git a/buildstockbatch/test/test_utils.py b/buildstockbatch/test/test_utils.py index 62c5d215..096d5f81 100644 --- a/buildstockbatch/test/test_utils.py +++ b/buildstockbatch/test/test_utils.py @@ -5,21 +5,32 @@ def test_str_repr(): - test_obj = [{1, 2, 3, 4, 5, 6}, {"List1": ["Item1", ('a', 'b', 'c', 'd'), "item3"], - "long_name_list": ["long_name_one_two_three", "long_name"], - "dict": {"key1": ["List_item1", "List_item2", "List_item3"], "Key2": "value2", - "key3": "value3", "key4": "val4"}}] + test_obj = [ + {1, 2, 3, 4, 5, 6}, + { + "List1": ["Item1", ("a", "b", "c", "d"), "item3"], + "long_name_list": ["long_name_one_two_three", "long_name"], + "dict": { + "key1": ["List_item1", "List_item2", "List_item3"], + "Key2": "value2", + "key3": "value3", + "key4": "val4", + }, + }, + ] gen_repr = _str_repr(test_obj, list_max=2, dict_max=3, string_max=10) - true_repr = "[{'1','2','3' ...6},{'List1': ['Item1',('a','b' ...4) ...3],'long_...14..._list': ['long_...23..."\ - "three','long_name'],'dict': {'key1': ['List_item1','List_item2' ...3],'Key2': 'value2',"\ - "'key3': 'value3' ...4}}]" + true_repr = ( + "[{'1','2','3' ...6},{'List1': ['Item1',('a','b' ...4) ...3],'long_...14..._list': ['long_...23..." + "three','long_name'],'dict': {'key1': ['List_item1','List_item2' ...3],'Key2': 'value2'," + "'key3': 'value3' ...4}}]" + ) assert true_repr == gen_repr def test_get_error_details(): - tf = tempfile.NamedTemporaryFile('w+', delete=False) + tf = tempfile.NamedTemporaryFile("w+", delete=False) tf.close() @log_error_details(tf.name) @@ -40,7 +51,7 @@ def failing_function2(arg2): failing_function1("my_arg1") assert "actual dummy exception" in str(ex_info.value) - with open(tf.name, 'r') as f: + with open(tf.name, "r") as f: error_log = f.read() assert "'arg1':'my_arg1'" in error_log assert "'level_1_string':'string1_my_arg1'" in error_log diff --git a/buildstockbatch/test/test_validation.py b/buildstockbatch/test/test_validation.py index b318bb75..e2c849e6 100644 --- a/buildstockbatch/test/test_validation.py +++ b/buildstockbatch/test/test_validation.py @@ -21,7 +21,10 @@ from buildstockbatch.aws.aws import AwsBatch from buildstockbatch.local import LocalBatch from buildstockbatch.base import BuildStockBatchBase, ValidationError -from buildstockbatch.test.shared_testing_stuff import resstock_directory, resstock_required +from buildstockbatch.test.shared_testing_stuff import ( + resstock_directory, + resstock_required, +) from buildstockbatch.utils import get_project_configuration from unittest.mock import patch from testfixtures import LogCapture @@ -30,12 +33,14 @@ import yaml here = os.path.dirname(os.path.abspath(__file__)) -example_yml_dir = os.path.join(here, 'test_inputs') -resources_dir = os.path.join(here, 'test_inputs', 'test_openstudio_buildstock', 'resources') +example_yml_dir = os.path.join(here, "test_inputs") +resources_dir = os.path.join( + here, "test_inputs", "test_openstudio_buildstock", "resources" +) def filter_logs(logs, level): - filtered_logs = '' + filtered_logs = "" for record in logs.records: if record.levelname == level: filtered_logs += record.msg @@ -64,31 +69,41 @@ def test_aws_batch_validation_is_static(): def test_complete_schema_passes_validation(): - assert BuildStockBatchBase.validate_project_schema(os.path.join(example_yml_dir, 'complete-schema.yml')) + assert BuildStockBatchBase.validate_project_schema( + os.path.join(example_yml_dir, "complete-schema.yml") + ) def test_minimal_schema_passes_validation(): - assert 
BuildStockBatchBase.validate_project_schema(os.path.join(example_yml_dir, 'minimal-schema.yml')) - - -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'missing-required-schema.yml'), - os.path.join(example_yml_dir, 'missing-nested-required-schema.yml') -]) + assert BuildStockBatchBase.validate_project_schema( + os.path.join(example_yml_dir, "minimal-schema.yml") + ) + + +@pytest.mark.parametrize( + "project_file", + [ + os.path.join(example_yml_dir, "missing-required-schema.yml"), + os.path.join(example_yml_dir, "missing-nested-required-schema.yml"), + ], +) def test_missing_required_key_fails(project_file): # patch the validate_options_lookup function to always return true for this case - with patch.object(BuildStockBatchBase, 'validate_options_lookup', lambda _: True): + with patch.object(BuildStockBatchBase, "validate_options_lookup", lambda _: True): with pytest.raises(ValueError): BuildStockBatchBase.validate_project_schema(project_file) -@pytest.mark.parametrize("project_file,expected", [ - (os.path.join(example_yml_dir, 'enforce-schema-xor.yml'), ValidationError), - (os.path.join(example_yml_dir, 'enforce-schema-xor-and-passes.yml'), True), -]) +@pytest.mark.parametrize( + "project_file,expected", + [ + (os.path.join(example_yml_dir, "enforce-schema-xor.yml"), ValidationError), + (os.path.join(example_yml_dir, "enforce-schema-xor-and-passes.yml"), True), + ], +) def test_xor_violations_fail(project_file, expected): # patch the validate_options_lookup function to always return true for this case - with patch.object(BuildStockBatchBase, 'validate_options_lookup', lambda _: True): + with patch.object(BuildStockBatchBase, "validate_options_lookup", lambda _: True): if expected is not True: with pytest.raises(expected): BuildStockBatchBase.validate_xor_nor_schema_keys(project_file) @@ -96,21 +111,45 @@ def test_xor_violations_fail(project_file, expected): assert BuildStockBatchBase.validate_xor_nor_schema_keys(project_file) -@pytest.mark.parametrize("project_file, base_expected, eagle_expected", [ - (os.path.join(example_yml_dir, 'missing-required-schema.yml'), ValueError, ValueError), - (os.path.join(example_yml_dir, 'missing-nested-required-schema.yml'), ValueError, ValueError), - (os.path.join(example_yml_dir, 'enforce-schema-xor.yml'), ValidationError, ValidationError), - (os.path.join(example_yml_dir, 'complete-schema.yml'), True, True), - (os.path.join(example_yml_dir, 'minimal-schema.yml'), True, ValidationError) -]) +@pytest.mark.parametrize( + "project_file, base_expected, eagle_expected", + [ + ( + os.path.join(example_yml_dir, "missing-required-schema.yml"), + ValueError, + ValueError, + ), + ( + os.path.join(example_yml_dir, "missing-nested-required-schema.yml"), + ValueError, + ValueError, + ), + ( + os.path.join(example_yml_dir, "enforce-schema-xor.yml"), + ValidationError, + ValidationError, + ), + (os.path.join(example_yml_dir, "complete-schema.yml"), True, True), + (os.path.join(example_yml_dir, "minimal-schema.yml"), True, ValidationError), + ], +) def test_validation_integration(project_file, base_expected, eagle_expected): # patch the validate_options_lookup function to always return true for this case - with patch.object(BuildStockBatchBase, 'validate_options_lookup', lambda _: True), \ - patch.object(BuildStockBatchBase, 'validate_measure_references', lambda _: True), \ - patch.object(BuildStockBatchBase, 'validate_workflow_generator', lambda _: True), \ - patch.object(BuildStockBatchBase, 'validate_postprocessing_spec', lambda _: True), \ 
- patch.object(EagleBatch, 'validate_singularity_image_eagle', lambda _: True): - for cls, expected in [(BuildStockBatchBase, base_expected), (EagleBatch, eagle_expected)]: + with patch.object( + BuildStockBatchBase, "validate_options_lookup", lambda _: True + ), patch.object( + BuildStockBatchBase, "validate_measure_references", lambda _: True + ), patch.object( + BuildStockBatchBase, "validate_workflow_generator", lambda _: True + ), patch.object( + BuildStockBatchBase, "validate_postprocessing_spec", lambda _: True + ), patch.object( + EagleBatch, "validate_singularity_image_eagle", lambda _: True + ): + for cls, expected in [ + (BuildStockBatchBase, base_expected), + (EagleBatch, eagle_expected), + ]: if expected is not True: with pytest.raises(expected): cls.validate_project(project_file) @@ -118,32 +157,35 @@ def test_validation_integration(project_file, base_expected, eagle_expected): assert cls.validate_project(project_file) -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-measures-bad-2.yml') -]) +@pytest.mark.parametrize( + "project_file", + [os.path.join(example_yml_dir, "enforce-validate-measures-bad-2.yml")], +) def test_bad_reference_scenario(project_file): with LogCapture(level=logging.INFO) as logs: BuildStockBatchBase.validate_reference_scenario(project_file) - warning_logs = filter_logs(logs, 'WARNING') + warning_logs = filter_logs(logs, "WARNING") assert "non-existing upgrade' does not match " in warning_logs -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-measures-good-2.yml') -]) +@pytest.mark.parametrize( + "project_file", + [os.path.join(example_yml_dir, "enforce-validate-measures-good-2.yml")], +) def test_good_reference_scenario(project_file): with LogCapture(level=logging.INFO) as logs: assert BuildStockBatchBase.validate_reference_scenario(project_file) - warning_logs = filter_logs(logs, 'WARNING') - error_logs = filter_logs(logs, 'ERROR') - assert warning_logs == '' - assert error_logs == '' + warning_logs = filter_logs(logs, "WARNING") + error_logs = filter_logs(logs, "ERROR") + assert warning_logs == "" + assert error_logs == "" -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-measures-bad-2.yml') -]) +@pytest.mark.parametrize( + "project_file", + [os.path.join(example_yml_dir, "enforce-validate-measures-bad-2.yml")], +) def test_bad_measures(project_file): with LogCapture(level=logging.INFO) as _: @@ -152,46 +194,65 @@ def test_bad_measures(project_file): except (ValidationError, YamaleError) as er: er = str(er) assert "'1.5' is not a int" in er - assert "'huorly' not in ('none', 'timestep', 'hourly', 'daily', 'monthly')" in er + assert ( + "'huorly' not in ('none', 'timestep', 'hourly', 'daily', 'monthly')" + in er + ) else: - raise Exception("measures_and_arguments was supposed to raise ValidationError for" - " enforce-validate-measures-bad.yml") - - -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-measures-good-2.yml'), - os.path.join(example_yml_dir, 'enforce-validate-measures-good-2-with-anchors.yml') -]) + raise Exception( + "measures_and_arguments was supposed to raise ValidationError for" + " enforce-validate-measures-bad.yml" + ) + + +@pytest.mark.parametrize( + "project_file", + [ + os.path.join(example_yml_dir, "enforce-validate-measures-good-2.yml"), + os.path.join( + example_yml_dir, "enforce-validate-measures-good-2-with-anchors.yml" + ), + ], +) def 
test_good_measures(project_file): with LogCapture(level=logging.INFO) as logs: assert BuildStockBatchBase.validate_workflow_generator(project_file) - warning_logs = filter_logs(logs, 'WARNING') - error_logs = filter_logs(logs, 'ERROR') - assert warning_logs == '' - assert error_logs == '' - - -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-options-wrong-path.yml'), -]) + warning_logs = filter_logs(logs, "WARNING") + error_logs = filter_logs(logs, "ERROR") + assert warning_logs == "" + assert error_logs == "" + + +@pytest.mark.parametrize( + "project_file", + [ + os.path.join(example_yml_dir, "enforce-validate-options-wrong-path.yml"), + ], +) def test_bad_path_options_validation(project_file): with pytest.raises(FileNotFoundError): BuildStockBatchBase.validate_options_lookup(project_file) -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-options-good.yml'), - os.path.join(example_yml_dir, 'enforce-validate-options-good-2.yml'), -]) +@pytest.mark.parametrize( + "project_file", + [ + os.path.join(example_yml_dir, "enforce-validate-options-good.yml"), + os.path.join(example_yml_dir, "enforce-validate-options-good-2.yml"), + ], +) def test_good_options_validation(project_file): assert BuildStockBatchBase.validate_options_lookup(project_file) assert BuildStockBatchBase.validate_postprocessing_spec(project_file) -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-options-bad.yml'), - os.path.join(example_yml_dir, 'enforce-validate-options-bad-2.yml'), -]) +@pytest.mark.parametrize( + "project_file", + [ + os.path.join(example_yml_dir, "enforce-validate-options-bad.yml"), + os.path.join(example_yml_dir, "enforce-validate-options-bad-2.yml"), + ], +) def test_bad_options_validation(project_file): try: BuildStockBatchBase.validate_options_lookup(project_file) @@ -215,19 +276,27 @@ def test_bad_options_validation(project_file): assert "Floor Insulation: '*' cannot be mixed with other options" in er else: - raise Exception("validate_options was supposed to raise ValueError for enforce-validate-options-bad.yml") + raise Exception( + "validate_options was supposed to raise ValueError for enforce-validate-options-bad.yml" + ) -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-measures-good.yml'), -]) +@pytest.mark.parametrize( + "project_file", + [ + os.path.join(example_yml_dir, "enforce-validate-measures-good.yml"), + ], +) def test_good_measures_validation(project_file): assert BuildStockBatchBase.validate_measure_references(project_file) -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-measures-bad.yml'), -]) +@pytest.mark.parametrize( + "project_file", + [ + os.path.join(example_yml_dir, "enforce-validate-measures-bad.yml"), + ], +) def test_bad_measures_validation(project_file): try: BuildStockBatchBase.validate_measure_references(project_file) @@ -239,13 +308,18 @@ def test_bad_measures_validation(project_file): assert "ResidentialConstructionsFinishedBasement" in er else: - raise Exception("validate_measure_references was supposed to raise ValueError for " - "enforce-validate-measures-bad.yml") - - -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-options-bad-2.yml'), -]) + raise Exception( + "validate_measure_references was supposed to raise ValueError for " + "enforce-validate-measures-bad.yml" + ) + + +@pytest.mark.parametrize( + 
"project_file", + [ + os.path.join(example_yml_dir, "enforce-validate-options-bad-2.yml"), + ], +) def test_bad_postprocessing_spec_validation(project_file): try: BuildStockBatchBase.validate_postprocessing_spec(project_file) @@ -253,12 +327,14 @@ def test_bad_postprocessing_spec_validation(project_file): er = str(er) assert "bad_partition_column" in er else: - raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-bad-2.yml") + raise Exception( + "validate_options was supposed to raise ValidationError for enforce-validate-options-bad-2.yml" + ) -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-options-good.yml') -]) +@pytest.mark.parametrize( + "project_file", [os.path.join(example_yml_dir, "enforce-validate-options-good.yml")] +) def test_logic_validation_fail(project_file): try: BuildStockBatchBase.validate_logic(project_file) @@ -268,12 +344,15 @@ def test_logic_validation_fail(project_file): assert "'Vintage' occurs 2 times in a 'and' block" in er assert "'Vintage' occurs 2 times in a '&&' block" in er else: - raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml") + raise Exception( + "validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml" + ) -@pytest.mark.parametrize("project_file", [ - os.path.join(example_yml_dir, 'enforce-validate-options-all-good.yml') -]) +@pytest.mark.parametrize( + "project_file", + [os.path.join(example_yml_dir, "enforce-validate-options-all-good.yml")], +) def test_logic_validation_pass(project_file): BuildStockBatchBase.validate_logic(project_file) @@ -283,7 +362,9 @@ def test_number_of_options_apply_upgrade(): proj_filename = resstock_directory / "project_national" / "national_upgrades.yml" cfg = get_project_configuration(str(proj_filename)) cfg["upgrades"][-1]["options"] = cfg["upgrades"][-1]["options"] * 10 - cfg["upgrades"][0]["options"][0]["costs"] = cfg["upgrades"][0]["options"][0]["costs"] * 5 + cfg["upgrades"][0]["options"][0]["costs"] = ( + cfg["upgrades"][0]["options"][0]["costs"] * 5 + ) with tempfile.TemporaryDirectory() as tmpdir: tmppath = pathlib.Path(tmpdir) new_proj_filename = tmppath / "project.yml" @@ -303,7 +384,7 @@ def test_validate_resstock_or_comstock_version(mocker): def test_dask_config(): - orig_filename = os.path.join(example_yml_dir, 'minimal-schema.yml') + orig_filename = os.path.join(example_yml_dir, "minimal-schema.yml") cfg = get_project_configuration(orig_filename) with tempfile.TemporaryDirectory() as tmpdir: cfg["aws"] = { @@ -312,7 +393,7 @@ def test_dask_config(): "scheduler_memory": 2048, "worker_cpu": 1024, "worker_memory": 2048, - "n_workers": 1 + "n_workers": 1, } } test1_filename = os.path.join(tmpdir, "test1.yml") @@ -335,68 +416,93 @@ def test_dask_config(): def test_validate_eagle_output_directory(): - minimal_yml = pathlib.Path(example_yml_dir, 'minimal-schema.yml') + minimal_yml = pathlib.Path(example_yml_dir, "minimal-schema.yml") with pytest.raises(ValidationError, match=r"must be in /scratch or /projects"): EagleBatch.validate_output_directory_eagle(str(minimal_yml)) with tempfile.TemporaryDirectory() as tmpdir: dirs_to_try = [ - '/scratch/username/out_dir', - '/projects/projname/out_dir', - '/lustre/eaglefs/scratch/username/out_dir', - '/lustre/eaglefs/projects/projname/out_dir' + "/scratch/username/out_dir", + "/projects/projname/out_dir", + "/lustre/eaglefs/scratch/username/out_dir", + 
"/lustre/eaglefs/projects/projname/out_dir", ] for output_directory in dirs_to_try: - with open(minimal_yml, 'r') as f: + with open(minimal_yml, "r") as f: cfg = yaml.load(f, Loader=yaml.SafeLoader) - cfg['output_directory'] = output_directory - temp_yml = pathlib.Path(tmpdir, 'temp.yml') - with open(temp_yml, 'w') as f: + cfg["output_directory"] = output_directory + temp_yml = pathlib.Path(tmpdir, "temp.yml") + with open(temp_yml, "w") as f: yaml.dump(cfg, f, Dumper=yaml.SafeDumper) EagleBatch.validate_output_directory_eagle(str(temp_yml)) def test_validate_singularity_image_eagle(mocker, basic_residential_project_file): - minimal_yml = pathlib.Path(example_yml_dir, 'minimal-schema.yml') + minimal_yml = pathlib.Path(example_yml_dir, "minimal-schema.yml") with tempfile.TemporaryDirectory() as tmpdir: - with open(minimal_yml, 'r') as f: + with open(minimal_yml, "r") as f: cfg = yaml.load(f, Loader=yaml.SafeLoader) - cfg['sys_image_dir'] = tmpdir - temp_yml = pathlib.Path(tmpdir, 'temp.yml') - with open(temp_yml, 'w') as f: + cfg["sys_image_dir"] = tmpdir + temp_yml = pathlib.Path(tmpdir, "temp.yml") + with open(temp_yml, "w") as f: yaml.dump(cfg, f, Dumper=yaml.SafeDumper) with pytest.raises(ValidationError, match=r"image does not exist"): EagleBatch.validate_singularity_image_eagle(str(temp_yml)) def test_validate_sampler_good_buildstock(basic_residential_project_file): - project_filename, _ = basic_residential_project_file({ - 'sampler': { - 'type': 'precomputed', - 'args': { - 'sample_file': str(os.path.join(resources_dir, 'buildstock_good.csv')) + project_filename, _ = basic_residential_project_file( + { + "sampler": { + "type": "precomputed", + "args": { + "sample_file": str( + os.path.join(resources_dir, "buildstock_good.csv") + ) + }, } } - }) + ) assert BuildStockBatchBase.validate_sampler(project_filename) def test_validate_sampler_bad_buildstock(basic_residential_project_file): - project_filename, _ = basic_residential_project_file({ - 'sampler': { - 'type': 'precomputed', - 'args': { - 'sample_file': str(os.path.join(resources_dir, 'buildstock_bad.csv')) + project_filename, _ = basic_residential_project_file( + { + "sampler": { + "type": "precomputed", + "args": { + "sample_file": str( + os.path.join(resources_dir, "buildstock_bad.csv") + ) + }, } } - }) + ) try: BuildStockBatchBase.validate_sampler(project_filename) except ValidationError as er: er = str(er) - assert 'Option 1940-1950 in column Vintage of buildstock_csv is not available in options_lookup.tsv' in er - assert 'Option TX in column State of buildstock_csv is not available in options_lookup.tsv' in er - assert 'Option nan in column Insulation Wall of buildstock_csv is not available in options_lookup.tsv' in er - assert 'Column Insulation in buildstock_csv is not available in options_lookup.tsv' in er - assert 'Column ZipPlusCode in buildstock_csv is not available in options_lookup.tsv' in er + assert ( + "Option 1940-1950 in column Vintage of buildstock_csv is not available in options_lookup.tsv" + in er + ) + assert ( + "Option TX in column State of buildstock_csv is not available in options_lookup.tsv" + in er + ) + assert ( + "Option nan in column Insulation Wall of buildstock_csv is not available in options_lookup.tsv" + in er + ) + assert ( + "Column Insulation in buildstock_csv is not available in options_lookup.tsv" + in er + ) + assert ( + "Column ZipPlusCode in buildstock_csv is not available in options_lookup.tsv" + in er + ) else: - raise Exception("validate_options was supposed to raise ValidationError 
for enforce-validate-options-good.yml") + raise Exception( + "validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml" + ) diff --git a/buildstockbatch/utils.py b/buildstockbatch/utils.py index ca741fa5..848ceb79 100644 --- a/buildstockbatch/utils.py +++ b/buildstockbatch/utils.py @@ -18,7 +18,12 @@ class ContainerRuntime(enum.Enum): def read_csv(csv_file_path, **kwargs) -> pd.DataFrame: default_na_values = pd._libs.parsers.STR_NA_VALUES - df = pd.read_csv(csv_file_path, na_values=list(default_na_values - {"None", "NA"}), keep_default_na=False, **kwargs) + df = pd.read_csv( + csv_file_path, + na_values=list(default_na_values - {"None", "NA"}), + keep_default_na=False, + **kwargs, + ) return df @@ -34,16 +39,20 @@ def get_project_configuration(project_file): with open(project_file) as f: cfg = yaml.load(f, Loader=yaml.SafeLoader) except FileNotFoundError as err: - logger.error('Failed to load input yaml for validation') + logger.error("Failed to load input yaml for validation") raise err # Set absolute paths - cfg['buildstock_directory'] = path_rel_to_file(project_file, cfg['buildstock_directory']) + cfg["buildstock_directory"] = path_rel_to_file( + project_file, cfg["buildstock_directory"] + ) # if 'precomputed_sample' in cfg.get('baseline', {}): # cfg['baseline']['precomputed_sample'] = \ # path_rel_to_file(project_file, cfg['baseline']['precomputed_sample']) - if 'weather_files_path' in cfg: - cfg['weather_files_path'] = path_rel_to_file(project_file, cfg['weather_files_path']) + if "weather_files_path" in cfg: + cfg["weather_files_path"] = path_rel_to_file( + project_file, cfg["weather_files_path"] + ) return cfg @@ -57,28 +66,48 @@ def _str_repr(obj, list_max=20, dict_max=20, string_max=100): elif type(obj) in [int, float]: return _str_repr(str(obj), list_max, dict_max, string_max) elif type(obj) is list: - txt = "[" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:list_max]]) + txt = "[" + ",".join( + [ + _str_repr(item, list_max, dict_max, string_max) + for item in obj[0:list_max] + ] + ) if len(obj) > list_max: txt += f" ...{len(obj)}" txt += "]" return txt elif type(obj) is tuple: - txt = "(" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:list_max]]) + txt = "(" + ",".join( + [ + _str_repr(item, list_max, dict_max, string_max) + for item in obj[0:list_max] + ] + ) if len(obj) > list_max: txt += f" ...{len(obj)}" txt += ")" return txt elif type(obj) is set: obj = list(obj) - txt = "{" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:dict_max]]) + txt = "{" + ",".join( + [ + _str_repr(item, list_max, dict_max, string_max) + for item in obj[0:dict_max] + ] + ) if len(obj) > dict_max: txt += f" ...{len(obj)}" txt += "}" return txt elif type(obj) is dict: keys = list(obj.keys()) - txt = "{" + ",".join([f"{_str_repr(key, list_max, dict_max, string_max)}:" - f" {_str_repr(obj[key], list_max, dict_max, string_max)}" for key in keys[0:dict_max]]) + txt = "{" + ",".join( + [ + f"{_str_repr(key, list_max, dict_max, string_max)}:" + f" {_str_repr(obj[key], list_max, dict_max, string_max)}" + for key in keys[0:dict_max] + ] + ) if len(keys) > dict_max: txt += f" ...{len(keys)}" txt += "}" @@ -92,7 +121,7 @@ def get_error_details(): text += traceback.format_exc() frames = inspect.trace() for frame in frames: - text += f'\nIn file: {frame[1]}, module {str(frame[3])} line: {frame[2]} \n' + text += f"\nIn file: {frame[1]}, module {str(frame[3])} line: {frame[2]} \n" text += 
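To show what the read_csv wrapper above changes relative to plain pandas, a minimal runnable sketch (the column name and values are made up):

    import io
    from buildstockbatch.utils import read_csv

    buf = io.StringIO("state\nNone\nNA\nN/A\n")
    df = read_csv(buf)
    # "None" and "NA" come through as literal strings because they were removed
    # from the NA value list; "N/A" is still parsed as NaN. Plain pd.read_csv
    # with default settings would have turned all three rows into NaN.
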
"Local Variables: " for var, value in frame[0].f_locals.items(): text += _str_repr(var) + ":" + _str_repr(value) @@ -111,6 +140,7 @@ def run_with_error_capture(*args, **kwargs): text += get_error_details() f.write(text) raise + return run_with_error_capture return log_error_decorator diff --git a/buildstockbatch/workflow_generator/__init__.py b/buildstockbatch/workflow_generator/__init__.py index 7f1c991f..0e40b889 100644 --- a/buildstockbatch/workflow_generator/__init__.py +++ b/buildstockbatch/workflow_generator/__init__.py @@ -1,4 +1,4 @@ # -*- coding: utf-8 -*- from .commercial import CommercialDefaultWorkflowGenerator # noqa F041 -from .residential_hpxml import ResidentialHpxmlWorkflowGenerator # noqa F041 \ No newline at end of file +from .residential_hpxml import ResidentialHpxmlWorkflowGenerator # noqa F041 diff --git a/buildstockbatch/workflow_generator/base.py b/buildstockbatch/workflow_generator/base.py index dc999c42..4545c65f 100644 --- a/buildstockbatch/workflow_generator/base.py +++ b/buildstockbatch/workflow_generator/base.py @@ -42,17 +42,17 @@ def make_apply_logic_arg(cls, logic): :returns: str of logic """ if isinstance(logic, dict): - assert (len(logic) == 1) + assert len(logic) == 1 key = list(logic.keys())[0] val = logic[key] - if key == 'and': + if key == "and": return cls.make_apply_logic_arg(val) - elif key == 'or': - return '(' + '||'.join(map(cls.make_apply_logic_arg, val)) + ')' - elif key == 'not': - return '!' + cls.make_apply_logic_arg(val) + elif key == "or": + return "(" + "||".join(map(cls.make_apply_logic_arg, val)) + ")" + elif key == "not": + return "!" + cls.make_apply_logic_arg(val) elif isinstance(logic, list): - return '(' + '&&'.join(map(cls.make_apply_logic_arg, logic)) + ')' + return "(" + "&&".join(map(cls.make_apply_logic_arg, logic)) + ")" elif isinstance(logic, str): return logic diff --git a/buildstockbatch/workflow_generator/commercial.py b/buildstockbatch/workflow_generator/commercial.py index 5f488c99..c2676c8d 100644 --- a/buildstockbatch/workflow_generator/commercial.py +++ b/buildstockbatch/workflow_generator/commercial.py @@ -47,28 +47,30 @@ def validate(cls, cfg): measure_dir_name: str(required=True) arguments: map(required=False) """ - workflow_generator_args = cfg['workflow_generator']['args'] - schema_yml = re.sub(r'^ {8}', '', schema_yml, flags=re.MULTILINE) - schema = yamale.make_schema(content=schema_yml, parser='ruamel') - data = yamale.make_data(content=json.dumps(workflow_generator_args), parser='ruamel') + workflow_generator_args = cfg["workflow_generator"]["args"] + schema_yml = re.sub(r"^ {8}", "", schema_yml, flags=re.MULTILINE) + schema = yamale.make_schema(content=schema_yml, parser="ruamel") + data = yamale.make_data( + content=json.dumps(workflow_generator_args), parser="ruamel" + ) return yamale.validate(schema, data, strict=True) def reporting_measures(self): """Return a list of reporting measures to include in outputs""" - workflow_args = self.cfg['workflow_generator'].get('args', {}) + workflow_args = self.cfg["workflow_generator"].get("args", {}) # reporting_measures needs to return the ClassName in measure.rb, but # measure_dir_name in ComStock doesn't always match the ClassName - buildstock_dir = self.cfg['buildstock_directory'] - measures_dir = os.path.join(buildstock_dir, 'measures') + buildstock_dir = self.cfg["buildstock_directory"] + measures_dir = os.path.join(buildstock_dir, "measures") measure_class_names = [] - for m in workflow_args.get('reporting_measures', []): - measure_dir_name = 
m['measure_dir_name'] + for m in workflow_args.get("reporting_measures", []): + measure_dir_name = m["measure_dir_name"] measure_path = os.path.join(measures_dir, measure_dir_name) - root = get_measure_xml(os.path.join(measure_path, 'measure.xml')) - measure_class_name = root.find('./class_name').text + root = get_measure_xml(os.path.join(measure_path, "measure.xml")) + measure_class_name = root.find("./class_name").text # Don't include OpenStudioResults, it has too many registerValues for ComStock - if measure_class_name == 'OpenStudioResults': + if measure_class_name == "OpenStudioResults": continue measure_class_names.append(measure_class_name) @@ -82,86 +84,95 @@ def create_osw(self, sim_id, building_id, upgrade_idx): :param building_id: integer building id to use from the sampled buildstock.csv :param upgrade_idx: integer index of the upgrade scenario to apply, None if baseline """ - logger.debug('Generating OSW, sim_id={}'.format(sim_id)) + logger.debug("Generating OSW, sim_id={}".format(sim_id)) - workflow_args = { - 'measures': [] - } - workflow_args.update(self.cfg['workflow_generator'].get('args', {})) + workflow_args = {"measures": []} + workflow_args.update(self.cfg["workflow_generator"].get("args", {})) osw = { - 'id': sim_id, - 'steps': [ + "id": sim_id, + "steps": [ { "measure_dir_name": "BuildExistingModel", "arguments": { "number_of_buildings_represented": 1, - "building_id": int(building_id) + "building_id": int(building_id), }, - "measure_type": "ModelMeasure" + "measure_type": "ModelMeasure", } ], - 'created_at': dt.datetime.now().isoformat(), - 'measure_paths': [ - 'measures' - ], - 'weather_file': 'weather/empty.epw' + "created_at": dt.datetime.now().isoformat(), + "measure_paths": ["measures"], + "weather_file": "weather/empty.epw", } # Baseline measures (not typically used in ComStock) - osw['steps'].extend(workflow_args['measures']) + osw["steps"].extend(workflow_args["measures"]) # Upgrades if upgrade_idx is not None: - measure_d = self.cfg['upgrades'][upgrade_idx] + measure_d = self.cfg["upgrades"][upgrade_idx] apply_upgrade_measure = { - 'measure_dir_name': 'ApplyUpgrade', - 'arguments': { - 'run_measure': 1 - } + "measure_dir_name": "ApplyUpgrade", + "arguments": {"run_measure": 1}, } - if 'upgrade_name' in measure_d: - apply_upgrade_measure['arguments']['upgrade_name'] = measure_d['upgrade_name'] - for opt_num, option in enumerate(measure_d['options'], 1): - apply_upgrade_measure['arguments']['option_{}'.format(opt_num)] = option['option'] - if 'lifetime' in option: - apply_upgrade_measure['arguments']['option_{}_lifetime'.format(opt_num)] = option['lifetime'] - if 'apply_logic' in option: - apply_upgrade_measure['arguments']['option_{}_apply_logic'.format(opt_num)] = \ - self.make_apply_logic_arg(option['apply_logic']) - for cost_num, cost in enumerate(option.get('costs', []), 1): - for arg in ('value', 'multiplier'): + if "upgrade_name" in measure_d: + apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d[ + "upgrade_name" + ] + for opt_num, option in enumerate(measure_d["options"], 1): + apply_upgrade_measure["arguments"]["option_{}".format(opt_num)] = ( + option["option"] + ) + if "lifetime" in option: + apply_upgrade_measure["arguments"][ + "option_{}_lifetime".format(opt_num) + ] = option["lifetime"] + if "apply_logic" in option: + apply_upgrade_measure["arguments"][ + "option_{}_apply_logic".format(opt_num) + ] = self.make_apply_logic_arg(option["apply_logic"]) + for cost_num, cost in enumerate(option.get("costs", []), 1): + for arg in 
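The class-name lookup in reporting_measures above relies on each measure's measure.xml. A sketch of an equivalent parse using the standard library (the measure directory is hypothetical; get_measure_xml returns a parsed root with this same find/findall interface):

    import xml.etree.ElementTree as ET

    root = ET.parse("measures/TimeseriesCSVExport/measure.xml").getroot()
    class_name = root.find("./class_name").text  # e.g. "TimeseriesCSVExport"
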
("value", "multiplier"): if arg not in cost: continue - apply_upgrade_measure['arguments']['option_{}_cost_{}_{}'.format(opt_num, cost_num, arg)] = \ - cost[arg] - if 'package_apply_logic' in measure_d: - apply_upgrade_measure['arguments']['package_apply_logic'] = \ - self.make_apply_logic_arg(measure_d['package_apply_logic']) - - build_existing_model_idx = \ - list(map(lambda x: x['measure_dir_name'] == 'BuildExistingModel', osw['steps'])).index(True) - osw['steps'].insert(build_existing_model_idx + 1, apply_upgrade_measure) - - if 'timeseries_csv_export' in workflow_args: + apply_upgrade_measure["arguments"][ + "option_{}_cost_{}_{}".format(opt_num, cost_num, arg) + ] = cost[arg] + if "package_apply_logic" in measure_d: + apply_upgrade_measure["arguments"]["package_apply_logic"] = ( + self.make_apply_logic_arg(measure_d["package_apply_logic"]) + ) + + build_existing_model_idx = list( + map( + lambda x: x["measure_dir_name"] == "BuildExistingModel", + osw["steps"], + ) + ).index(True) + osw["steps"].insert(build_existing_model_idx + 1, apply_upgrade_measure) + + if "timeseries_csv_export" in workflow_args: timeseries_csv_export_args = { - 'reporting_frequency': 'Timestep', - 'inc_output_variables': False + "reporting_frequency": "Timestep", + "inc_output_variables": False, } - timeseries_csv_export_args.update(workflow_args['timeseries_csv_export']) - timeseries_measure = [{ - 'measure_dir_name': 'TimeseriesCSVExport', - 'arguments': timeseries_csv_export_args, - "measure_type": "ReportingMeasure" - }] - osw['steps'].extend(timeseries_measure) + timeseries_csv_export_args.update(workflow_args["timeseries_csv_export"]) + timeseries_measure = [ + { + "measure_dir_name": "TimeseriesCSVExport", + "arguments": timeseries_csv_export_args, + "measure_type": "ReportingMeasure", + } + ] + osw["steps"].extend(timeseries_measure) # User-specified reporting measures - if 'reporting_measures' in workflow_args: - for reporting_measure in workflow_args['reporting_measures']: - if 'arguments' not in reporting_measure: - reporting_measure['arguments'] = {} - reporting_measure['measure_type'] = 'ReportingMeasure' - osw['steps'].append(reporting_measure) + if "reporting_measures" in workflow_args: + for reporting_measure in workflow_args["reporting_measures"]: + if "arguments" not in reporting_measure: + reporting_measure["arguments"] = {} + reporting_measure["measure_type"] = "ReportingMeasure" + osw["steps"].append(reporting_measure) return osw diff --git a/buildstockbatch/workflow_generator/residential_hpxml.py b/buildstockbatch/workflow_generator/residential_hpxml.py index 077ed0d1..4456be5b 100644 --- a/buildstockbatch/workflow_generator/residential_hpxml.py +++ b/buildstockbatch/workflow_generator/residential_hpxml.py @@ -35,8 +35,8 @@ def get_measure_arguments(xml_path): arguments = [] if os.path.isfile(xml_path): root = get_measure_xml(xml_path) - for argument in root.findall('./arguments/argument'): - name = argument.find('./name').text + for argument in root.findall("./arguments/argument"): + name = argument.find("./name").text arguments.append(name) return arguments @@ -143,33 +143,37 @@ def validate(cls, cfg): retain_schedules_csv: bool(required=False) debug: bool(required=False) """ # noqa E501 - workflow_generator_args = cfg['workflow_generator']['args'] - schema_yml = re.sub(r'^ {8}', '', schema_yml, flags=re.MULTILINE) - schema = yamale.make_schema(content=schema_yml, parser='ruamel') - data = yamale.make_data(content=json.dumps(workflow_generator_args), parser='ruamel') + 
workflow_generator_args = cfg["workflow_generator"]["args"] + schema_yml = re.sub(r"^ {8}", "", schema_yml, flags=re.MULTILINE) + schema = yamale.make_schema(content=schema_yml, parser="ruamel") + data = yamale.make_data( + content=json.dumps(workflow_generator_args), parser="ruamel" + ) yamale.validate(schema, data, strict=True) return cls.validate_measures_and_arguments(cfg) def reporting_measures(self): """Return a list of reporting measures to include in outputs""" - workflow_args = self.cfg['workflow_generator'].get('args', {}) - return [x['measure_dir_name'] for x in workflow_args.get('reporting_measures', [])] + workflow_args = self.cfg["workflow_generator"].get("args", {}) + return [ + x["measure_dir_name"] for x in workflow_args.get("reporting_measures", []) + ] @staticmethod def validate_measures_and_arguments(cfg): - buildstock_dir = cfg['buildstock_directory'] - measures_dir = os.path.join(buildstock_dir, 'measures') + buildstock_dir = cfg["buildstock_directory"] + measures_dir = os.path.join(buildstock_dir, "measures") measure_names = { - 'BuildExistingModel': 'baseline', - 'ApplyUpgrade': 'upgrades', + "BuildExistingModel": "baseline", + "ApplyUpgrade": "upgrades", } def cfg_path_exists(cfg_path): if cfg_path is None: return False - path_items = cfg_path.split('.') + path_items = cfg_path.split(".") a = cfg for path_item in path_items: try: @@ -181,7 +185,7 @@ def cfg_path_exists(cfg_path): def get_cfg_path(cfg_path): if cfg_path is None: return None - path_items = cfg_path.split('.') + path_items = cfg_path.split(".") a = cfg for path_item in path_items: try: @@ -190,13 +194,15 @@ def get_cfg_path(cfg_path): return None return a - workflow_args = cfg['workflow_generator'].get('args', {}) - if 'reporting_measures' in workflow_args.keys(): - for reporting_measure in workflow_args['reporting_measures']: - measure_names[reporting_measure['measure_dir_name']] = 'workflow_generator.args.reporting_measures' + workflow_args = cfg["workflow_generator"].get("args", {}) + if "reporting_measures" in workflow_args.keys(): + for reporting_measure in workflow_args["reporting_measures"]: + measure_names[reporting_measure["measure_dir_name"]] = ( + "workflow_generator.args.reporting_measures" + ) - error_msgs = '' - warning_msgs = '' + error_msgs = "" + warning_msgs = "" for measure_name, cfg_key in measure_names.items(): measure_path = os.path.join(measures_dir, measure_name) @@ -204,29 +210,31 @@ def get_cfg_path(cfg_path): if not cfg_path_exists(cfg_key): continue - if measure_name in ['ApplyUpgrade']: + if measure_name in ["ApplyUpgrade"]: # For ApplyUpgrade measure, verify that all the cost_multipliers used are correct - root = get_measure_xml(os.path.join(measure_path, 'measure.xml')) + root = get_measure_xml(os.path.join(measure_path, "measure.xml")) valid_multipliers = set() - for argument in root.findall('./arguments/argument'): - name = argument.find('./name') - if name.text.endswith('_multiplier'): - for choice in argument.findall('./choices/choice'): - value = choice.find('./value') - value = value.text if value is not None else '' + for argument in root.findall("./arguments/argument"): + name = argument.find("./name") + if name.text.endswith("_multiplier"): + for choice in argument.findall("./choices/choice"): + value = choice.find("./value") + value = value.text if value is not None else "" valid_multipliers.add(value) invalid_multipliers = Counter() - for upgrade_count, upgrade in enumerate(cfg['upgrades']): - for option_count, option in enumerate(upgrade['options']): - for 
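cfg_path_exists and get_cfg_path simply walk a dotted key path through the nested config dict. An illustrative sketch (return values inferred from how the helpers are used here):

    cfg = {"workflow_generator": {"args": {"reporting_measures": []}}}
    # cfg_path_exists("workflow_generator.args.reporting_measures") -> True
    # cfg_path_exists("workflow_generator.args.debug")              -> False
    # get_cfg_path("workflow_generator.args")  -> {"reporting_measures": []}
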
cost_indx, cost_entry in enumerate(option.get('costs', [])): - if cost_entry['multiplier'] not in valid_multipliers: - invalid_multipliers[cost_entry['multiplier']] += 1 + for upgrade_count, upgrade in enumerate(cfg["upgrades"]): + for option_count, option in enumerate(upgrade["options"]): + for cost_indx, cost_entry in enumerate(option.get("costs", [])): + if cost_entry["multiplier"] not in valid_multipliers: + invalid_multipliers[cost_entry["multiplier"]] += 1 if invalid_multipliers: error_msgs += "* The following multipliers values are invalid: \n" for multiplier, count in invalid_multipliers.items(): error_msgs += f" '{multiplier}' - Used {count} times \n" - error_msgs += f" The list of valid multipliers are {valid_multipliers}.\n" + error_msgs += ( + f" The list of valid multipliers are {valid_multipliers}.\n" + ) if warning_msgs: logger.warning(warning_msgs) @@ -247,304 +255,330 @@ def create_osw(self, sim_id, building_id, upgrade_idx): """ # Default argument values workflow_args = { - 'build_existing_model': {}, - 'measures': [], - 'simulation_output_report': {}, - 'server_directory_cleanup': {} + "build_existing_model": {}, + "measures": [], + "simulation_output_report": {}, + "server_directory_cleanup": {}, } - workflow_args.update(self.cfg['workflow_generator'].get('args', {})) + workflow_args.update(self.cfg["workflow_generator"].get("args", {})) - logger.debug('Generating OSW, sim_id={}'.format(sim_id)) + logger.debug("Generating OSW, sim_id={}".format(sim_id)) sim_ctl_args = { - 'simulation_control_timestep': 60, - 'simulation_control_run_period_begin_month': 1, - 'simulation_control_run_period_begin_day_of_month': 1, - 'simulation_control_run_period_end_month': 12, - 'simulation_control_run_period_end_day_of_month': 31, - 'simulation_control_run_period_calendar_year': 2007, - 'add_component_loads': False + "simulation_control_timestep": 60, + "simulation_control_run_period_begin_month": 1, + "simulation_control_run_period_begin_day_of_month": 1, + "simulation_control_run_period_end_month": 12, + "simulation_control_run_period_end_day_of_month": 31, + "simulation_control_run_period_calendar_year": 2007, + "add_component_loads": False, } bld_exist_model_args = { - 'building_id': building_id, - 'sample_weight': self.cfg['baseline']['n_buildings_represented'] / self.n_datapoints + "building_id": building_id, + "sample_weight": self.cfg["baseline"]["n_buildings_represented"] + / self.n_datapoints, } bld_exist_model_args.update(sim_ctl_args) - bld_exist_model_args.update(workflow_args['build_existing_model']) + bld_exist_model_args.update(workflow_args["build_existing_model"]) add_component_loads = False - if 'add_component_loads' in bld_exist_model_args: - add_component_loads = bld_exist_model_args['add_component_loads'] - bld_exist_model_args.pop('add_component_loads') - - if 'emissions' in workflow_args: - emissions = workflow_args['emissions'] - emissions_map = [['emissions_scenario_names', 'scenario_name'], - ['emissions_types', 'type'], - ['emissions_electricity_folders', 'elec_folder'], - ['emissions_natural_gas_values', 'gas_value'], - ['emissions_propane_values', 'propane_value'], - ['emissions_fuel_oil_values', 'oil_value'], - ['emissions_wood_values', 'wood_value']] + if "add_component_loads" in bld_exist_model_args: + add_component_loads = bld_exist_model_args["add_component_loads"] + bld_exist_model_args.pop("add_component_loads") + + if "emissions" in workflow_args: + emissions = workflow_args["emissions"] + emissions_map = [ + ["emissions_scenario_names", 
"scenario_name"], + ["emissions_types", "type"], + ["emissions_electricity_folders", "elec_folder"], + ["emissions_natural_gas_values", "gas_value"], + ["emissions_propane_values", "propane_value"], + ["emissions_fuel_oil_values", "oil_value"], + ["emissions_wood_values", "wood_value"], + ] for arg, item in emissions_map: - bld_exist_model_args[arg] = ','.join([str(s.get(item, '')) for s in emissions]) - - buildstock_dir = self.cfg['buildstock_directory'] - measures_dir = os.path.join(buildstock_dir, 'measures') - measure_path = os.path.join(measures_dir, 'BuildExistingModel') - bld_exist_model_args_avail = get_measure_arguments(os.path.join(measure_path, 'measure.xml')) - - if 'utility_bills' in workflow_args: - utility_bills = workflow_args['utility_bills'] + bld_exist_model_args[arg] = ",".join( + [str(s.get(item, "")) for s in emissions] + ) + + buildstock_dir = self.cfg["buildstock_directory"] + measures_dir = os.path.join(buildstock_dir, "measures") + measure_path = os.path.join(measures_dir, "BuildExistingModel") + bld_exist_model_args_avail = get_measure_arguments( + os.path.join(measure_path, "measure.xml") + ) + + if "utility_bills" in workflow_args: + utility_bills = workflow_args["utility_bills"] utility_bills_map = [ - ['utility_bill_scenario_names', 'scenario_name'], - ['utility_bill_simple_filepaths', 'simple_filepath'], - ['utility_bill_detailed_filepaths', 'detailed_filepath'], - ['utility_bill_electricity_fixed_charges', 'elec_fixed_charge'], - ['utility_bill_electricity_marginal_rates', 'elec_marginal_rate'], - ['utility_bill_natural_gas_fixed_charges', 'gas_fixed_charge'], - ['utility_bill_natural_gas_marginal_rates', 'gas_marginal_rate'], - ['utility_bill_propane_fixed_charges', 'propane_fixed_charge'], - ['utility_bill_propane_marginal_rates', 'propane_marginal_rate'], - ['utility_bill_fuel_oil_fixed_charges', 'oil_fixed_charge'], - ['utility_bill_fuel_oil_marginal_rates', 'oil_marginal_rate'], - ['utility_bill_wood_fixed_charges', 'wood_fixed_charge'], - ['utility_bill_wood_marginal_rates', 'wood_marginal_rate'], - ['utility_bill_pv_compensation_types', 'pv_compensation_type'], - ['utility_bill_pv_net_metering_annual_excess_sellback_rate_types', - 'pv_net_metering_annual_excess_sellback_rate_type'], - ['utility_bill_pv_net_metering_annual_excess_sellback_rates', - 'pv_net_metering_annual_excess_sellback_rate'], - ['utility_bill_pv_feed_in_tariff_rates', 'pv_feed_in_tariff_rate'], - ['utility_bill_pv_monthly_grid_connection_fee_units', 'pv_monthly_grid_connection_fee_units'], - ['utility_bill_pv_monthly_grid_connection_fees', 'pv_monthly_grid_connection_fee'] + ["utility_bill_scenario_names", "scenario_name"], + ["utility_bill_simple_filepaths", "simple_filepath"], + ["utility_bill_detailed_filepaths", "detailed_filepath"], + ["utility_bill_electricity_fixed_charges", "elec_fixed_charge"], + ["utility_bill_electricity_marginal_rates", "elec_marginal_rate"], + ["utility_bill_natural_gas_fixed_charges", "gas_fixed_charge"], + ["utility_bill_natural_gas_marginal_rates", "gas_marginal_rate"], + ["utility_bill_propane_fixed_charges", "propane_fixed_charge"], + ["utility_bill_propane_marginal_rates", "propane_marginal_rate"], + ["utility_bill_fuel_oil_fixed_charges", "oil_fixed_charge"], + ["utility_bill_fuel_oil_marginal_rates", "oil_marginal_rate"], + ["utility_bill_wood_fixed_charges", "wood_fixed_charge"], + ["utility_bill_wood_marginal_rates", "wood_marginal_rate"], + ["utility_bill_pv_compensation_types", "pv_compensation_type"], + [ + 
"utility_bill_pv_net_metering_annual_excess_sellback_rate_types", + "pv_net_metering_annual_excess_sellback_rate_type", + ], + [ + "utility_bill_pv_net_metering_annual_excess_sellback_rates", + "pv_net_metering_annual_excess_sellback_rate", + ], + ["utility_bill_pv_feed_in_tariff_rates", "pv_feed_in_tariff_rate"], + [ + "utility_bill_pv_monthly_grid_connection_fee_units", + "pv_monthly_grid_connection_fee_units", + ], + [ + "utility_bill_pv_monthly_grid_connection_fees", + "pv_monthly_grid_connection_fee", + ], ] for arg, item in utility_bills_map: if arg in bld_exist_model_args_avail: - bld_exist_model_args[arg] = ','.join([str(s.get(item, '')) for s in utility_bills]) + bld_exist_model_args[arg] = ",".join( + [str(s.get(item, "")) for s in utility_bills] + ) sim_out_rep_args = { - 'timeseries_frequency': 'none', - 'include_timeseries_total_consumptions': False, - 'include_timeseries_fuel_consumptions': False, - 'include_timeseries_end_use_consumptions': True, - 'include_timeseries_emissions': False, - 'include_timeseries_emission_fuels': False, - 'include_timeseries_emission_end_uses': False, - 'include_timeseries_hot_water_uses': False, - 'include_timeseries_total_loads': True, - 'include_timeseries_component_loads': False, - 'include_timeseries_zone_temperatures': False, - 'include_timeseries_airflows': False, - 'include_timeseries_weather': False, - 'timeseries_timestamp_convention': 'end', - 'add_timeseries_dst_column': True, - 'add_timeseries_utc_column': True + "timeseries_frequency": "none", + "include_timeseries_total_consumptions": False, + "include_timeseries_fuel_consumptions": False, + "include_timeseries_end_use_consumptions": True, + "include_timeseries_emissions": False, + "include_timeseries_emission_fuels": False, + "include_timeseries_emission_end_uses": False, + "include_timeseries_hot_water_uses": False, + "include_timeseries_total_loads": True, + "include_timeseries_component_loads": False, + "include_timeseries_zone_temperatures": False, + "include_timeseries_airflows": False, + "include_timeseries_weather": False, + "timeseries_timestamp_convention": "end", + "add_timeseries_dst_column": True, + "add_timeseries_utc_column": True, } - measures_dir = os.path.join(buildstock_dir, 'resources/hpxml-measures') - measure_path = os.path.join(measures_dir, 'ReportSimulationOutput') - sim_out_rep_args_avail = get_measure_arguments(os.path.join(measure_path, 'measure.xml')) + measures_dir = os.path.join(buildstock_dir, "resources/hpxml-measures") + measure_path = os.path.join(measures_dir, "ReportSimulationOutput") + sim_out_rep_args_avail = get_measure_arguments( + os.path.join(measure_path, "measure.xml") + ) - if 'include_annual_total_consumptions' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_total_consumptions'] = True + if "include_annual_total_consumptions" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_total_consumptions"] = True - if 'include_annual_fuel_consumptions' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_fuel_consumptions'] = True + if "include_annual_fuel_consumptions" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_fuel_consumptions"] = True - if 'include_annual_end_use_consumptions' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_end_use_consumptions'] = True + if "include_annual_end_use_consumptions" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_end_use_consumptions"] = True - if 'include_annual_system_use_consumptions' in sim_out_rep_args_avail: - 
sim_out_rep_args['include_annual_system_use_consumptions'] = False + if "include_annual_system_use_consumptions" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_system_use_consumptions"] = False - if 'include_annual_emissions' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_emissions'] = True + if "include_annual_emissions" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_emissions"] = True - if 'include_annual_emission_fuels' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_emission_fuels'] = True + if "include_annual_emission_fuels" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_emission_fuels"] = True - if 'include_annual_emission_end_uses' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_emission_end_uses'] = True + if "include_annual_emission_end_uses" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_emission_end_uses"] = True - if 'include_annual_total_loads' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_total_loads'] = True + if "include_annual_total_loads" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_total_loads"] = True - if 'include_annual_unmet_hours' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_unmet_hours'] = True + if "include_annual_unmet_hours" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_unmet_hours"] = True - if 'include_annual_peak_fuels' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_peak_fuels'] = True + if "include_annual_peak_fuels" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_peak_fuels"] = True - if 'include_annual_peak_loads' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_peak_loads'] = True + if "include_annual_peak_loads" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_peak_loads"] = True - if 'include_annual_component_loads' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_component_loads'] = True + if "include_annual_component_loads" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_component_loads"] = True - if 'include_annual_hot_water_uses' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_hot_water_uses'] = True + if "include_annual_hot_water_uses" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_hot_water_uses"] = True - if 'include_annual_hvac_summary' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_hvac_summary'] = True + if "include_annual_hvac_summary" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_hvac_summary"] = True - if 'include_annual_resilience' in sim_out_rep_args_avail: - sim_out_rep_args['include_annual_resilience'] = True + if "include_annual_resilience" in sim_out_rep_args_avail: + sim_out_rep_args["include_annual_resilience"] = True - if 'include_timeseries_system_use_consumptions' in sim_out_rep_args_avail: - sim_out_rep_args['include_timeseries_system_use_consumptions'] = False + if "include_timeseries_system_use_consumptions" in sim_out_rep_args_avail: + sim_out_rep_args["include_timeseries_system_use_consumptions"] = False - if 'include_timeseries_unmet_hours' in sim_out_rep_args_avail: - sim_out_rep_args['include_timeseries_unmet_hours'] = False + if "include_timeseries_unmet_hours" in sim_out_rep_args_avail: + sim_out_rep_args["include_timeseries_unmet_hours"] = False - if 'include_timeseries_resilience' in sim_out_rep_args_avail: - sim_out_rep_args['include_timeseries_resilience'] = False + if "include_timeseries_resilience" in 
sim_out_rep_args_avail: + sim_out_rep_args["include_timeseries_resilience"] = False - if 'timeseries_num_decimal_places' in sim_out_rep_args_avail: - sim_out_rep_args['timeseries_num_decimal_places'] = 3 + if "timeseries_num_decimal_places" in sim_out_rep_args_avail: + sim_out_rep_args["timeseries_num_decimal_places"] = 3 - sim_out_rep_args.update(workflow_args['simulation_output_report']) + sim_out_rep_args.update(workflow_args["simulation_output_report"]) - if 'output_variables' in sim_out_rep_args: - output_variables = sim_out_rep_args['output_variables'] - sim_out_rep_args['user_output_variables'] = ','.join([str(s.get('name')) for s in output_variables]) - sim_out_rep_args.pop('output_variables') + if "output_variables" in sim_out_rep_args: + output_variables = sim_out_rep_args["output_variables"] + sim_out_rep_args["user_output_variables"] = ",".join( + [str(s.get("name")) for s in output_variables] + ) + sim_out_rep_args.pop("output_variables") util_bills_rep_args = {} - measures_dir = os.path.join(buildstock_dir, 'resources/hpxml-measures') - measure_path = os.path.join(measures_dir, 'ReportUtilityBills') - util_bills_rep_args_avail = get_measure_arguments(os.path.join(measure_path, 'measure.xml')) + measures_dir = os.path.join(buildstock_dir, "resources/hpxml-measures") + measure_path = os.path.join(measures_dir, "ReportUtilityBills") + util_bills_rep_args_avail = get_measure_arguments( + os.path.join(measure_path, "measure.xml") + ) - if 'include_annual_bills' in util_bills_rep_args_avail: - util_bills_rep_args['include_annual_bills'] = True + if "include_annual_bills" in util_bills_rep_args_avail: + util_bills_rep_args["include_annual_bills"] = True - if 'include_monthly_bills' in util_bills_rep_args_avail: - util_bills_rep_args['include_monthly_bills'] = False + if "include_monthly_bills" in util_bills_rep_args_avail: + util_bills_rep_args["include_monthly_bills"] = False osw = { - 'id': sim_id, - 'steps': [ + "id": sim_id, + "steps": [ { - 'measure_dir_name': 'BuildExistingModel', - 'arguments': bld_exist_model_args + "measure_dir_name": "BuildExistingModel", + "arguments": bld_exist_model_args, } ], - 'created_at': dt.datetime.now().isoformat(), - 'measure_paths': [ - 'measures', - 'resources/hpxml-measures' - ], - 'run_options': { - 'skip_zip_results': True - } + "created_at": dt.datetime.now().isoformat(), + "measure_paths": ["measures", "resources/hpxml-measures"], + "run_options": {"skip_zip_results": True}, } debug = False - if 'debug' in workflow_args: - debug = workflow_args['debug'] + if "debug" in workflow_args: + debug = workflow_args["debug"] server_dir_cleanup_args = { - 'retain_in_osm': False, - 'retain_in_idf': True, - 'retain_pre_process_idf': False, - 'retain_eplusout_audit': False, - 'retain_eplusout_bnd': False, - 'retain_eplusout_eio': False, - 'retain_eplusout_end': False, - 'retain_eplusout_err': False, - 'retain_eplusout_eso': False, - 'retain_eplusout_mdd': False, - 'retain_eplusout_mtd': False, - 'retain_eplusout_rdd': False, - 'retain_eplusout_shd': False, - 'retain_eplusout_msgpack': False, - 'retain_eplustbl_htm': False, - 'retain_stdout_energyplus': False, - 'retain_stdout_expandobject': False, - 'retain_schedules_csv': True, - 'debug': debug + "retain_in_osm": False, + "retain_in_idf": True, + "retain_pre_process_idf": False, + "retain_eplusout_audit": False, + "retain_eplusout_bnd": False, + "retain_eplusout_eio": False, + "retain_eplusout_end": False, + "retain_eplusout_err": False, + "retain_eplusout_eso": False, + "retain_eplusout_mdd": False, 
+ "retain_eplusout_mtd": False, + "retain_eplusout_rdd": False, + "retain_eplusout_shd": False, + "retain_eplusout_msgpack": False, + "retain_eplustbl_htm": False, + "retain_stdout_energyplus": False, + "retain_stdout_expandobject": False, + "retain_schedules_csv": True, + "debug": debug, } - server_dir_cleanup_args.update(workflow_args['server_directory_cleanup']) - - osw['steps'].extend([ - { - 'measure_dir_name': 'HPXMLtoOpenStudio', - 'arguments': { - 'hpxml_path': '../../run/home.xml', - 'output_dir': '../../run', - 'debug': debug, - 'add_component_loads': add_component_loads, - 'skip_validation': True - } - } - ]) - - osw['steps'].extend(workflow_args['measures']) - - osw['steps'].extend([ - { - 'measure_dir_name': 'ReportSimulationOutput', - 'arguments': sim_out_rep_args - }, - { - 'measure_dir_name': 'ReportHPXMLOutput', - 'arguments': {} - }, - { - 'measure_dir_name': 'ReportUtilityBills', - 'arguments': util_bills_rep_args - }, - { - 'measure_dir_name': 'UpgradeCosts', - 'arguments': { - 'debug': debug + server_dir_cleanup_args.update(workflow_args["server_directory_cleanup"]) + + osw["steps"].extend( + [ + { + "measure_dir_name": "HPXMLtoOpenStudio", + "arguments": { + "hpxml_path": "../../run/home.xml", + "output_dir": "../../run", + "debug": debug, + "add_component_loads": add_component_loads, + "skip_validation": True, + }, } - }, - { - 'measure_dir_name': 'ServerDirectoryCleanup', - 'arguments': server_dir_cleanup_args - } - ]) + ] + ) + + osw["steps"].extend(workflow_args["measures"]) + + osw["steps"].extend( + [ + { + "measure_dir_name": "ReportSimulationOutput", + "arguments": sim_out_rep_args, + }, + {"measure_dir_name": "ReportHPXMLOutput", "arguments": {}}, + { + "measure_dir_name": "ReportUtilityBills", + "arguments": util_bills_rep_args, + }, + {"measure_dir_name": "UpgradeCosts", "arguments": {"debug": debug}}, + { + "measure_dir_name": "ServerDirectoryCleanup", + "arguments": server_dir_cleanup_args, + }, + ] + ) if upgrade_idx is not None: - measure_d = self.cfg['upgrades'][upgrade_idx] + measure_d = self.cfg["upgrades"][upgrade_idx] apply_upgrade_measure = { - 'measure_dir_name': 'ApplyUpgrade', - 'arguments': { - 'run_measure': 1 - } + "measure_dir_name": "ApplyUpgrade", + "arguments": {"run_measure": 1}, } - if 'upgrade_name' in measure_d: - apply_upgrade_measure['arguments']['upgrade_name'] = measure_d['upgrade_name'] - for opt_num, option in enumerate(measure_d['options'], 1): - apply_upgrade_measure['arguments']['option_{}'.format(opt_num)] = option['option'] - if 'lifetime' in option: - apply_upgrade_measure['arguments']['option_{}_lifetime'.format(opt_num)] = option['lifetime'] - if 'apply_logic' in option: - apply_upgrade_measure['arguments']['option_{}_apply_logic'.format(opt_num)] = \ - self.make_apply_logic_arg(option['apply_logic']) - for cost_num, cost in enumerate(option.get('costs', []), 1): - for arg in ('value', 'multiplier'): + if "upgrade_name" in measure_d: + apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d[ + "upgrade_name" + ] + for opt_num, option in enumerate(measure_d["options"], 1): + apply_upgrade_measure["arguments"]["option_{}".format(opt_num)] = ( + option["option"] + ) + if "lifetime" in option: + apply_upgrade_measure["arguments"][ + "option_{}_lifetime".format(opt_num) + ] = option["lifetime"] + if "apply_logic" in option: + apply_upgrade_measure["arguments"][ + "option_{}_apply_logic".format(opt_num) + ] = self.make_apply_logic_arg(option["apply_logic"]) + for cost_num, cost in enumerate(option.get("costs", []), 1): 
+ for arg in ("value", "multiplier"): if arg not in cost: continue - apply_upgrade_measure['arguments']['option_{}_cost_{}_{}'.format(opt_num, cost_num, arg)] = \ - cost[arg] - if 'package_apply_logic' in measure_d: - apply_upgrade_measure['arguments']['package_apply_logic'] = \ - self.make_apply_logic_arg(measure_d['package_apply_logic']) - - build_existing_model_idx = \ - [x['measure_dir_name'] == 'BuildExistingModel' for x in osw['steps']].index(True) - osw['steps'].insert(build_existing_model_idx + 1, apply_upgrade_measure) - - if 'reporting_measures' in workflow_args: - for reporting_measure in workflow_args['reporting_measures']: - if 'arguments' not in reporting_measure: - reporting_measure['arguments'] = {} - reporting_measure['measure_type'] = 'ReportingMeasure' - osw['steps'].insert(-1, reporting_measure) # right before ServerDirectoryCleanup + apply_upgrade_measure["arguments"][ + "option_{}_cost_{}_{}".format(opt_num, cost_num, arg) + ] = cost[arg] + if "package_apply_logic" in measure_d: + apply_upgrade_measure["arguments"]["package_apply_logic"] = ( + self.make_apply_logic_arg(measure_d["package_apply_logic"]) + ) + + build_existing_model_idx = [ + x["measure_dir_name"] == "BuildExistingModel" for x in osw["steps"] + ].index(True) + osw["steps"].insert(build_existing_model_idx + 1, apply_upgrade_measure) + + if "reporting_measures" in workflow_args: + for reporting_measure in workflow_args["reporting_measures"]: + if "arguments" not in reporting_measure: + reporting_measure["arguments"] = {} + reporting_measure["measure_type"] = "ReportingMeasure" + osw["steps"].insert( + -1, reporting_measure + ) # right before ServerDirectoryCleanup return osw diff --git a/buildstockbatch/workflow_generator/test_workflow_generator.py b/buildstockbatch/workflow_generator/test_workflow_generator.py index a6171e9b..113c7965 100644 --- a/buildstockbatch/workflow_generator/test_workflow_generator.py +++ b/buildstockbatch/workflow_generator/test_workflow_generator.py @@ -1,265 +1,326 @@ from buildstockbatch.workflow_generator.base import WorkflowGeneratorBase -from buildstockbatch.workflow_generator.residential_hpxml import ResidentialHpxmlWorkflowGenerator -from buildstockbatch.workflow_generator.commercial import CommercialDefaultWorkflowGenerator +from buildstockbatch.workflow_generator.residential_hpxml import ( + ResidentialHpxmlWorkflowGenerator, +) +from buildstockbatch.workflow_generator.commercial import ( + CommercialDefaultWorkflowGenerator, +) from buildstockbatch.test.shared_testing_stuff import resstock_directory def test_apply_logic_recursion(): - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg(['one', 'two', 'three']) - assert apply_logic == '(one&&two&&three)' + apply_logic = WorkflowGeneratorBase.make_apply_logic_arg(["one", "two", "three"]) + assert apply_logic == "(one&&two&&three)" - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({ - 'and': ['one', 'two', 'three'] - }) - assert apply_logic == '(one&&two&&three)' + apply_logic = WorkflowGeneratorBase.make_apply_logic_arg( + {"and": ["one", "two", "three"]} + ) + assert apply_logic == "(one&&two&&three)" - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({ - 'or': ['four', 'five', 'six'] - }) - assert apply_logic == '(four||five||six)' + apply_logic = WorkflowGeneratorBase.make_apply_logic_arg( + {"or": ["four", "five", "six"]} + ) + assert apply_logic == "(four||five||six)" - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({ - 'not': 'seven' - }) - assert apply_logic == '!seven' + 
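Putting the pieces of create_osw together, the step order it produces (for the upgrade case exercised by test_residential_hpxml below, with no extra measures or user reporting measures configured) is:

    expected_order = [
        "BuildExistingModel",
        "ApplyUpgrade",            # inserted afterwards when upgrade_idx is not None
        "HPXMLtoOpenStudio",
        "ReportSimulationOutput",
        "ReportHPXMLOutput",
        "ReportUtilityBills",
        "UpgradeCosts",
        "ServerDirectoryCleanup",  # reporting measures, if any, slot in just before this
    ]
    # [step["measure_dir_name"] for step in osw["steps"]] == expected_order
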
apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({"not": "seven"}) + assert apply_logic == "!seven" - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({ - 'and': [ - {'not': 'abc'}, - {'or': [ - 'def', - 'ghi' - ]}, - 'jkl', - 'mno' - ] - }) - assert apply_logic == '(!abc&&(def||ghi)&&jkl&&mno)' + apply_logic = WorkflowGeneratorBase.make_apply_logic_arg( + {"and": [{"not": "abc"}, {"or": ["def", "ghi"]}, "jkl", "mno"]} + ) + assert apply_logic == "(!abc&&(def||ghi)&&jkl&&mno)" def test_residential_hpxml(mocker): - sim_id = 'bldb1up1' + sim_id = "bldb1up1" building_id = 1 upgrade_idx = 0 cfg = { - 'buildstock_directory': resstock_directory, - 'baseline': { - 'n_buildings_represented': 100 - }, - 'workflow_generator': { - 'type': 'residential_hpxml', - 'args': { - 'build_existing_model': { - 'simulation_control_run_period_begin_month': 2, - 'simulation_control_run_period_begin_day_of_month': 1, - 'simulation_control_run_period_end_month': 2, - 'simulation_control_run_period_end_day_of_month': 28, - 'simulation_control_run_period_calendar_year': 2010, + "buildstock_directory": resstock_directory, + "baseline": {"n_buildings_represented": 100}, + "workflow_generator": { + "type": "residential_hpxml", + "args": { + "build_existing_model": { + "simulation_control_run_period_begin_month": 2, + "simulation_control_run_period_begin_day_of_month": 1, + "simulation_control_run_period_end_month": 2, + "simulation_control_run_period_end_day_of_month": 28, + "simulation_control_run_period_calendar_year": 2010, }, - 'simulation_output_report': { - 'timeseries_frequency': 'hourly', - 'include_timeseries_total_consumptions': True, - 'include_timeseries_end_use_consumptions': True, - 'include_timeseries_total_loads': True, - 'include_timeseries_zone_temperatures': False, - } - } + "simulation_output_report": { + "timeseries_frequency": "hourly", + "include_timeseries_total_consumptions": True, + "include_timeseries_end_use_consumptions": True, + "include_timeseries_total_loads": True, + "include_timeseries_zone_temperatures": False, + }, + }, }, - 'upgrades': [ + "upgrades": [ { - 'options': [ + "options": [ { - 'option': 'Parameter|Option', + "option": "Parameter|Option", } ], } - ] + ], } n_datapoints = 10 osw_gen = ResidentialHpxmlWorkflowGenerator(cfg, n_datapoints) osw = osw_gen.create_osw(sim_id, building_id, upgrade_idx) - steps = osw['steps'] + steps = osw["steps"] assert len(steps) == 8 build_existing_model_step = steps[0] - assert build_existing_model_step['measure_dir_name'] == 'BuildExistingModel' - assert build_existing_model_step['arguments']['simulation_control_run_period_begin_month'] == 2 - assert build_existing_model_step['arguments']['simulation_control_run_period_begin_day_of_month'] == 1 - assert build_existing_model_step['arguments']['simulation_control_run_period_end_month'] == 2 - assert build_existing_model_step['arguments']['simulation_control_run_period_end_day_of_month'] == 28 - assert build_existing_model_step['arguments']['simulation_control_run_period_calendar_year'] == 2010 + assert build_existing_model_step["measure_dir_name"] == "BuildExistingModel" + assert ( + build_existing_model_step["arguments"][ + "simulation_control_run_period_begin_month" + ] + == 2 + ) + assert ( + build_existing_model_step["arguments"][ + "simulation_control_run_period_begin_day_of_month" + ] + == 1 + ) + assert ( + build_existing_model_step["arguments"][ + "simulation_control_run_period_end_month" + ] + == 2 + ) + assert ( + build_existing_model_step["arguments"][ + 
"simulation_control_run_period_end_day_of_month" + ] + == 28 + ) + assert ( + build_existing_model_step["arguments"][ + "simulation_control_run_period_calendar_year" + ] + == 2010 + ) apply_upgrade_step = steps[1] - assert apply_upgrade_step['measure_dir_name'] == 'ApplyUpgrade' + assert apply_upgrade_step["measure_dir_name"] == "ApplyUpgrade" hpxml_to_os_step = steps[2] - assert hpxml_to_os_step['measure_dir_name'] == 'HPXMLtoOpenStudio' + assert hpxml_to_os_step["measure_dir_name"] == "HPXMLtoOpenStudio" simulation_output_step = steps[3] - assert simulation_output_step['measure_dir_name'] == 'ReportSimulationOutput' - assert simulation_output_step['arguments']['timeseries_frequency'] == 'hourly' - assert simulation_output_step['arguments']['include_annual_total_consumptions'] is True - assert simulation_output_step['arguments']['include_annual_fuel_consumptions'] is True - assert simulation_output_step['arguments']['include_annual_end_use_consumptions'] is True - assert simulation_output_step['arguments']['include_annual_system_use_consumptions'] is False - assert simulation_output_step['arguments']['include_annual_emissions'] is True - assert simulation_output_step['arguments']['include_annual_emission_fuels'] is True - assert simulation_output_step['arguments']['include_annual_emission_end_uses'] is True - assert simulation_output_step['arguments']['include_annual_total_loads'] is True - assert simulation_output_step['arguments']['include_annual_unmet_hours'] is True - assert simulation_output_step['arguments']['include_annual_peak_fuels'] is True - assert simulation_output_step['arguments']['include_annual_peak_loads'] is True - assert simulation_output_step['arguments']['include_annual_component_loads'] is True - assert simulation_output_step['arguments']['include_annual_hot_water_uses'] is True - assert simulation_output_step['arguments']['include_annual_hvac_summary'] is True - assert simulation_output_step['arguments']['include_annual_resilience'] is True - assert simulation_output_step['arguments']['include_timeseries_total_consumptions'] is True - assert simulation_output_step['arguments']['include_timeseries_fuel_consumptions'] is False - assert simulation_output_step['arguments']['include_timeseries_end_use_consumptions'] is True - assert simulation_output_step['arguments']['include_timeseries_system_use_consumptions'] is False - assert simulation_output_step['arguments']['include_timeseries_emissions'] is False - assert simulation_output_step['arguments']['include_timeseries_emission_fuels'] is False - assert simulation_output_step['arguments']['include_timeseries_emission_end_uses'] is False - assert simulation_output_step['arguments']['include_timeseries_hot_water_uses'] is False - assert simulation_output_step['arguments']['include_timeseries_total_loads'] is True - assert simulation_output_step['arguments']['include_timeseries_component_loads'] is False - assert simulation_output_step['arguments']['include_timeseries_unmet_hours'] is False - assert simulation_output_step['arguments']['include_timeseries_zone_temperatures'] is False - assert simulation_output_step['arguments']['include_timeseries_airflows'] is False - assert simulation_output_step['arguments']['include_timeseries_weather'] is False - assert simulation_output_step['arguments']['include_timeseries_resilience'] is False - assert simulation_output_step['arguments']['timeseries_timestamp_convention'] == 'end' - assert simulation_output_step['arguments']['timeseries_num_decimal_places'] == 3 - assert 
simulation_output_step['arguments']['add_timeseries_dst_column'] is True - assert simulation_output_step['arguments']['add_timeseries_utc_column'] is True + assert simulation_output_step["measure_dir_name"] == "ReportSimulationOutput" + assert simulation_output_step["arguments"]["timeseries_frequency"] == "hourly" + assert ( + simulation_output_step["arguments"]["include_annual_total_consumptions"] is True + ) + assert ( + simulation_output_step["arguments"]["include_annual_fuel_consumptions"] is True + ) + assert ( + simulation_output_step["arguments"]["include_annual_end_use_consumptions"] + is True + ) + assert ( + simulation_output_step["arguments"]["include_annual_system_use_consumptions"] + is False + ) + assert simulation_output_step["arguments"]["include_annual_emissions"] is True + assert simulation_output_step["arguments"]["include_annual_emission_fuels"] is True + assert ( + simulation_output_step["arguments"]["include_annual_emission_end_uses"] is True + ) + assert simulation_output_step["arguments"]["include_annual_total_loads"] is True + assert simulation_output_step["arguments"]["include_annual_unmet_hours"] is True + assert simulation_output_step["arguments"]["include_annual_peak_fuels"] is True + assert simulation_output_step["arguments"]["include_annual_peak_loads"] is True + assert simulation_output_step["arguments"]["include_annual_component_loads"] is True + assert simulation_output_step["arguments"]["include_annual_hot_water_uses"] is True + assert simulation_output_step["arguments"]["include_annual_hvac_summary"] is True + assert simulation_output_step["arguments"]["include_annual_resilience"] is True + assert ( + simulation_output_step["arguments"]["include_timeseries_total_consumptions"] + is True + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_fuel_consumptions"] + is False + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_end_use_consumptions"] + is True + ) + assert ( + simulation_output_step["arguments"][ + "include_timeseries_system_use_consumptions" + ] + is False + ) + assert simulation_output_step["arguments"]["include_timeseries_emissions"] is False + assert ( + simulation_output_step["arguments"]["include_timeseries_emission_fuels"] + is False + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_emission_end_uses"] + is False + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_hot_water_uses"] + is False + ) + assert simulation_output_step["arguments"]["include_timeseries_total_loads"] is True + assert ( + simulation_output_step["arguments"]["include_timeseries_component_loads"] + is False + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_unmet_hours"] is False + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_zone_temperatures"] + is False + ) + assert simulation_output_step["arguments"]["include_timeseries_airflows"] is False + assert simulation_output_step["arguments"]["include_timeseries_weather"] is False + assert simulation_output_step["arguments"]["include_timeseries_resilience"] is False + assert ( + simulation_output_step["arguments"]["timeseries_timestamp_convention"] == "end" + ) + assert simulation_output_step["arguments"]["timeseries_num_decimal_places"] == 3 + assert simulation_output_step["arguments"]["add_timeseries_dst_column"] is True + assert simulation_output_step["arguments"]["add_timeseries_utc_column"] is True hpxml_output_step = steps[4] - assert hpxml_output_step['measure_dir_name'] == 
'ReportHPXMLOutput' + assert hpxml_output_step["measure_dir_name"] == "ReportHPXMLOutput" utility_bills_step = steps[5] - assert utility_bills_step['measure_dir_name'] == 'ReportUtilityBills' - assert utility_bills_step['arguments']['include_annual_bills'] is True - assert utility_bills_step['arguments']['include_monthly_bills'] is False + assert utility_bills_step["measure_dir_name"] == "ReportUtilityBills" + assert utility_bills_step["arguments"]["include_annual_bills"] is True + assert utility_bills_step["arguments"]["include_monthly_bills"] is False upgrade_costs_step = steps[6] - assert upgrade_costs_step['measure_dir_name'] == 'UpgradeCosts' + assert upgrade_costs_step["measure_dir_name"] == "UpgradeCosts" server_dir_cleanup_step = steps[7] - assert server_dir_cleanup_step['measure_dir_name'] == 'ServerDirectoryCleanup' + assert server_dir_cleanup_step["measure_dir_name"] == "ServerDirectoryCleanup" def test_com_default_workflow_generator_basic(mocker): - sim_id = 'bldb1up1' + sim_id = "bldb1up1" building_id = 1 upgrade_idx = None cfg = { - 'baseline': { - 'n_buildings_represented': 100 - }, - 'workflow_generator': { - 'type': 'commercial_default', - 'args': { - } - } + "baseline": {"n_buildings_represented": 100}, + "workflow_generator": {"type": "commercial_default", "args": {}}, } CommercialDefaultWorkflowGenerator.validate(cfg) osw_gen = CommercialDefaultWorkflowGenerator(cfg, 10) osw = osw_gen.create_osw(sim_id, building_id, upgrade_idx) # Should always get BuildExistingModel - reporting_measure_step = osw['steps'][0] - assert reporting_measure_step['measure_dir_name'] == 'BuildExistingModel' - assert reporting_measure_step['arguments']['number_of_buildings_represented'] == 1 - assert reporting_measure_step['measure_type'] == 'ModelMeasure' + reporting_measure_step = osw["steps"][0] + assert reporting_measure_step["measure_dir_name"] == "BuildExistingModel" + assert reporting_measure_step["arguments"]["number_of_buildings_represented"] == 1 + assert reporting_measure_step["measure_type"] == "ModelMeasure" # Should not get TimeseriesCSVExport if excluded in args - assert len(osw['steps']) == 1 + assert len(osw["steps"]) == 1 def test_com_default_workflow_generator_with_timeseries(mocker): - sim_id = 'bldb1up1' + sim_id = "bldb1up1" building_id = 1 upgrade_idx = None cfg = { - 'baseline': { - 'n_buildings_represented': 100 - }, - 'workflow_generator': { - 'type': 'commercial_default', - 'args': { - 'timeseries_csv_export': { - 'reporting_frequency': 'Hourly', - 'inc_output_variables': 'true' + "baseline": {"n_buildings_represented": 100}, + "workflow_generator": { + "type": "commercial_default", + "args": { + "timeseries_csv_export": { + "reporting_frequency": "Hourly", + "inc_output_variables": "true", } - } - } + }, + }, } CommercialDefaultWorkflowGenerator.validate(cfg) osw_gen = CommercialDefaultWorkflowGenerator(cfg, 10) osw = osw_gen.create_osw(sim_id, building_id, upgrade_idx) # Should always get BuildExistingModel - reporting_measure_step = osw['steps'][0] - assert reporting_measure_step['measure_dir_name'] == 'BuildExistingModel' - assert reporting_measure_step['arguments']['number_of_buildings_represented'] == 1 - assert reporting_measure_step['measure_type'] == 'ModelMeasure' + reporting_measure_step = osw["steps"][0] + assert reporting_measure_step["measure_dir_name"] == "BuildExistingModel" + assert reporting_measure_step["arguments"]["number_of_buildings_represented"] == 1 + assert reporting_measure_step["measure_type"] == "ModelMeasure" # Should get 
TimeseriesCSVExport if included in args - reporting_measure_step = osw['steps'][1] - assert reporting_measure_step['measure_dir_name'] == 'TimeseriesCSVExport' - assert reporting_measure_step['measure_type'] == 'ReportingMeasure' - assert reporting_measure_step['arguments']['reporting_frequency'] == 'Hourly' - assert reporting_measure_step['arguments']['inc_output_variables'] == 'true' + reporting_measure_step = osw["steps"][1] + assert reporting_measure_step["measure_dir_name"] == "TimeseriesCSVExport" + assert reporting_measure_step["measure_type"] == "ReportingMeasure" + assert reporting_measure_step["arguments"]["reporting_frequency"] == "Hourly" + assert reporting_measure_step["arguments"]["inc_output_variables"] == "true" def test_com_default_workflow_generator_extended(mocker): - sim_id = 'bldb1up1' + sim_id = "bldb1up1" building_id = 1 upgrade_idx = None cfg = { - 'baseline': { - 'n_buildings_represented': 100 - }, - 'workflow_generator': { - 'type': 'commercial_default', - 'args': { - 'reporting_measures': [ - {'measure_dir_name': 'f8e23017-894d-4bdf-977f-37e3961e6f42', 'arguments': { - 'building_summary_section': 'true', - 'annual_overview_section': 'true', - 'monthly_overview_section': 'true', - 'utility_bills_rates_section': 'true', - 'envelope_section_section': 'true', - 'space_type_breakdown_section': 'true', - 'space_type_details_section': 'true', - 'interior_lighting_section': 'true', - 'plug_loads_section': 'true', - 'exterior_light_section': 'true', - 'water_use_section': 'true', - 'hvac_load_profile': 'true', - 'zone_condition_section': 'true', - 'zone_summary_section': 'true', - 'zone_equipment_detail_section': 'true', - 'air_loops_detail_section': 'true', - 'plant_loops_detail_section': 'true', - 'outdoor_air_section': 'true', - 'cost_summary_section': 'true', - 'source_energy_section': 'true', - 'schedules_overview_section': 'true' - }}, - {'measure_dir_name': 'SimulationOutputReport'}, - {'measure_dir_name': 'comstock_sensitivity_reports'}, - {'measure_dir_name': 'qoi_report'}, - {'measure_dir_name': 'la_100_qaqc', 'arguments': {'run_qaqc': 'true'}}, - {'measure_dir_name': 'simulation_settings_check', 'arguments': {'run_sim_settings_checks': 'true'}}, - {'measure_dir_name': 'run_directory_cleanup'}, + "baseline": {"n_buildings_represented": 100}, + "workflow_generator": { + "type": "commercial_default", + "args": { + "reporting_measures": [ + { + "measure_dir_name": "f8e23017-894d-4bdf-977f-37e3961e6f42", + "arguments": { + "building_summary_section": "true", + "annual_overview_section": "true", + "monthly_overview_section": "true", + "utility_bills_rates_section": "true", + "envelope_section_section": "true", + "space_type_breakdown_section": "true", + "space_type_details_section": "true", + "interior_lighting_section": "true", + "plug_loads_section": "true", + "exterior_light_section": "true", + "water_use_section": "true", + "hvac_load_profile": "true", + "zone_condition_section": "true", + "zone_summary_section": "true", + "zone_equipment_detail_section": "true", + "air_loops_detail_section": "true", + "plant_loops_detail_section": "true", + "outdoor_air_section": "true", + "cost_summary_section": "true", + "source_energy_section": "true", + "schedules_overview_section": "true", + }, + }, + {"measure_dir_name": "SimulationOutputReport"}, + {"measure_dir_name": "comstock_sensitivity_reports"}, + {"measure_dir_name": "qoi_report"}, + { + "measure_dir_name": "la_100_qaqc", + "arguments": {"run_qaqc": "true"}, + }, + { + "measure_dir_name": 
"simulation_settings_check", + "arguments": {"run_sim_settings_checks": "true"}, + }, + {"measure_dir_name": "run_directory_cleanup"}, ], - 'timeseries_csv_export': { - 'reporting_frequency': 'Hourly', - 'inc_output_variables': 'true' - } - } - } + "timeseries_csv_export": { + "reporting_frequency": "Hourly", + "inc_output_variables": "true", + }, + }, + }, } CommercialDefaultWorkflowGenerator.validate(cfg) @@ -268,23 +329,28 @@ def test_com_default_workflow_generator_extended(mocker): osw = osw_gen.create_osw(sim_id, building_id, upgrade_idx) # Should always get SimulationOutputReport - reporting_measure_step = osw['steps'][3] - assert reporting_measure_step['measure_dir_name'] == 'SimulationOutputReport' - assert reporting_measure_step['measure_type'] == 'ReportingMeasure' - assert reporting_measure_step['arguments'] == {} + reporting_measure_step = osw["steps"][3] + assert reporting_measure_step["measure_dir_name"] == "SimulationOutputReport" + assert reporting_measure_step["measure_type"] == "ReportingMeasure" + assert reporting_measure_step["arguments"] == {} # Should only be one instance of SimulationOutputReport - assert [d['measure_dir_name'] == 'SimulationOutputReport' for d in osw['steps']].count(True) == 1 + assert [ + d["measure_dir_name"] == "SimulationOutputReport" for d in osw["steps"] + ].count(True) == 1 # Should get TimeseriesCSVExport if included in args - reporting_measure_step = osw['steps'][1] - assert reporting_measure_step['measure_dir_name'] == 'TimeseriesCSVExport' - assert reporting_measure_step['measure_type'] == 'ReportingMeasure' - assert reporting_measure_step['arguments']['reporting_frequency'] == 'Hourly' - assert reporting_measure_step['arguments']['inc_output_variables'] == 'true' + reporting_measure_step = osw["steps"][1] + assert reporting_measure_step["measure_dir_name"] == "TimeseriesCSVExport" + assert reporting_measure_step["measure_type"] == "ReportingMeasure" + assert reporting_measure_step["arguments"]["reporting_frequency"] == "Hourly" + assert reporting_measure_step["arguments"]["inc_output_variables"] == "true" # Should have the openstudio report - reporting_measure_step = osw['steps'][2] - assert reporting_measure_step['measure_dir_name'] == 'f8e23017-894d-4bdf-977f-37e3961e6f42' - assert reporting_measure_step['measure_type'] == 'ReportingMeasure' - assert reporting_measure_step['arguments']['building_summary_section'] == 'true' - assert reporting_measure_step['arguments']['schedules_overview_section'] == 'true' + reporting_measure_step = osw["steps"][2] + assert ( + reporting_measure_step["measure_dir_name"] + == "f8e23017-894d-4bdf-977f-37e3961e6f42" + ) + assert reporting_measure_step["measure_type"] == "ReportingMeasure" + assert reporting_measure_step["arguments"]["building_summary_section"] == "true" + assert reporting_measure_step["arguments"]["schedules_overview_section"] == "true" # Should have 1 workflow measure plus 9 reporting measures - assert len(osw['steps']) == 9 + assert len(osw["steps"]) == 9 diff --git a/docs/conf.py b/docs/conf.py index ceb6b474..45c44c52 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,19 +20,21 @@ here = os.path.abspath(os.path.dirname(__file__)) metadata = {} -with open(os.path.join(here, '..', 'buildstockbatch', '__version__.py'), 'r', encoding='utf-8') as f: +with open( + os.path.join(here, "..", "buildstockbatch", "__version__.py"), "r", encoding="utf-8" +) as f: exec(f.read(), metadata) # -- Project information ----------------------------------------------------- -project = 
metadata['__title__'] -copyright = metadata['__copyright__'] -author = metadata['__author__'] +project = metadata["__title__"] +copyright = metadata["__copyright__"] +author = metadata["__author__"] # The short X.Y version -version = metadata['__version__'] +version = metadata["__version__"] # The full version, including alpha/beta/rc tags -release = metadata['__version__'] +release = metadata["__version__"] # -- General configuration --------------------------------------------------- @@ -45,12 +47,12 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.todo', - 'sphinx.ext.mathjax', - 'sphinxcontrib.programoutput', - 'changelog', - 'sphinx_paramlinks' + "sphinx.ext.autodoc", + "sphinx.ext.todo", + "sphinx.ext.mathjax", + "sphinxcontrib.programoutput", + "changelog", + "sphinx_paramlinks", ] changelog_sections = [ @@ -64,16 +66,10 @@ "local", "aws", "postprocessing", - "documentation" + "documentation", ] # tags to sort on inside of sections -changelog_inner_tag_sort = [ - "feature", - "changed", - "removed", - "bug", - "moved" -] +changelog_inner_tag_sort = ["feature", "changed", "removed", "bug", "moved"] # how to render changelog links @@ -84,28 +80,28 @@ } # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = 'en' +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None @@ -116,8 +112,10 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' -html_theme_path = ["_themes", ] +html_theme = "sphinx_rtd_theme" +html_theme_path = [ + "_themes", +] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -144,7 +142,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'BuildStockBatchdoc' +htmlhelp_basename = "BuildStockBatchdoc" # -- Options for LaTeX output ------------------------------------------------ @@ -153,15 +151,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -171,8 +166,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). 
latex_documents = [ - (master_doc, 'BuildStockBatch.tex', 'BuildStock Batch Documentation', - 'Noel Merket (NREL)', 'manual'), + ( + master_doc, + "BuildStockBatch.tex", + "BuildStock Batch Documentation", + "Noel Merket (NREL)", + "manual", + ), ] @@ -181,8 +181,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'buildstockbatch', 'BuildStock Batch Documentation', - [author], 1) + (master_doc, "buildstockbatch", "BuildStock Batch Documentation", [author], 1) ] @@ -192,9 +191,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'BuildStockBatch', 'BuildStock Batch Documentation', - author, 'BuildStockBatch', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "BuildStockBatch", + "BuildStock Batch Documentation", + author, + "BuildStockBatch", + "One line description of project.", + "Miscellaneous", + ), ] @@ -213,7 +218,7 @@ # epub_uid = '' # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # -- Extension configuration ------------------------------------------------- @@ -227,14 +232,14 @@ # Ignore reference targets not found nitpick_ignore = [ - ('py:func', 'BuildStockBatchBase.validate_precomputed_sample'), - ('py:func', 'BuildStockBatchBase.validate_xor_nor_schema_keys'), - ('py:func', 'EagleBatch.run_building'), - ('py:class', 'sampler.CommercialSobolSingularitySampler'), - ('py:class', 'sampler.CommercialSobolDockerSampler'), - ('py:class', 'workflow_generator.CommercialDefaultWorkflowGenerator'), - ('py:class', 'sampler.PrecomputedSampler'), - ('py:class', 'sampler.BuildStockSampler'), - ('py:class', 'BuildStockBatchBase'), - ('py:func', 'BuildStockBatchBase.run_sampling') + ("py:func", "BuildStockBatchBase.validate_precomputed_sample"), + ("py:func", "BuildStockBatchBase.validate_xor_nor_schema_keys"), + ("py:func", "EagleBatch.run_building"), + ("py:class", "sampler.CommercialSobolSingularitySampler"), + ("py:class", "sampler.CommercialSobolDockerSampler"), + ("py:class", "workflow_generator.CommercialDefaultWorkflowGenerator"), + ("py:class", "sampler.PrecomputedSampler"), + ("py:class", "sampler.BuildStockSampler"), + ("py:class", "BuildStockBatchBase"), + ("py:func", "BuildStockBatchBase.run_sampling"), ] diff --git a/setup.py b/setup.py index 1cd2c230..5e45e2f3 100644 --- a/setup.py +++ b/setup.py @@ -8,81 +8,80 @@ here = os.path.abspath(os.path.dirname(__file__)) metadata = {} -with open(os.path.join(here, 'buildstockbatch', '__version__.py'), 'r', encoding='utf-8') as f: +with open( + os.path.join(here, "buildstockbatch", "__version__.py"), "r", encoding="utf-8" +) as f: exec(f.read(), metadata) -with open('README.md', 'r', 'utf-8') as f: +with open("README.md", "r", "utf-8") as f: readme = f.read() setuptools.setup( - name=metadata['__title__'], - version=metadata['__version__'], - author=metadata['__author__'], - author_email=metadata['__author_email__'], - description=metadata['__description__'], + name=metadata["__title__"], + version=metadata["__version__"], + author=metadata["__author__"], + author_email=metadata["__author_email__"], + description=metadata["__description__"], long_description=readme, - long_description_content_type='text/markdown', - url=metadata['__url__'], + long_description_content_type="text/markdown", + url=metadata["__url__"], packages=setuptools.find_packages(), 
- python_requires='>=3.8', - package_data={ - 'buildstockbatch': ['*.sh', 'schemas/*.yaml'], - '': ['LICENSE'] - }, + python_requires=">=3.8", + package_data={"buildstockbatch": ["*.sh", "schemas/*.yaml"], "": ["LICENSE"]}, install_requires=[ - 'pyyaml', - 'requests', - 'numpy', - 'pandas>=2', - 'joblib', - 'pyarrow', - 'dask[complete]>=2022.10.0', - 'docker', - 's3fs[boto3]', - 'fsspec', - 'yamale', - 'ruamel.yaml', - 'lxml', - 'semver', - 'tqdm', + "pyyaml", + "requests", + "numpy", + "pandas>=2", + "joblib", + "pyarrow", + "dask[complete]>=2022.10.0", + "docker", + "s3fs[boto3]", + "fsspec", + "yamale", + "ruamel.yaml", + "lxml", + "semver", + "tqdm", ], extras_require={ - 'dev': [ - 'pytest', - 'pytest-mock', - 'pytest-cov', - 'testfixtures', - 'Sphinx', - 'sphinx_rtd_theme>=1.1.0', - 'sphinx-autobuild', - 'sphinxcontrib-programoutput', - 'sphinx_paramlinks', - 'changelog', - 'flake8', - 'rope', - 'doc8' + "dev": [ + "pytest", + "pytest-mock", + "pytest-cov", + "testfixtures", + "Sphinx", + "sphinx_rtd_theme>=1.1.0", + "sphinx-autobuild", + "sphinxcontrib-programoutput", + "sphinx_paramlinks", + "changelog", + "flake8", + "rope", + "doc8", + ], + "aws": [ + "dask-cloudprovider[aws]", ], - 'aws': [ - 'dask-cloudprovider[aws]', - ] }, entry_points={ - 'console_scripts': [ - 'buildstock_local=buildstockbatch.local:main', - 'buildstock_eagle=buildstockbatch.eagle:user_cli', - 'buildstock_aws=buildstockbatch.aws.aws:main' + "console_scripts": [ + "buildstock_local=buildstockbatch.local:main", + "buildstock_eagle=buildstockbatch.eagle:user_cli", + "buildstock_aws=buildstockbatch.aws.aws:main", ] }, - license='BSD-3', + license="BSD-3", classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: BSD License', - 'Natural Language :: English', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11' - ] + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], ) From 5c60c71d4fc72412d521160252d6556e1558c0b9 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Tue, 31 Oct 2023 09:18:05 -0600 Subject: [PATCH 33/53] updating line length for black --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..55ec8d78 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.black] +line-length = 120 From 3ce824991198942dfcbdb18825f0d17182b0f9ee Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Mon, 29 Jan 2024 16:44:20 +0000 Subject: [PATCH 34/53] reformatting with black and a longer line length --- buildstockbatch/__version__.py | 4 +- buildstockbatch/aws/aws.py | 308 +++++------------- buildstockbatch/aws/awsbase.py | 38 +-- buildstockbatch/base.py | 264 ++++----------- buildstockbatch/eagle.py | 170 +++------- buildstockbatch/local.py | 75 ++--- buildstockbatch/postprocessing.py | 227 +++---------- buildstockbatch/sampler/base.py | 12 +- buildstockbatch/sampler/commercial_sobol.py | 40 +-- buildstockbatch/sampler/downselect.py | 31 +- 
buildstockbatch/sampler/residential_quota.py | 9 +- buildstockbatch/test/conftest.py | 14 +- buildstockbatch/test/shared_testing_stuff.py | 4 +- buildstockbatch/test/test_base.py | 90 ++--- buildstockbatch/test/test_eagle.py | 134 ++------ buildstockbatch/test/test_local.py | 33 +- buildstockbatch/test/test_postprocessing.py | 26 +- buildstockbatch/test/test_validation.py | 96 ++---- buildstockbatch/utils.py | 29 +- .../workflow_generator/commercial.py | 32 +- .../workflow_generator/residential_hpxml.py | 79 ++--- .../test_workflow_generator.py | 129 ++------ docs/conf.py | 12 +- setup.py | 4 +- 24 files changed, 438 insertions(+), 1422 deletions(-) diff --git a/buildstockbatch/__version__.py b/buildstockbatch/__version__.py index b5750e31..b52fb766 100644 --- a/buildstockbatch/__version__.py +++ b/buildstockbatch/__version__.py @@ -9,6 +9,4 @@ __author__ = "Noel Merket" __author_email__ = "noel.merket@nrel.gov" __license__ = "BSD-3" -__copyright__ = "Copyright {} The Alliance for Sustainable Energy".format( - dt.date.today().year -) +__copyright__ = "Copyright {} The Alliance for Sustainable Energy".format(dt.date.today().year) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 5d02493b..d722d184 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -65,9 +65,7 @@ def backoff(thefunc, *args, **kwargs): caught_error = False for pat in error_patterns: if re.search(pat, error_code): - logger.debug( - f"{error_code}: Waiting and retrying in {delay} seconds" - ) + logger.debug(f"{error_code}: Waiting and retrying in {delay} seconds") caught_error = True time.sleep(delay) delay *= backoff_mult @@ -93,9 +91,7 @@ def filename_generator(): if filename.startswith("."): continue local_filepath = pathlib.Path(dirpath, filename) - s3_key = pathlib.PurePosixPath( - prefix, local_filepath.relative_to(local_dir_abs) - ) + s3_key = pathlib.PurePosixPath(prefix, local_filepath.relative_to(local_dir_abs)) yield local_filepath, s3_key logger.debug("Uploading {} => {}/{}".format(local_dir_abs, bucket, prefix)) @@ -138,9 +134,7 @@ def __init__(self, job_name, aws_config, boto3_session): self.batch = self.session.client("batch", config=boto_client_config) self.ec2 = self.session.client("ec2", config=boto_client_config) self.ec2r = self.session.resource("ec2", config=boto_client_config) - self.step_functions = self.session.client( - "stepfunctions", config=boto_client_config - ) + self.step_functions = self.session.client("stepfunctions", config=boto_client_config) self.aws_lambda = self.session.client("lambda", config=boto_client_config) self.s3 = self.session.client("s3", config=boto_client_config) self.s3_res = self.session.resource("s3", config=boto_client_config) @@ -276,9 +270,7 @@ def create_vpc(self): # Create the public subnet - pub_response = self.ec2.create_subnet( - CidrBlock=self.pub_subnet_cidr, VpcId=self.vpc_id - ) + pub_response = self.ec2.create_subnet(CidrBlock=self.pub_subnet_cidr, VpcId=self.vpc_id) logger.info("EIP allocated.") @@ -311,9 +303,7 @@ def create_vpc(self): # Create an internet gateway - self.ec2.attach_internet_gateway( - InternetGatewayId=self.internet_gateway_id, VpcId=self.vpc_id - ) + self.ec2.attach_internet_gateway(InternetGatewayId=self.internet_gateway_id, VpcId=self.vpc_id) logger.info("Internet Gateway attached.") @@ -348,9 +338,7 @@ def create_vpc(self): # Create a NAT Gateway - nat_response = self.ec2.create_nat_gateway( - AllocationId=self.nat_ip_allocation, SubnetId=self.pub_vpc_subnet_id - ) + nat_response = 
self.ec2.create_nat_gateway(AllocationId=self.nat_ip_allocation, SubnetId=self.pub_vpc_subnet_id) self.nat_gateway_id = nat_response["NatGateway"]["NatGatewayId"] @@ -378,14 +366,10 @@ def create_vpc(self): # Associate the private route to the private subnet - self.ec2.associate_route_table( - RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_1 - ) + self.ec2.associate_route_table(RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_1) logger.info("Route table associated with subnet.") - self.ec2.associate_route_table( - RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_2 - ) + self.ec2.associate_route_table(RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_2) logger.info("Route table associated with subnet.") # Associate the NAT gateway with the private route @@ -444,9 +428,7 @@ def create_batch_service_roles(self): self.batch_service_role_name, "batch", f"Service role for Batch environment {self.job_identifier}", - managed_policie_arns=[ - "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole" - ], + managed_policie_arns=["arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole"], ) # Instance Role for Batch compute environment @@ -455,17 +437,13 @@ def create_batch_service_roles(self): self.batch_instance_role_name, "ec2", f"Instance role for Batch compute environment {self.job_identifier}", - managed_policie_arns=[ - "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role" - ], + managed_policie_arns=["arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"], ) # Instance Profile try: - response = self.iam.create_instance_profile( - InstanceProfileName=self.batch_instance_profile_name - ) + response = self.iam.create_instance_profile(InstanceProfileName=self.batch_instance_profile_name) self.instance_profile_arn = response["InstanceProfile"]["Arn"] @@ -479,9 +457,7 @@ def create_batch_service_roles(self): except Exception as e: if "EntityAlreadyExists" in str(e): logger.info("ECS Instance Profile not created - already exists") - response = self.iam.get_instance_profile( - InstanceProfileName=self.batch_instance_profile_name - ) + response = self.iam.get_instance_profile(InstanceProfileName=self.batch_instance_profile_name) self.instance_profile_arn = response["InstanceProfile"]["Arn"] # ECS Task Policy @@ -586,9 +562,7 @@ def create_batch_service_roles(self): self.batch_spot_service_role_name, "spotfleet", f"Spot Fleet role for Batch compute environment {self.job_identifier}", - managed_policie_arns=[ - "arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole" - ], + managed_policie_arns=["arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole"], ) def create_compute_environment(self, maxCPUs=10000): @@ -613,18 +587,13 @@ def create_compute_environment(self, maxCPUs=10000): }, ) except ClientError as error: - if ( - error.response["Error"]["Code"] - == "InvalidLaunchTemplateName.AlreadyExistsException" - ): + if error.response["Error"]["Code"] == "InvalidLaunchTemplateName.AlreadyExistsException": logger.debug("Launch template exists, skipping creation") else: raise error while True: - lt_resp = self.ec2.describe_launch_templates( - LaunchTemplateNames=[self.launch_template_name] - ) + lt_resp = self.ec2.describe_launch_templates(LaunchTemplateNames=[self.launch_template_name]) launch_templates = lt_resp["LaunchTemplates"] next_token = lt_resp.get("NextToken") while next_token: @@ -635,13 +604,9 @@ def create_compute_environment(self, 
maxCPUs=10000):
 launch_templates.extend(lt_resp["LaunchTemplates"])
 next_token = lt_resp.get("NextToken")
 n_launch_templates = len(launch_templates)
-        assert (
-            n_launch_templates <= 1
-        ), f"There are {n_launch_templates} launch templates, this shouldn't happen."
+        assert n_launch_templates <= 1, f"There are {n_launch_templates} launch templates, this shouldn't happen."
 if n_launch_templates == 0:
-            logger.debug(
-                f"Waiting for the launch template {self.launch_template_name} to be created"
-            )
+            logger.debug(f"Waiting for the launch template {self.launch_template_name} to be created")
 time.sleep(5)
 if n_launch_templates == 1:
 break
@@ -673,9 +638,7 @@ def create_compute_environment(self, maxCPUs=10000):
 else:
 compute_resources["type"] = "EC2"

-        compute_resources["tags"] = self.get_tags(
-            Name=f"{self.job_identifier} batch instance"
-        )
+        compute_resources["tags"] = self.get_tags(Name=f"{self.job_identifier} batch instance")

 self.batch.create_compute_environment(
 computeEnvironmentName=self.batch_compute_environment_name,
@@ -686,15 +649,11 @@ def create_compute_environment(self, maxCPUs=10000):
 tags=self.get_tags(),
 )

-        logger.info(
-            f"Compute environment {self.batch_compute_environment_name} created."
-        )
+        logger.info(f"Compute environment {self.batch_compute_environment_name} created.")

 except Exception as e:
 if "Object already exists" in str(e):
-            logger.info(
-                f"Compute environment {self.batch_compute_environment_name} not created - already exists"
-            )
+            logger.info(f"Compute environment {self.batch_compute_environment_name} not created - already exists")
 else:
 raise

@@ -726,9 +685,7 @@ def create_job_queue(self):

 except Exception as e:
 if "Object already exists" in str(e):
-            logger.info(
-                f"Job queue {self.batch_job_queue_name} not created - already exists"
-            )
+            logger.info(f"Job queue {self.batch_job_queue_name} not created - already exists")
 response = self.batch.describe_job_queues(
 jobQueues=[
 self.batch_job_queue_name,
@@ -739,10 +696,7 @@ def create_job_queue(self):

 elif "is not valid" in str(e):
 # Need to wait a second for the compute environment to complete registration
-            logger.warning(
-                "wating a few seconds for compute environment creation: "
-                + str(e)
-            )
+            logger.warning("waiting a few seconds for compute environment creation: " + str(e))
 time.sleep(5)

 else:
@@ -799,10 +753,7 @@ def submit_job(self, array_size=4):

 if "not in VALID state" in str(e):
 # Need to wait a second for the compute environment to complete registration
-            logger.warning(
-                "5 second sleep initiated to wait for job queue creation due to error: "
-                + str(e)
-            )
+            logger.warning("5 second sleep initiated to wait for job queue creation due to error: " + str(e))
 time.sleep(5)
 else:
 raise
@@ -843,36 +794,26 @@ def clean(self):
 default_group_id = group["GroupId"]
 dsg = self.ec2r.SecurityGroup(default_group_id)
 if len(dsg.ip_permissions_egress):
-                response = dsg.revoke_egress(
-                    IpPermissions=dsg.ip_permissions_egress
-                )
+                response = dsg.revoke_egress(IpPermissions=dsg.ip_permissions_egress)

 try:
-            self.batch.update_job_queue(
-                jobQueue=self.batch_job_queue_name, state="DISABLED"
-            )
+            self.batch.update_job_queue(jobQueue=self.batch_job_queue_name, state="DISABLED")

 while True:
 try:
-                    response = self.batch.delete_job_queue(
-                        jobQueue=self.batch_job_queue_name
-                    )
+                    response = self.batch.delete_job_queue(jobQueue=self.batch_job_queue_name)
 logger.info(f"Job queue {self.batch_job_queue_name} deleted.")
 break
 except Exception as e:
 if "Cannot delete, resource is being modified" in str(e):
-
logger.info( - "Job queue being modified - sleeping until ready..." - ) + logger.info("Job queue being modified - sleeping until ready...") time.sleep(5) else: raise except Exception as e: if "does not exist" in str(e): - logger.info( - f"Job queue {self.batch_job_queue_name} missing, skipping..." - ) + logger.info(f"Job queue {self.batch_job_queue_name} missing, skipping...") # Delete compute enviornment @@ -887,38 +828,26 @@ def clean(self): response = self.batch.delete_compute_environment( computeEnvironment=self.batch_compute_environment_name ) - logger.info( - f"Compute environment {self.batch_compute_environment_name} deleted." - ) + logger.info(f"Compute environment {self.batch_compute_environment_name} deleted.") break except Exception as e: - if "Cannot delete, resource is being modified" in str( - e - ) or "found existing JobQueue" in str(e): - logger.info( - "Compute environment being modified - sleeping until ready..." - ) + if "Cannot delete, resource is being modified" in str(e) or "found existing JobQueue" in str(e): + logger.info("Compute environment being modified - sleeping until ready...") time.sleep(5) else: raise except Exception as e: if "does not exist" in str(e): - logger.info( - f"Compute environment {self.batch_compute_environment_name} missing, skipping..." - ) + logger.info(f"Compute environment {self.batch_compute_environment_name} missing, skipping...") else: raise # Delete Launch Template try: - self.ec2.delete_launch_template( - LaunchTemplateName=self.launch_template_name - ) + self.ec2.delete_launch_template(LaunchTemplateName=self.launch_template_name) except Exception as e: if "does not exist" in str(e): - logger.info( - f"Launch template {self.launch_template_name} does not exist, skipping..." - ) + logger.info(f"Launch template {self.launch_template_name} does not exist, skipping...") else: raise @@ -926,9 +855,7 @@ def clean(self): self.iam_helper.delete_role(self.batch_spot_service_role_name) self.iam_helper.delete_role(self.batch_ecs_task_role_name) # Instance profile order of removal - self.iam_helper.remove_role_from_instance_profile( - self.batch_instance_profile_name - ) + self.iam_helper.remove_role_from_instance_profile(self.batch_instance_profile_name) self.iam_helper.delete_role(self.batch_instance_role_name) self.iam_helper.delete_instance_profile(self.batch_instance_profile_name) @@ -948,9 +875,7 @@ def clean(self): for vpc in response["Vpcs"]: this_vpc = vpc["VpcId"] - s3gw_response = self.ec2.describe_vpc_endpoints( - Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] - ) + s3gw_response = self.ec2.describe_vpc_endpoints(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) for s3gw in s3gw_response["VpcEndpoints"]: this_s3gw = s3gw["VpcEndpointId"] @@ -958,9 +883,7 @@ def clean(self): if s3gw["State"] != "deleted": self.ec2.delete_vpc_endpoints(VpcEndpointIds=[this_s3gw]) - ng_response = self.ec2.describe_nat_gateways( - Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] - ) + ng_response = self.ec2.describe_nat_gateways(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) for natgw in ng_response["NatGateways"]: this_natgw = natgw["NatGatewayId"] @@ -968,9 +891,7 @@ def clean(self): if natgw["State"] != "deleted": self.ec2.delete_nat_gateway(NatGatewayId=this_natgw) - rtas_response = self.ec2.describe_route_tables( - Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] - ) + rtas_response = self.ec2.describe_route_tables(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) for route_table in rtas_response["RouteTables"]: route_table_id = 
route_table["RouteTableId"] @@ -982,9 +903,7 @@ def clean(self): rt_counter = 10 while rt_counter: try: - response = self.ec2.delete_route_table( - RouteTableId=route_table_id - ) + response = self.ec2.delete_route_table(RouteTableId=route_table_id) logger.info("Route table removed.") break except Exception as e: @@ -1008,20 +927,14 @@ def clean(self): try: try: self.ec2.detach_internet_gateway( - InternetGatewayId=internet_gateway[ - "InternetGatewayId" - ], + InternetGatewayId=internet_gateway["InternetGatewayId"], VpcId=attachment["VpcId"], ) except Exception as e: - logger.info( - f"Error on Internet Gateway disassociation - ignoring... {str(e)}" - ) + logger.info(f"Error on Internet Gateway disassociation - ignoring... {str(e)}") self.ec2.delete_internet_gateway( - InternetGatewayId=internet_gateway[ - "InternetGatewayId" - ] + InternetGatewayId=internet_gateway["InternetGatewayId"] ) logger.info("Internet Gateway deleted.") break @@ -1035,9 +948,7 @@ def clean(self): else: raise - subn_response = self.ec2.describe_subnets( - Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] - ) + subn_response = self.ec2.describe_subnets(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) for subnet in subn_response["Subnets"]: while True: @@ -1046,9 +957,7 @@ def clean(self): break except Exception as e: if "DependencyViolation" in str(e): - logger.info( - "Subnet cannot be deleted as dependencies are still being deleted. Sleeping..." - ) + logger.info("Subnet cannot be deleted as dependencies are still being deleted. Sleeping...") time.sleep(10) else: raise @@ -1091,12 +1000,8 @@ def __init__(self, project_filename): try: self.docker_client.ping() except: # noqa: E722 (allow bare except in this case because error can be a weird non-class Windows API error) - logger.error( - "The docker server did not respond, make sure Docker Desktop is started then retry." - ) - raise RuntimeError( - "The docker server did not respond, make sure Docker Desktop is started then retry." - ) + logger.error("The docker server did not respond, make sure Docker Desktop is started then retry.") + raise RuntimeError("The docker server did not respond, make sure Docker Desktop is started then retry.") @staticmethod def validate_project(project_file): @@ -1112,15 +1017,11 @@ class AwsBatch(DockerBatchBase): def __init__(self, project_filename): super().__init__(project_filename) - self.job_identifier = re.sub( - "[^0-9a-zA-Z]+", "_", self.cfg["aws"]["job_identifier"] - )[:10] + self.job_identifier = re.sub("[^0-9a-zA-Z]+", "_", self.cfg["aws"]["job_identifier"])[:10] self.project_filename = project_filename self.region = self.cfg["aws"]["region"] - self.ecr = boto3.client( - "ecr", region_name=self.region, config=boto_client_config - ) + self.ecr = boto3.client("ecr", region_name=self.region, config=boto_client_config) self.s3 = boto3.client("s3", region_name=self.region, config=boto_client_config) self.s3_bucket = self.cfg["aws"]["s3"]["bucket"] self.s3_bucket_prefix = self.cfg["aws"]["s3"]["prefix"].rstrip("/") @@ -1132,9 +1033,7 @@ def __init__(self, project_filename): def validate_dask_settings(project_file): cfg = get_project_configuration(project_file) if "emr" in cfg["aws"]: - logger.warning( - "The `aws.emr` configuration is no longer used and is ignored. Recommend removing." - ) + logger.warning("The `aws.emr` configuration is no longer used and is ignored. 
Recommend removing.") dask_cfg = cfg["aws"]["dask"] errors = [] mem_rules = { @@ -1147,22 +1046,16 @@ def validate_dask_settings(project_file): for node_type in ("scheduler", "worker"): mem = dask_cfg.get(f"{node_type}_memory", 8 * 1024) if mem % 1024 != 0: - errors.append( - f"`aws.dask.{node_type}_memory` = {mem}, needs to be a multiple of 1024." - ) + errors.append(f"`aws.dask.{node_type}_memory` = {mem}, needs to be a multiple of 1024.") mem_gb = mem // 1024 - min_gb, max_gb, incr_gb = mem_rules[ - dask_cfg.get(f"{node_type}_cpu", 2 * 1024) - ] + min_gb, max_gb, incr_gb = mem_rules[dask_cfg.get(f"{node_type}_cpu", 2 * 1024)] if not (min_gb <= mem_gb <= max_gb and (mem_gb - min_gb) % incr_gb == 0): errors.append( f"`aws.dask.{node_type}_memory` = {mem}, " f"should be between {min_gb * 1024} and {max_gb * 1024} in a multiple of {incr_gb * 1024}." ) if errors: - errors.append( - "See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html" - ) + errors.append("See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html") raise ValidationError("\n".join(errors)) return True @@ -1211,32 +1104,22 @@ def build_image(self): """ root_path = pathlib.Path(os.path.abspath(__file__)).parent.parent.parent if not (root_path / "Dockerfile").exists(): - raise RuntimeError( - f"The needs to be run from the root of the repo, found {root_path}" - ) + raise RuntimeError(f"The needs to be run from the root of the repo, found {root_path}") # Make the buildstock/resources/.aws_docker_image dir to store logs - local_log_dir = os.path.join( - self.buildstock_dir, "resources", ".aws_docker_image" - ) + local_log_dir = os.path.join(self.buildstock_dir, "resources", ".aws_docker_image") if not os.path.exists(local_log_dir): os.makedirs(local_log_dir) # Determine whether or not to build the image with custom gems bundled in if self.cfg.get("baseline", dict()).get("custom_gems", False): # Ensure the custom Gemfile exists in the buildstock dir - local_gemfile_path = os.path.join( - self.buildstock_dir, "resources", "Gemfile" - ) + local_gemfile_path = os.path.join(self.buildstock_dir, "resources", "Gemfile") if not os.path.exists(local_gemfile_path): - raise AttributeError( - f"baseline:custom_gems = True, but did not find Gemfile at {local_gemfile_path}" - ) + raise AttributeError(f"baseline:custom_gems = True, but did not find Gemfile at {local_gemfile_path}") # Copy the custom Gemfile into the buildstockbatch repo - bsb_root = os.path.join( - os.path.abspath(__file__), os.pardir, os.pardir, os.pardir - ) + bsb_root = os.path.join(os.path.abspath(__file__), os.pardir, os.pardir, os.pardir) new_gemfile_path = os.path.join(bsb_root, "Gemfile") shutil.copyfile(local_gemfile_path, new_gemfile_path) logger.info(f"Copying custom Gemfile from {local_gemfile_path}") @@ -1249,9 +1132,7 @@ def build_image(self): # which stops before bundling custom gems into the image stage = "buildstockbatch" - logger.info( - f"Building docker image stage: {stage} from OpenStudio {self.os_version}" - ) + logger.info(f"Building docker image stage: {stage} from OpenStudio {self.os_version}") img, build_logs = self.docker_client.images.build( path=str(root_path), tag=self.docker_image, @@ -1303,22 +1184,16 @@ def push_image(self): """ auth_token = self.ecr.get_authorization_token() dkr_user, dkr_pass = ( - base64.b64decode(auth_token["authorizationData"][0]["authorizationToken"]) - .decode("ascii") - .split(":") + 
base64.b64decode(auth_token["authorizationData"][0]["authorizationToken"]).decode("ascii").split(":") ) repo_url = self.container_repo["repositoryUri"] registry_url = "https://" + repo_url.split("/")[0] - resp = self.docker_client.login( - username=dkr_user, password=dkr_pass, registry=registry_url - ) + resp = self.docker_client.login(username=dkr_user, password=dkr_pass, registry=registry_url) logger.debug(resp) image = self.docker_client.images.get(self.docker_image) image.tag(repo_url, tag=self.job_identifier) last_status = None - for x in self.docker_client.images.push( - repo_url, tag=self.job_identifier, stream=True - ): + for x in self.docker_client.images.push(repo_url, tag=self.job_identifier, stream=True): try: y = json.loads(x) except json.JSONDecodeError: @@ -1335,9 +1210,7 @@ def clean(self): """ logger.info("Beginning cleanup of AWS resources...") - batch_env = AwsBatchEnv( - self.job_identifier, self.cfg["aws"], self.boto3_session - ) + batch_env = AwsBatchEnv(self.job_identifier, self.cfg["aws"], self.boto3_session) batch_env.clean() def run_batch(self): @@ -1354,9 +1227,7 @@ def run_batch(self): buildstock_csv_filename = self.sampler.run_sampling() # Compress and upload assets to S3 - with tempfile.TemporaryDirectory( - prefix="bsb_" - ) as tmpdir, tempfile.TemporaryDirectory( + with tempfile.TemporaryDirectory(prefix="bsb_") as tmpdir, tempfile.TemporaryDirectory( prefix="bsb_" ) as tmp_weather_dir: # noqa: E501 self._weather_dir = tmp_weather_dir @@ -1383,14 +1254,10 @@ def run_batch(self): os.makedirs(weather_path) # Determine the unique weather files - epw_filenames = list( - filter(lambda x: x.endswith(".epw"), os.listdir(self.weather_dir)) - ) + epw_filenames = list(filter(lambda x: x.endswith(".epw"), os.listdir(self.weather_dir))) logger.debug("Calculating hashes for weather files") epw_hashes = Parallel(n_jobs=-1, verbose=9)( - delayed(calc_hash_for_file)( - pathlib.Path(self.weather_dir) / epw_filename - ) + delayed(calc_hash_for_file)(pathlib.Path(self.weather_dir) / epw_filename) for epw_filename in epw_filenames ) unique_epws = collections.defaultdict(list) @@ -1421,14 +1288,10 @@ def run_batch(self): n_sims_per_job = math.ceil(n_sims / self.batch_array_size) n_sims_per_job = max(n_sims_per_job, 2) - logger.debug( - "Number of simulations per array job = {}".format(n_sims_per_job) - ) + logger.debug("Number of simulations per array job = {}".format(n_sims_per_job)) baseline_sims = zip(building_ids, itertools.repeat(None)) - upgrade_sims = itertools.product( - building_ids, range(len(self.cfg.get("upgrades", []))) - ) + upgrade_sims = itertools.product(building_ids, range(len(self.cfg.get("upgrades", [])))) all_sims = list(itertools.chain(baseline_sims, upgrade_sims)) random.shuffle(all_sims) all_sims_iter = iter(all_sims) @@ -1461,9 +1324,7 @@ def run_batch(self): with tarfile.open(tmppath / "jobs.tar.gz", "w:gz") as tf: tf.add(jobs_dir, arcname="jobs") tick = time.time() - tick - logger.debug( - "Done compressing job jsons using gz {:.1f} seconds".format(tick) - ) + logger.debug("Done compressing job jsons using gz {:.1f} seconds".format(tick)) shutil.rmtree(jobs_dir) os.makedirs(tmppath / "results" / "simulation_output") @@ -1489,10 +1350,7 @@ def run_batch(self): logger.debug("Copying weather files on S3") bucket = self.cfg["aws"]["s3"]["bucket"] - Parallel(n_jobs=-1, verbose=9)( - delayed(copy_s3_file)(bucket, src, bucket, dest) - for src, dest in epws_to_copy - ) + Parallel(n_jobs=-1, verbose=9)(delayed(copy_s3_file)(bucket, src, bucket, dest) for src, 
dest in epws_to_copy) # Create the output directories fs = S3FileSystem() @@ -1502,9 +1360,7 @@ def run_batch(self): ) # noqa E501 # Define the batch environment - batch_env = AwsBatchEnv( - self.job_identifier, self.cfg["aws"], self.boto3_session - ) + batch_env = AwsBatchEnv(self.job_identifier, self.cfg["aws"], self.boto3_session) logger.info( "Launching Batch environment - (resource configs will not be updated on subsequent executions, but new job revisions will be created):" # noqa 501 ) @@ -1543,9 +1399,7 @@ def run_batch(self): job_desc_resp = batch_env.batch.describe_jobs(jobs=[job_info["jobId"]]) job_status = job_desc_resp["jobs"][0]["status"] - jobs_resp = batch_env.batch.list_jobs( - arrayJobId=job_info["jobId"], jobStatus="SUCCEEDED" - ) + jobs_resp = batch_env.batch.list_jobs(arrayJobId=job_info["jobId"], jobStatus="SUCCEEDED") n_succeeded = len(jobs_resp["jobSummaryList"]) next_token = jobs_resp.get("nextToken") while next_token is not None: @@ -1594,9 +1448,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): jobs_file_path = sim_dir.parent / "jobs.tar.gz" s3.download_file(bucket, f"{prefix}/jobs.tar.gz", str(jobs_file_path)) with tarfile.open(jobs_file_path, "r") as tar_f: - jobs_d = json.load( - tar_f.extractfile(f"jobs/job{job_id:05d}.json"), encoding="utf-8" - ) + jobs_d = json.load(tar_f.extractfile(f"jobs/job{job_id:05d}.json"), encoding="utf-8") logger.debug("Number of simulations = {}".format(len(jobs_d["batch"]))) logger.debug("Getting weather files") @@ -1604,9 +1456,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): os.makedirs(weather_dir, exist_ok=True) # Make a lookup of which parameter points to the weather file from options_lookup.tsv - with open( - sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8" - ) as f: + with open(sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8") as f: tsv_reader = csv.reader(f, delimiter="\t") next(tsv_reader) # skip headers param_name = None @@ -1618,9 +1468,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): raise RuntimeError( f"The epw files are specified in options_lookup.tsv under more than one parameter type: {param_name}, {row[0]}" ) # noqa: E501 - epw_filename = ( - row[row_has_epw.index(True) + 2].split("=")[1].split("/")[-1] - ) + epw_filename = row[row_has_epw.index(True) + 2].split("=")[1].split("/")[-1] param_name = row[0] option_name = row[1] epws_by_option[option_name] = epw_filename @@ -1659,9 +1507,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): sim_id = f"bldg{building_id:07d}up{upgrade_id:02d}" # Create OSW - osw = cls.create_osw( - cfg, jobs_d["n_datapoints"], sim_id, building_id, upgrade_idx - ) + osw = cls.create_osw(cfg, jobs_d["n_datapoints"], sim_id, building_id, upgrade_idx) with open(os.path.join(sim_dir, "in.osw"), "w") as f: json.dump(osw, f, indent=4) @@ -1756,9 +1602,7 @@ def get_fs(self): def get_dask_client(self): dask_cfg = self.cfg["aws"]["dask"] - batch_env = AwsBatchEnv( - self.job_identifier, self.cfg["aws"], self.boto3_session - ) + batch_env = AwsBatchEnv(self.job_identifier, self.cfg["aws"], self.boto3_session) m = 1024 self.dask_cluster = FargateCluster( region_name=self.region, diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index 01193726..a5a2bd74 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -70,9 +70,7 @@ def role_stitcher( for managed_policy_arn in managed_policie_arns: - response = self.iam.attach_role_policy( - 
PolicyArn=managed_policy_arn, RoleName=role_name - ) + response = self.iam.attach_role_policy(PolicyArn=managed_policy_arn, RoleName=role_name) logger.info(f"Role {role_name} created") @@ -103,9 +101,7 @@ def delete_role(self, role_name): response = self.iam.list_attached_role_policies(RoleName=role_name) for policy in response["AttachedPolicies"]: - self.iam.detach_role_policy( - RoleName=role_name, PolicyArn=policy["PolicyArn"] - ) + self.iam.detach_role_policy(RoleName=role_name, PolicyArn=policy["PolicyArn"]) logger.info(f"Policies detached from role {role_name}.") @@ -124,17 +120,13 @@ def delete_instance_profile(self, instance_profile_name): logger.info(f"Instance profile {instance_profile_name} deleted.") except Exception as e: if "NoSuchEntity" in str(e): - logger.info( - f"Instance profile {instance_profile_name} missing, skipping..." - ) + logger.info(f"Instance profile {instance_profile_name} missing, skipping...") else: raise def remove_role_from_instance_profile(self, instance_profile_name): try: - response = self.iam.get_instance_profile( - InstanceProfileName=instance_profile_name - ) + response = self.iam.get_instance_profile(InstanceProfileName=instance_profile_name) for role in response["InstanceProfile"]["Roles"]: response = self.iam.remove_role_from_instance_profile( @@ -143,9 +135,7 @@ def remove_role_from_instance_profile(self, instance_profile_name): logger.info(f"Roles removed from instance profile {instance_profile_name}") except Exception as e: if "NoSuchEntity" in str(e): - logger.info( - f"Instance profile {instance_profile_name} does not exist. Skipping..." - ) + logger.info(f"Instance profile {instance_profile_name} does not exist. Skipping...") else: raise @@ -161,11 +151,7 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.iam = self.iam_helper.iam self.s3 = self.session.client("s3", config=boto_client_config) self.job_identifier = job_identifier - self.account = ( - self.session.client("sts", config=boto_client_config) - .get_caller_identity() - .get("Account") - ) + self.account = self.session.client("sts", config=boto_client_config).get_caller_identity().get("Account") self.region = aws_config["region"] self.operator_email = aws_config["notifications_email"] @@ -173,12 +159,8 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.s3_bucket = aws_config["s3"]["bucket"] self.s3_bucket_arn = f"arn:aws:s3:::{self.s3_bucket}" self.s3_bucket_prefix = aws_config["s3"]["prefix"].rstrip("/") - self.s3_lambda_code_emr_cluster_key = ( - f"{self.s3_bucket_prefix}/lambda_functions/emr_function.py.zip" - ) - self.s3_lambda_emr_config_key = ( - f"{self.s3_bucket_prefix}/lambda_functions/emr_config.json" - ) + self.s3_lambda_code_emr_cluster_key = f"{self.s3_bucket_prefix}/lambda_functions/emr_function.py.zip" + self.s3_lambda_emr_config_key = f"{self.s3_bucket_prefix}/lambda_functions/emr_config.json" self.s3_emr_folder_name = "emr" # Batch @@ -187,9 +169,7 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.batch_job_queue_name = f"job_queue_{self.job_identifier}" self.batch_service_role_name = f"batch_service_role_{self.job_identifier}" self.batch_instance_role_name = f"batch_instance_role_{self.job_identifier}" - self.batch_instance_profile_name = ( - f"batch_instance_profile_{self.job_identifier}" - ) + self.batch_instance_profile_name = f"batch_instance_profile_{self.job_identifier}" self.batch_spot_service_role_name = f"spot_fleet_role_{self.job_identifier}" self.batch_ecs_task_role_name = 
f"ecs_task_role_{self.job_identifier}" self.batch_task_policy_name = f"ecs_task_policy_{self.job_identifier}" diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index aa457a84..b7e9fe6a 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -61,37 +61,26 @@ def __init__(self, project_filename): self.buildstock_dir = self.cfg["buildstock_directory"] if not os.path.isdir(self.buildstock_dir): - raise FileNotFoundError( - f"buildstock_directory = {self.buildstock_dir} is not a directory." - ) - self.project_dir = os.path.join( - self.buildstock_dir, self.cfg["project_directory"] - ) + raise FileNotFoundError(f"buildstock_directory = {self.buildstock_dir} is not a directory.") + self.project_dir = os.path.join(self.buildstock_dir, self.cfg["project_directory"]) if not os.path.isdir(self.project_dir): - raise FileNotFoundError( - f"project_directory = {self.project_dir} is not a directory." - ) + raise FileNotFoundError(f"project_directory = {self.project_dir} is not a directory.") # Load in OS_VERSION and OS_SHA arguments if they exist in the YAML, # otherwise use defaults specified here. self.os_version = self.cfg.get("os_version", self.DEFAULT_OS_VERSION) self.os_sha = self.cfg.get("os_sha", self.DEFAULT_OS_SHA) - logger.debug( - f"Using OpenStudio version: {self.os_version} with SHA: {self.os_sha}" - ) + logger.debug(f"Using OpenStudio version: {self.os_version} with SHA: {self.os_sha}") @staticmethod def get_sampler_class(sampler_name): - sampler_class_name = ( - "".join(x.capitalize() for x in sampler_name.strip().split("_")) + "Sampler" - ) + sampler_class_name = "".join(x.capitalize() for x in sampler_name.strip().split("_")) + "Sampler" return getattr(sampler, sampler_class_name) @staticmethod def get_workflow_generator_class(workflow_generator_name): workflow_generator_class_name = ( - "".join(x.capitalize() for x in workflow_generator_name.strip().split("_")) - + "WorkflowGenerator" + "".join(x.capitalize() for x in workflow_generator_name.strip().split("_")) + "WorkflowGenerator" ) return getattr(workflow_generator, workflow_generator_class_name) @@ -124,9 +113,7 @@ def _get_weather_files(self): f.write(chunk) f.seek(0) with zipfile.ZipFile(f, "r") as zf: - logger.debug( - "Extracting weather files to: {}".format(self.weather_dir) - ) + logger.debug("Extracting weather files to: {}".format(self.weather_dir)) zf.extractall(self.weather_dir) @property @@ -148,12 +135,8 @@ def skip_baseline_sims(self): @classmethod def get_reporting_measures(cls, cfg): - WorkflowGenerator = cls.get_workflow_generator_class( - cfg["workflow_generator"]["type"] - ) - wg = WorkflowGenerator( - cfg, 1 - ) # Number of datapoints doesn't really matter here + WorkflowGenerator = cls.get_workflow_generator_class(cfg["workflow_generator"]["type"]) + wg = WorkflowGenerator(cfg, 1) # Number of datapoints doesn't really matter here return wg.reporting_measures() def run_batch(self): @@ -161,9 +144,7 @@ def run_batch(self): @classmethod def create_osw(cls, cfg, n_datapoints, *args, **kwargs): - WorkflowGenerator = cls.get_workflow_generator_class( - cfg["workflow_generator"]["type"] - ) + WorkflowGenerator = cls.get_workflow_generator_class(cfg["workflow_generator"]["type"]) osw_generator = WorkflowGenerator(cfg, n_datapoints) return osw_generator.create_osw(*args, **kwargs) @@ -186,9 +167,7 @@ def make_sim_dir(building_id, upgrade_idx, base_dir, overwrite_existing=False): sim_dir, ) elif os.path.exists(os.path.join(sim_dir, "run", "failed.job")): - raise SimulationExists( - "{} exists 
and failed".format(sim_id), sim_id, sim_dir - ) + raise SimulationExists("{} exists and failed".format(sim_id), sim_id, sim_dir) else: shutil.rmtree(sim_dir) @@ -234,21 +213,13 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id): if os.path.isfile(timeseries_filepath): # Find the time columns present in the enduse_timeseries file possible_time_cols = ["time", "Time", "TimeDST", "TimeUTC"] - cols = read_csv( - timeseries_filepath, index_col=False, nrows=0 - ).columns.tolist() + cols = read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist() actual_time_cols = [c for c in cols if c in possible_time_cols] if not actual_time_cols: - logger.error( - f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}." - ) - raise RuntimeError( - f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}." - ) + logger.error(f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.") + raise RuntimeError(f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.") - tsdf = read_csv( - timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows - ) + tsdf = read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows) if os.path.isfile(schedules_filepath): schedules = read_csv(schedules_filepath, dtype=np.float64) schedules.rename(columns=lambda x: f"schedules_{x}", inplace=True) @@ -316,9 +287,7 @@ def get_buildstock_dir(project_file, cfg): if os.path.isabs(buildstock_dir): return os.path.abspath(buildstock_dir) else: - return os.path.abspath( - os.path.join(os.path.dirname(project_file), buildstock_dir) - ) + return os.path.abspath(os.path.join(os.path.dirname(project_file), buildstock_dir)) @classmethod def validate_openstudio_path(cls, project_file): @@ -334,14 +303,10 @@ def validate_openstudio_path(cls, project_file): except FileNotFoundError: raise ValidationError(f"Cannot find openstudio at `{cls.openstudio_exe()}`") if proc_out.returncode != 0: - raise ValidationError( - f"OpenStudio failed with the following error {proc_out.stderr}" - ) + raise ValidationError(f"OpenStudio failed with the following error {proc_out.stderr}") actual_os_version, actual_os_sha = proc_out.stdout.strip().split("+") if os_version != actual_os_version: - raise ValidationError( - f"OpenStudio version is {actual_os_version}, expected is {os_version}" - ) + raise ValidationError(f"OpenStudio version is {actual_os_version}, expected is {os_version}") if os_sha != actual_os_sha: raise ValidationError( f"OpenStudio version is correct at {os_version}, but the shas don't match. 
" @@ -366,9 +331,7 @@ def validate_sampler(project_file): else: sample_file = os.path.abspath(sample_file) buildstock_df = read_csv(sample_file, dtype=str) - return BuildStockBatchBase.validate_buildstock_csv( - project_file, buildstock_df - ) + return BuildStockBatchBase.validate_buildstock_csv(project_file, buildstock_df) return True @staticmethod @@ -381,9 +344,7 @@ def validate_buildstock_csv(project_file, buildstock_df): if column in {"Building"}: continue if column not in param_option_dict: - errors.append( - f"Column {column} in buildstock_csv is not available in options_lookup.tsv" - ) + errors.append(f"Column {column} in buildstock_csv is not available in options_lookup.tsv") continue if "*" in param_option_dict[column]: continue # skip validating options when wildcard is present @@ -401,22 +362,16 @@ def validate_buildstock_csv(project_file, buildstock_df): @classmethod def validate_workflow_generator(cls, project_file): cfg = get_project_configuration(project_file) - WorkflowGenerator = cls.get_workflow_generator_class( - cfg["workflow_generator"]["type"] - ) + WorkflowGenerator = cls.get_workflow_generator_class(cfg["workflow_generator"]["type"]) return WorkflowGenerator.validate(cfg) @staticmethod def validate_project_schema(project_file): cfg = get_project_configuration(project_file) schema_version = cfg.get("schema_version") - version_schema = os.path.join( - os.path.dirname(__file__), "schemas", f"v{schema_version}.yaml" - ) + version_schema = os.path.join(os.path.dirname(__file__), "schemas", f"v{schema_version}.yaml") if not os.path.isfile(version_schema): - logger.error( - f"Could not find validation schema for YAML version {schema_version}" - ) + logger.error(f"Could not find validation schema for YAML version {schema_version}") raise FileNotFoundError(version_schema) schema = yamale.make_schema(version_schema) data = yamale.make_data(project_file, parser="ruamel") @@ -436,9 +391,7 @@ def validate_postprocessing_spec(project_file): partition_cols = cfg.get("postprocessing", {}).get("partition_columns", []) invalid_cols = [c for c in partition_cols if c not in param_option_dict.keys()] if invalid_cols: - raise ValidationError( - f"The following partition columns are not valid: {invalid_cols}" - ) + raise ValidationError(f"The following partition columns are not valid: {invalid_cols}") return True @staticmethod @@ -448,12 +401,8 @@ def validate_xor_nor_schema_keys(project_file): if int(major) >= 0: if int(minor) >= 0: # xor - if ("weather_files_url" in cfg.keys()) is ( - "weather_files_path" in cfg.keys() - ): - raise ValidationError( - "Both/neither weather_files_url and weather_files_path found in yaml root" - ) + if ("weather_files_url" in cfg.keys()) is ("weather_files_path" in cfg.keys()): + raise ValidationError("Both/neither weather_files_url and weather_files_path found in yaml root") return True @@ -468,9 +417,7 @@ def get_param_option_dict(project_file): try: with open(options_lookup_path, "r") as f: options = csv.DictReader(f, delimiter="\t") - invalid_options_lookup_str = ( - "" # Holds option/parameter names with invalid characters - ) + invalid_options_lookup_str = "" # Holds option/parameter names with invalid characters for row in options: for col in ["Parameter Name", "Option Name"]: invalid_chars = set(row[col]).intersection(set("|&()")) @@ -480,16 +427,9 @@ def get_param_option_dict(project_file): param_name, opt_name = row["Parameter Name"], row["Option Name"] param_option_dict[row["Parameter Name"]].add(row["Option Name"]) if opt_name == "*" and 
row["Measure Dir"]: - invalid_options_lookup_str += ( - f"{param_name}: '*' cannot pass arguments to measure.\n" - ) - if ( - "*" in param_option_dict[param_name] - and len(param_option_dict[param_name]) > 1 - ): - invalid_options_lookup_str += ( - f"{param_name}: '*' cannot be mixed with other options\n" - ) + invalid_options_lookup_str += f"{param_name}: '*' cannot pass arguments to measure.\n" + if "*" in param_option_dict[param_name] and len(param_option_dict[param_name]) > 1: + invalid_options_lookup_str += f"{param_name}: '*' cannot be mixed with other options\n" except FileNotFoundError as err: logger.error(f"Options lookup file not found at: '{options_lookup_path}'") raise err @@ -501,9 +441,7 @@ def validate_options_lookup(project_file): Validates that the parameter|options specified in the project yaml file is available in the options_lookup.tsv """ cfg = get_project_configuration(project_file) - param_option_dict, invalid_options_lookup_str = ( - BuildStockBatchBase.get_param_option_dict(project_file) - ) + param_option_dict, invalid_options_lookup_str = BuildStockBatchBase.get_param_option_dict(project_file) invalid_option_spec_counter = Counter() invalid_param_counter = Counter() invalid_option_counter_dict = defaultdict(Counter) @@ -518,9 +456,7 @@ def get_errors(source_str, option_str): if not returns error message, close matches, and specifies where the error occurred (source_str) """ if "||" in option_str and "&&" in option_str: - invalid_option_spec_counter[ - (option_str, "has both || and && (not supported)") - ] += 1 + invalid_option_spec_counter[(option_str, "has both || and && (not supported)")] += 1 return "" if "||" in option_str or "&&" in option_str: @@ -528,9 +464,7 @@ def get_errors(source_str, option_str): errors = "" broken_options = option_str.split(splitter) if broken_options[-1] == "": - invalid_option_spec_counter[ - (option_str, "has trailing 'splitter'") - ] += 1 + invalid_option_spec_counter[(option_str, "has trailing 'splitter'")] += 1 return "" for broken_option_str in broken_options: new_source_str = source_str + f" in composite option '{option_str}'" @@ -552,21 +486,15 @@ def get_errors(source_str, option_str): return "" if parameter_name not in param_option_dict: - close_match = difflib.get_close_matches( - parameter_name, param_option_dict.keys(), 1 - ) + close_match = difflib.get_close_matches(parameter_name, param_option_dict.keys(), 1) close_match = close_match[0] if close_match else "" invalid_param_counter[(parameter_name, close_match)] += 1 return "" if not option_name or option_name not in param_option_dict[parameter_name]: - close_match = difflib.get_close_matches( - option_name, list(param_option_dict[parameter_name]), 1 - ) + close_match = difflib.get_close_matches(option_name, list(param_option_dict[parameter_name]), 1) close_match = close_match[0] if close_match else "" - invalid_option_counter_dict[parameter_name][ - (option_name, close_match) - ] += 1 + invalid_option_counter_dict[parameter_name][(option_name, close_match)] += 1 return "" return "" @@ -586,62 +514,38 @@ def get_all_option_str(source_str, inp): return [(source_str, inp)] elif type(inp) == list: return sum( - [ - get_all_option_str(source_str + f", in entry {count}", entry) - for count, entry in enumerate(inp) - ], + [get_all_option_str(source_str + f", in entry {count}", entry) for count, entry in enumerate(inp)], [], ) elif type(inp) == dict: if len(inp) > 1: - raise ValidationError( - f"{source_str} the logic is malformed. 
Dict can't have more than one entry" - ) + raise ValidationError(f"{source_str} the logic is malformed. Dict can't have more than one entry") source_str += f", in {list(inp.keys())[0]}" - return sum( - [get_all_option_str(source_str, i) for i in inp.values()], [] - ) + return sum([get_all_option_str(source_str, i) for i in inp.values()], []) # store all of the option_str in the project file as a list of (source_str, option_str) tuple source_option_str_list = [] if "upgrades" in cfg: for upgrade_count, upgrade in enumerate(cfg["upgrades"]): - upgrade_name = ( - upgrade.get("upgrade_name", "") - + f" (Upgrade Number: {upgrade_count})" - ) + upgrade_name = upgrade.get("upgrade_name", "") + f" (Upgrade Number: {upgrade_count})" source_str_upgrade = f"In upgrade '{upgrade_name}'" for option_count, option in enumerate(upgrade["options"]): - option_name = ( - option.get("option", "") + f" (Option Number: {option_count})" - ) - source_str_option = ( - source_str_upgrade + f", in option '{option_name}'" - ) - source_option_str_list.append( - (source_str_option, option.get("option")) - ) + option_name = option.get("option", "") + f" (Option Number: {option_count})" + source_str_option = source_str_upgrade + f", in option '{option_name}'" + source_option_str_list.append((source_str_option, option.get("option"))) if "apply_logic" in option: source_str_logic = source_str_option + ", in apply_logic" - source_option_str_list += get_all_option_str( - source_str_logic, option["apply_logic"] - ) + source_option_str_list += get_all_option_str(source_str_logic, option["apply_logic"]) if "package_apply_logic" in upgrade: source_str_package = source_str_upgrade + ", in package_apply_logic" - source_option_str_list += get_all_option_str( - source_str_package, upgrade["package_apply_logic"] - ) + source_option_str_list += get_all_option_str(source_str_package, upgrade["package_apply_logic"]) # TODO: refactor this into Sampler.validate_args if "downselect" in cfg or "downselect" in cfg.get("sampler", {}).get("type"): source_str = "In downselect" - logic = ( - cfg["downselect"]["logic"] - if "downselect" in cfg - else cfg["sampler"]["args"]["logic"] - ) + logic = cfg["downselect"]["logic"] if "downselect" in cfg else cfg["sampler"]["args"]["logic"] source_option_str_list += get_all_option_str(source_str, logic) # Gather all the errors in the option_str, if any @@ -650,11 +554,7 @@ def get_all_option_str(source_str, inp): error_message += get_errors(source_str, option_str) if error_message: - error_message = ( - "Following option/parameter entries have problem:\n" - + error_message - + "\n" - ) + error_message = "Following option/parameter entries have problem:\n" + error_message + "\n" if invalid_option_spec_counter: error_message += "* Following option/parameter entries have problem:\n" @@ -662,9 +562,7 @@ def get_all_option_str(source_str, inp): error_message += f" '{invalid_entry}' {error} - used '{count}' times\n" if invalid_param_counter: - error_message += ( - "* Following parameters do not exist in options_lookup.tsv\n" - ) + error_message += "* Following parameters do not exist in options_lookup.tsv\n" for (param, close_match), count in invalid_param_counter.items(): error_message += f" '{param}' - used '{count}' times." if close_match: @@ -736,9 +634,7 @@ def get_logic_problems(logic, parent=None): assert len(logic) == 1 for key, val in logic.items(): if key not in ["or", "and", "not"]: - raise ValidationError( - f"Invalid key {key}. Only 'or', 'and' and 'not' is allowed." 
- ) + raise ValidationError(f"Invalid key {key}. Only 'or', 'and' and 'not' is allowed.") return get_logic_problems(val, parent=key) elif isinstance(logic, str): if "&&" not in logic: @@ -746,28 +642,19 @@ def get_logic_problems(logic, parent=None): entries = logic.split("&&") return get_logic_problems(entries, parent="&&") else: - raise ValidationError( - f"Invalid logic element {logic} with type {type(logic)}" - ) + raise ValidationError(f"Invalid logic element {logic} with type {type(logic)}") all_problems = [] if "upgrades" in cfg: for upgrade_count, upgrade in enumerate(cfg["upgrades"]): upgrade_name = upgrade.get("upgrade_name", "") - source_str_upgrade = ( - f"upgrade '{upgrade_name}' (Upgrade Number:{upgrade_count})" - ) + source_str_upgrade = f"upgrade '{upgrade_name}' (Upgrade Number:{upgrade_count})" for option_count, option in enumerate(upgrade["options"]): option_name = option.get("option", "") - source_str_option = ( - source_str_upgrade - + f", option '{option_name}' (Option Number:{option_count})" - ) + source_str_option = source_str_upgrade + f", option '{option_name}' (Option Number:{option_count})" if "apply_logic" in option: if problems := get_logic_problems(option["apply_logic"]): - all_problems.append( - (source_str_option, problems, option["apply_logic"]) - ) + all_problems.append((source_str_option, problems, option["apply_logic"])) if "package_apply_logic" in upgrade: source_str_package = source_str_upgrade + ", in package_apply_logic" @@ -783,11 +670,7 @@ def get_logic_problems(logic, parent=None): # TODO: refactor this into Sampler.validate_args if "downselect" in cfg or "downselect" in cfg.get("sampler", {}).get("type"): source_str = "in downselect logic" - logic = ( - cfg["downselect"]["logic"] - if "downselect" in cfg - else cfg["sampler"]["args"]["logic"] - ) + logic = cfg["downselect"]["logic"] if "downselect" in cfg else cfg["sampler"]["args"]["logic"] if problems := get_logic_problems(logic): all_problems.append((source_str, problems, logic)) @@ -835,10 +718,7 @@ def get_errors(source_str, measure_str): """ if measure_str not in measure_dirs: closest = difflib.get_close_matches(measure_str, list(measure_dirs)) - return ( - f"Measure directory {measure_str} not found. Closest matches: {closest}" - f" {source_str}\n" - ) + return f"Measure directory {measure_str} not found. Closest matches: {closest}" f" {source_str}\n" return "" source_measures_str_list = [] @@ -855,9 +735,7 @@ def get_errors(source_str, measure_str): if not error_message: return True else: - error_message = ( - "Measure name(s)/directory(ies) is(are) invalid. \n" + error_message - ) + error_message = "Measure name(s)/directory(ies) is(are) invalid. 
\n" + error_message logger.error(error_message) raise ValidationError(error_message) @@ -900,9 +778,7 @@ def validate_resstock_or_comstock_version(project_file): """ cfg = get_project_configuration(project_file) - buildstock_rb = os.path.join( - cfg["buildstock_directory"], "resources/buildstock.rb" - ) + buildstock_rb = os.path.join(cfg["buildstock_directory"], "resources/buildstock.rb") if os.path.exists(buildstock_rb): with open(buildstock_rb, "r") as f: versions = dict( @@ -939,9 +815,7 @@ def validate_number_of_options(project_file): :rtype: bool """ cfg = get_project_configuration(project_file) - measure_xml_filename = os.path.join( - cfg["buildstock_directory"], "measures", "ApplyUpgrade", "measure.xml" - ) + measure_xml_filename = os.path.join(cfg["buildstock_directory"], "measures", "ApplyUpgrade", "measure.xml") if os.path.exists(measure_xml_filename): measure_xml_tree = objectify.parse(measure_xml_filename) measure_xml = measure_xml_tree.getroot() @@ -952,14 +826,10 @@ def validate_number_of_options(project_file): if m_option: option_number = int(m_option.group(1)) n_options_in_measure = max(option_number, n_options_in_measure) - m_costs = re.match( - r"^option_(\d+)_cost_(\d+)_value", str(argument.name) - ) + m_costs = re.match(r"^option_(\d+)_cost_(\d+)_value", str(argument.name)) if m_costs: cost_number = int(m_costs.group(2)) - n_costs_per_option_in_measure = max( - cost_number, n_costs_per_option_in_measure - ) + n_costs_per_option_in_measure = max(cost_number, n_costs_per_option_in_measure) n_options_in_cfg = 0 n_costs_in_cfg = 0 for upgrade in cfg.get("upgrades", []): @@ -1038,22 +908,14 @@ def process_results(self, skip_combine=False, use_dask_cluster=True): wfg_args = self.cfg["workflow_generator"].get("args", {}) if self.cfg["workflow_generator"]["type"] == "residential_hpxml": if "simulation_output_report" in wfg_args.keys(): - if ( - "timeseries_frequency" - in wfg_args["simulation_output_report"].keys() - ): - do_timeseries = ( - wfg_args["simulation_output_report"]["timeseries_frequency"] - != "none" - ) + if "timeseries_frequency" in wfg_args["simulation_output_report"].keys(): + do_timeseries = wfg_args["simulation_output_report"]["timeseries_frequency"] != "none" else: do_timeseries = "timeseries_csv_export" in wfg_args.keys() fs = self.get_fs() if not skip_combine: - postprocessing.combine_results( - fs, self.results_dir, self.cfg, do_timeseries=do_timeseries - ) + postprocessing.combine_results(fs, self.results_dir, self.cfg, do_timeseries=do_timeseries) aws_conf = self.cfg.get("postprocessing", {}).get("aws", {}) if "s3" in aws_conf or "aws" in self.cfg: diff --git a/buildstockbatch/eagle.py b/buildstockbatch/eagle.py index 9245bce6..34eb5725 100644 --- a/buildstockbatch/eagle.py +++ b/buildstockbatch/eagle.py @@ -78,9 +78,7 @@ def __init__(self, project_filename): logger.debug("Output directory = {}".format(output_dir)) weather_dir = self.weather_dir # noqa E841 - self.singularity_image = self.get_singularity_image( - self.cfg, self.os_version, self.os_sha - ) + self.singularity_image = self.get_singularity_image(self.cfg, self.os_version, self.os_sha) @classmethod def validate_project(cls, project_file): @@ -97,8 +95,7 @@ def validate_output_directory_eagle(cls, project_file): output_dir = path_rel_to_file(project_file, cfg["output_directory"]) if not re.match(r"/(lustre/eaglefs/)?(scratch|projects)", output_dir): raise ValidationError( - f"`output_directory` must be in /scratch or /projects," - f" `output_directory` = {output_dir}" + f"`output_directory` 
must be in /scratch or /projects," f" `output_directory` = {output_dir}" ) @classmethod @@ -110,15 +107,11 @@ def validate_singularity_image_eagle(cls, project_file): cfg.get("os_sha", cls.DEFAULT_OS_SHA), ) if not os.path.exists(singularity_image): - raise ValidationError( - f"The singularity image does not exist: {singularity_image}" - ) + raise ValidationError(f"The singularity image does not exist: {singularity_image}") @property def output_dir(self): - output_dir = path_rel_to_file( - self.project_filename, self.cfg["output_directory"] - ) + output_dir = path_rel_to_file(self.project_filename, self.cfg["output_directory"]) return output_dir @property @@ -137,9 +130,7 @@ def clear_and_copy_dir(src, dst): def get_singularity_image(cls, cfg, os_version, os_sha): return os.path.join( cfg.get("sys_image_dir", cls.DEFAULT_SYS_IMAGE_DIR), - "OpenStudio-{ver}.{sha}-Singularity.simg".format( - ver=os_version, sha=os_sha - ), + "OpenStudio-{ver}.{sha}-Singularity.simg".format(ver=os_version, sha=os_sha), ) @property @@ -153,12 +144,7 @@ def weather_dir(self): def run_batch(self, sampling_only=False): # Create simulation_output dir - sim_out_ts_dir = ( - pathlib.Path(self.output_dir) - / "results" - / "simulation_output" - / "timeseries" - ) + sim_out_ts_dir = pathlib.Path(self.output_dir) / "results" / "simulation_output" / "timeseries" os.makedirs(sim_out_ts_dir, exist_ok=True) for i in range(0, len(self.cfg.get("upgrades", [])) + 1): os.makedirs(sim_out_ts_dir / f"up{i:02d}") @@ -168,9 +154,7 @@ def run_batch(self, sampling_only=False): destination_dir = os.path.dirname(self.sampler.csv_path) if os.path.exists(destination_dir): shutil.rmtree(destination_dir) - shutil.copytree( - os.path.join(self.project_dir, "housing_characteristics"), destination_dir - ) + shutil.copytree(os.path.join(self.project_dir, "housing_characteristics"), destination_dir) logger.debug("Housing characteristics copied.") # run sampling @@ -200,9 +184,7 @@ def run_batch(self, sampling_only=False): # larger than we need, now that we know n_sims n_sims_per_job = max(n_sims_per_job, self.min_sims_per_job) - upgrade_sims = itertools.product( - building_ids, range(len(self.cfg.get("upgrades", []))) - ) + upgrade_sims = itertools.product(building_ids, range(len(self.cfg.get("upgrades", [])))) if not self.skip_baseline_sims: # create batches of simulations baseline_sims = zip(building_ids, itertools.repeat(None)) @@ -217,9 +199,7 @@ def run_batch(self, sampling_only=False): if not batch: break logger.info("Queueing job {} ({} simulations)".format(i, len(batch))) - job_json_filename = os.path.join( - self.output_dir, "job{:03d}.json".format(i) - ) + job_json_filename = os.path.join(self.output_dir, "job{:03d}.json".format(i)) with open(job_json_filename, "w") as f: json.dump( { @@ -248,9 +228,7 @@ def run_job_batch(self, job_array_number): pathlib.Path(self.buildstock_dir) / "measures", self.local_buildstock_dir / "measures", ) - if os.path.exists( - pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures" - ): + if os.path.exists(pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures"): self.clear_and_copy_dir( pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures", self.local_buildstock_dir / "resources/hpxml-measures", @@ -265,9 +243,7 @@ def run_job_batch(self, job_array_number): shutil.copy2(self.singularity_image, self.local_singularity_img) # Run the job batch as normal - job_json_filename = os.path.join( - self.output_dir, "job{:03d}.json".format(job_array_number) - ) + job_json_filename = 
os.path.join(self.output_dir, "job{:03d}.json".format(job_array_number)) with open(job_json_filename, "r") as f: args = json.load(f) @@ -285,18 +261,12 @@ def run_job_batch(self, job_array_number): df.to_csv(buildstock_csv_path, index=False) logger.debug(f"Buildstock.csv trimmed to {len(df)} rows.") - traceback_file_path = ( - self.local_output_dir - / "simulation_output" - / f"traceback{job_array_number}.out" - ) + traceback_file_path = self.local_output_dir / "simulation_output" / f"traceback{job_array_number}.out" @delayed def run_building_d(i, upgrade_idx): try: - return self.run_building( - self.output_dir, self.cfg, args["n_datapoints"], i, upgrade_idx - ) + return self.run_building(self.output_dir, self.cfg, args["n_datapoints"], i, upgrade_idx) except Exception: with open(traceback_file_path, "a") as f: txt = get_error_details() @@ -323,9 +293,7 @@ def run_building_d(i, upgrade_idx): # Compress simulation results if self.cfg.get("max_minutes_per_sim") is not None: time.sleep(60) # Allow results JSON to finish writing - simout_filename = ( - lustre_sim_out_dir / f"simulations_job{job_array_number}.tar.gz" - ) + simout_filename = lustre_sim_out_dir / f"simulations_job{job_array_number}.tar.gz" logger.info(f"Compressing simulation outputs to {simout_filename}") local_sim_out_dir = self.local_output_dir / "simulation_output" subprocess.run( @@ -355,16 +323,12 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): upgrade_id = 0 if upgrade_idx is None else upgrade_idx + 1 try: - sim_id, sim_dir = cls.make_sim_dir( - i, upgrade_idx, os.path.join(cls.local_output_dir, "simulation_output") - ) + sim_id, sim_dir = cls.make_sim_dir(i, upgrade_idx, os.path.join(cls.local_output_dir, "simulation_output")) except SimulationExists as ex: sim_dir = ex.sim_dir else: # Generate the osw for this simulation - osw = cls.create_osw( - cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx - ) + osw = cls.create_osw(cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx) with open(os.path.join(sim_dir, "in.osw"), "w") as f: json.dump(osw, f, indent=4) @@ -375,9 +339,7 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): ] # Create a temporary directory for the simulation to use - with tempfile.TemporaryDirectory( - dir=cls.local_scratch, prefix=f"{sim_id}_" - ) as tmpdir: + with tempfile.TemporaryDirectory(dir=cls.local_scratch, prefix=f"{sim_id}_") as tmpdir: # Build the command to instantiate and configure the singularity container the simulation is run inside local_resources_dir = cls.local_buildstock_dir / "resources" @@ -401,24 +363,12 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): for src in dirs_to_mount: container_mount = "/" + os.path.basename(src) args.extend(["-B", "{}:{}:ro".format(src, container_mount)]) - container_symlink = os.path.join( - "/var/simdata/openstudio", os.path.basename(src) - ) - runscript.append( - "ln -s {} {}".format( - *map(shlex.quote, (container_mount, container_symlink)) - ) - ) + container_symlink = os.path.join("/var/simdata/openstudio", os.path.basename(src)) + runscript.append("ln -s {} {}".format(*map(shlex.quote, (container_mount, container_symlink)))) - if os.path.exists( - os.path.join(cls.local_buildstock_dir, "resources/hpxml-measures") - ): - runscript.append( - "ln -s /resources /var/simdata/openstudio/resources" - ) - src = os.path.join( - cls.local_buildstock_dir, "resources/hpxml-measures" - ) + if os.path.exists(os.path.join(cls.local_buildstock_dir, 
"resources/hpxml-measures")): + runscript.append("ln -s /resources /var/simdata/openstudio/resources") + src = os.path.join(cls.local_buildstock_dir, "resources/hpxml-measures") container_mount = "/resources/hpxml-measures" args.extend(["-B", "{}:{}:ro".format(src, container_mount)]) @@ -470,30 +420,18 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): "timeout": msg, } out_osw.write(json.dumps(out_msg, indent=3)) - with open( - os.path.join(sim_dir, "run", "out.osw"), "a" - ) as run_log: - run_log.write( - f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}" - ) - with open( - os.path.join(sim_dir, "run", "failed.job"), "w" - ) as failed_job: - failed_job.write( - f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}" - ) - time.sleep( - 60 - ) # Wait for EnergyPlus to release file locks and data_point.zip to finish + with open(os.path.join(sim_dir, "run", "out.osw"), "a") as run_log: + run_log.write(f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}") + with open(os.path.join(sim_dir, "run", "failed.job"), "w") as failed_job: + failed_job.write(f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}") + time.sleep(60) # Wait for EnergyPlus to release file locks and data_point.zip to finish except subprocess.CalledProcessError: pass finally: # Clean up the symbolic links we created in the container for mount_dir in dirs_to_mount + [os.path.join(sim_dir, "lib")]: try: - os.unlink( - os.path.join(sim_dir, os.path.basename(mount_dir)) - ) + os.unlink(os.path.join(sim_dir, os.path.basename(mount_dir))) except FileNotFoundError: pass @@ -507,9 +445,7 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): ) reporting_measures = cls.get_reporting_measures(cfg) - dpout = postprocessing.read_simulation_outputs( - fs, reporting_measures, sim_dir, upgrade_id, i - ) + dpout = postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, i) return dpout def queue_jobs(self, array_ids=None, hipri=False): @@ -537,9 +473,7 @@ def queue_jobs(self, array_ids=None, hipri=False): # Estimate the wall time in minutes cores_per_node = 36 minutes_per_sim = eagle_cfg["minutes_per_sim"] - walltime = math.ceil( - math.ceil(n_sims_per_job / cores_per_node) * minutes_per_sim - ) + walltime = math.ceil(math.ceil(n_sims_per_job / cores_per_node) * minutes_per_sim) # Queue up simulations here = os.path.dirname(os.path.abspath(__file__)) @@ -590,14 +524,10 @@ def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False) # Configuration values account = self.cfg["eagle"]["account"] walltime = self.cfg["eagle"].get("postprocessing", {}).get("time", "1:30:00") - memory = ( - self.cfg["eagle"].get("postprocessing", {}).get("node_memory_mb", 85248) - ) + memory = self.cfg["eagle"].get("postprocessing", {}).get("node_memory_mb", 85248) n_procs = self.cfg["eagle"].get("postprocessing", {}).get("n_procs", 18) n_workers = self.cfg["eagle"].get("postprocessing", {}).get("n_workers", 2) - print( - f"Submitting job to {n_workers} {memory}MB memory nodes using {n_procs} cores in each." - ) + print(f"Submitting job to {n_workers} {memory}MB memory nodes using {n_procs} cores in each.") # Throw an error if the files already exist. 
if not upload_only: @@ -620,8 +550,7 @@ def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False) last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath)) shutil.move( filepath, - filepath.parent - / f"{filepath.stem}_{last_mod_date:%Y%m%d%H%M}{filepath.suffix}", + filepath.parent / f"{filepath.stem}_{last_mod_date:%Y%m%d%H%M}{filepath.suffix}", ) env = {} @@ -674,22 +603,14 @@ def get_dask_client(self): cluster = LocalCluster(local_directory="/data/dask-tmp") return Client(cluster) else: - return Client( - scheduler_file=os.path.join(self.output_dir, "dask_scheduler.json") - ) + return Client(scheduler_file=os.path.join(self.output_dir, "dask_scheduler.json")) def process_results(self, *args, **kwargs): # Check that all the jobs succeeded before proceeding failed_job_array_ids = self.get_failed_job_array_ids() if failed_job_array_ids: - logger.error( - "The following simulation jobs failed: {}".format( - ", ".join(map(str, failed_job_array_ids)) - ) - ) - logger.error( - "Please inspect those jobs and fix any problems before resubmitting." - ) + logger.error("The following simulation jobs failed: {}".format(", ".join(map(str, failed_job_array_ids)))) + logger.error("Please inspect those jobs and fix any problems before resubmitting.") logger.critical("Postprocessing cancelled.") return False @@ -741,8 +662,7 @@ def rerun_failed_jobs(self, hipri=False): last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath)) shutil.move( filepath, - prev_failed_job_out_dir - / f"{filepath.name}_{last_mod_date:%Y%m%d%H%M}", + prev_failed_job_out_dir / f"{filepath.name}_{last_mod_date:%Y%m%d%H%M}", ) # Delete simulation results for jobs we're about to rerun @@ -831,21 +751,15 @@ def user_cli(argv=sys.argv[1:]): help="Only validate the project YAML file and references. Nothing is executed", action="store_true", ) - group.add_argument( - "--samplingonly", help="Run the sampling only.", action="store_true" - ) - group.add_argument( - "--rerun_failed", help="Rerun the failed jobs", action="store_true" - ) + group.add_argument("--samplingonly", help="Run the sampling only.", action="store_true") + group.add_argument("--rerun_failed", help="Rerun the failed jobs", action="store_true") # parse CLI arguments args = parser.parse_args(argv) # load the yaml project file if not os.path.isfile(args.project_filename): - raise FileNotFoundError( - "The project file {} doesn't exist".format(args.project_filename) - ) + raise FileNotFoundError("The project file {} doesn't exist".format(args.project_filename)) project_filename = os.path.abspath(args.project_filename) with open(project_filename, "r") as f: cfg = yaml.load(f, Loader=yaml.SafeLoader) @@ -873,9 +787,7 @@ def user_cli(argv=sys.argv[1:]): out_dir = cfg["output_directory"] if os.path.exists(out_dir): raise FileExistsError( - "The output directory {} already exists. Please delete it or choose another.".format( - out_dir - ) + "The output directory {} already exists. 
Please delete it or choose another.".format(out_dir) ) logger.info("Creating output directory {}".format(out_dir)) os.makedirs(out_dir) diff --git a/buildstockbatch/local.py b/buildstockbatch/local.py index 70016b6d..22b2dc74 100644 --- a/buildstockbatch/local.py +++ b/buildstockbatch/local.py @@ -47,9 +47,7 @@ def __init__(self, project_filename): self._weather_dir = None # Create simulation_output dir - sim_out_ts_dir = os.path.join( - self.results_dir, "simulation_output", "timeseries" - ) + sim_out_ts_dir = os.path.join(self.results_dir, "simulation_output", "timeseries") os.makedirs(sim_out_ts_dir, exist_ok=True) for i in range(0, len(self.cfg.get("upgrades", [])) + 1): os.makedirs(os.path.join(sim_out_ts_dir, f"up{i:02d}"), exist_ok=True) @@ -58,26 +56,18 @@ def __init__(self, project_filename): # FIXME: Get working without docker if self.cfg.get("baseline", dict()).get("custom_gems", False): # TODO: Fix this stuff to work without docker - logger.info( - "Installing custom gems to docker volume: buildstockbatch_custom_gems" - ) + logger.info("Installing custom gems to docker volume: buildstockbatch_custom_gems") docker_client = docker.client.from_env() # Create a volume to store the custom gems - docker_client.volumes.create( - name="buildstockbatch_custom_gems", driver="local" - ) - simdata_vol = docker_client.volumes.create( - name="buildstockbatch_simdata_temp", driver="local" - ) + docker_client.volumes.create(name="buildstockbatch_custom_gems", driver="local") + simdata_vol = docker_client.volumes.create(name="buildstockbatch_simdata_temp", driver="local") # Define directories to be mounted in the container mnt_gem_dir = "/var/oscli/gems" # Install custom gems to be used in the docker container - local_gemfile_path = os.path.join( - self.buildstock_dir, "resources", "Gemfile" - ) + local_gemfile_path = os.path.join(self.buildstock_dir, "resources", "Gemfile") mnt_gemfile_path_orig = "/var/oscli/gemfile/Gemfile" docker_volume_mounts = { "buildstockbatch_custom_gems": {"bind": mnt_gem_dir, "mode": "rw"}, @@ -88,14 +78,10 @@ def __init__(self, project_filename): # Check that the Gemfile exists if not os.path.exists(local_gemfile_path): print(f"local_gemfile_path = {local_gemfile_path}") - raise AttributeError( - "baseline:custom_gems = True, but did not find Gemfile in /resources directory" - ) + raise AttributeError("baseline:custom_gems = True, but did not find Gemfile in /resources directory") # Make the buildstock/resources/.custom_gems dir to store logs - local_log_dir = os.path.join( - self.buildstock_dir, "resources", ".custom_gems" - ) + local_log_dir = os.path.join(self.buildstock_dir, "resources", ".custom_gems") if not os.path.exists(local_log_dir): os.makedirs(local_log_dir) @@ -110,9 +96,7 @@ def __init__(self, project_filename): volumes=docker_volume_mounts, name="install_custom_gems", ) - with open( - os.path.join(local_log_dir, "bundle_install_output.log"), "wb" - ) as f_out: + with open(os.path.join(local_log_dir, "bundle_install_output.log"), "wb") as f_out: f_out.write(container_output) # Report out custom gems loaded by OpenStudio CLI @@ -162,33 +146,25 @@ def run_building( upgrade_id = 0 if upgrade_idx is None else upgrade_idx + 1 try: - sim_id, sim_dir = cls.make_sim_dir( - i, upgrade_idx, os.path.join(results_dir, "simulation_output") - ) + sim_id, sim_dir = cls.make_sim_dir(i, upgrade_idx, os.path.join(results_dir, "simulation_output")) except SimulationExists: return sim_path = pathlib.Path(sim_dir) buildstock_path = pathlib.Path(buildstock_dir) # Make 
symlinks to project and buildstock stuff - (sim_path / "measures").symlink_to( - buildstock_path / "measures", target_is_directory=True - ) + (sim_path / "measures").symlink_to(buildstock_path / "measures", target_is_directory=True) (sim_path / "lib").symlink_to(buildstock_path / "lib", target_is_directory=True) (sim_path / "weather").symlink_to(weather_dir, target_is_directory=True) hpxml_measures_path = buildstock_path / "resources" / "hpxml-measures" if hpxml_measures_path.exists(): resources_path = sim_path / "resources" resources_path.mkdir() - (resources_path / "hpxml-measures").symlink_to( - hpxml_measures_path, target_is_directory=True - ) + (resources_path / "hpxml-measures").symlink_to(hpxml_measures_path, target_is_directory=True) else: resources_path = None - osw = cls.create_osw( - cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx - ) + osw = cls.create_osw(cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx) with open(sim_path / "in.osw", "w") as f: json.dump(osw, f, indent=4) @@ -278,9 +254,7 @@ def run_building( # Read data_point_out.json reporting_measures = cls.get_reporting_measures(cfg) - dpout = postprocessing.read_simulation_outputs( - fs, reporting_measures, sim_dir, upgrade_id, i - ) + dpout = postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, i) return dpout def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): @@ -319,9 +293,7 @@ def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): ) upgrade_sims = [] for i in range(len(self.cfg.get("upgrades", []))): - upgrade_sims.append( - map(functools.partial(run_building_d, upgrade_idx=i), building_ids) - ) + upgrade_sims.append(map(functools.partial(run_building_d, upgrade_idx=i), building_ids)) if not self.skip_baseline_sims: baseline_sims = map(run_building_d, building_ids) all_sims = itertools.chain(baseline_sims, *upgrade_sims) @@ -355,18 +327,14 @@ def output_dir(self): @property def results_dir(self): - results_dir = self.cfg.get( - "output_directory", os.path.join(self.project_dir, "localResults") - ) + results_dir = self.cfg.get("output_directory", os.path.join(self.project_dir, "localResults")) results_dir = self.path_rel_to_projectfile(results_dir) if not os.path.isdir(results_dir): os.makedirs(results_dir) return results_dir def get_dask_client(self): - cluster = LocalCluster( - local_directory=os.path.join(self.results_dir, "dask-tmp") - ) + cluster = LocalCluster(local_directory=os.path.join(self.results_dir, "dask-tmp")) return Client(cluster) @@ -427,8 +395,7 @@ def main(): ) group.add_argument( "--uploadonly", - help="Only upload to S3, useful when postprocessing is already done. Ignores the " - "upload flag in yaml", + help="Only upload to S3, useful when postprocessing is already done. Ignores the " "upload flag in yaml", action="store_true", ) group.add_argument( @@ -436,14 +403,10 @@ def main(): help="Only validate the project YAML file and references. 
Nothing is executed", action="store_true", ) - group.add_argument( - "--samplingonly", help="Run the sampling only.", action="store_true" - ) + group.add_argument("--samplingonly", help="Run the sampling only.", action="store_true") args = parser.parse_args() if not os.path.isfile(args.project_filename): - raise FileNotFoundError( - f"The project file {args.project_filename} doesn't exist" - ) + raise FileNotFoundError(f"The project file {args.project_filename} doesn't exist") # Validate the project, and in case of the --validateonly flag return True if validation passes LocalBatch.validate_project(args.project_filename) diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py index df8c2f27..e195bc41 100644 --- a/buildstockbatch/postprocessing.py +++ b/buildstockbatch/postprocessing.py @@ -132,9 +132,7 @@ def read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, buildin :return: dpout [dict] """ - dpout = read_data_point_out_json( - fs, reporting_measures, f"{sim_dir}/run/data_point_out.json" - ) + dpout = read_data_point_out_json(fs, reporting_measures, f"{sim_dir}/run/data_point_out.json") if dpout is None: dpout = {} else: @@ -167,18 +165,9 @@ def clean_up_results_df(df, cfg, keep_upgrade_id=False): for col in ("started_at", "completed_at"): if col in results_df.columns: results_df[col] = results_df[col].map( - lambda x: ( - dt.datetime.strptime(x, "%Y%m%dT%H%M%SZ") - if isinstance(x, str) - else x - ) + lambda x: (dt.datetime.strptime(x, "%Y%m%dT%H%M%SZ") if isinstance(x, str) else x) ) - reference_scenarios = dict( - [ - (i, x.get("reference_scenario")) - for i, x in enumerate(cfg.get("upgrades", []), 1) - ] - ) + reference_scenarios = dict([(i, x.get("reference_scenario")) for i, x in enumerate(cfg.get("upgrades", []), 1)]) results_df["apply_upgrade.reference_scenario"] = ( results_df["upgrade"].map(reference_scenarios).fillna("").astype(str) ) @@ -198,26 +187,10 @@ def clean_up_results_df(df, cfg, keep_upgrade_id=False): if "job_id" in results_df.columns: first_few_cols.insert(2, "job_id") - build_existing_model_cols = sorted( - [col for col in results_df.columns if col.startswith("build_existing_model")] - ) - sim_output_report_cols = sorted( - [ - col - for col in results_df.columns - if col.startswith("simulation_output_report") - ] - ) - report_sim_output_cols = sorted( - [ - col - for col in results_df.columns - if col.startswith("report_simulation_output") - ] - ) - upgrade_costs_cols = sorted( - [col for col in results_df.columns if col.startswith("upgrade_costs")] - ) + build_existing_model_cols = sorted([col for col in results_df.columns if col.startswith("build_existing_model")]) + sim_output_report_cols = sorted([col for col in results_df.columns if col.startswith("simulation_output_report")]) + report_sim_output_cols = sorted([col for col in results_df.columns if col.startswith("report_simulation_output")]) + upgrade_costs_cols = sorted([col for col in results_df.columns if col.startswith("upgrade_costs")]) sorted_cols = ( first_few_cols + build_existing_model_cols @@ -283,9 +256,7 @@ def read_enduse_timeseries_parquet(fs, all_cols, src_path, bldg_id): return df -def concat_and_normalize( - fs, all_cols, src_path, dst_path, partition_columns, indx, bldg_ids, partition_vals -): +def concat_and_normalize(fs, all_cols, src_path, dst_path, partition_columns, indx, bldg_ids, partition_vals): dfs = [] for bldg_id in sorted(bldg_ids): df = read_enduse_timeseries_parquet(fs, all_cols, src_path, bldg_id) @@ -359,22 +330,12 @@ def 
get_partitioned_bldg_groups(partition_df, partition_columns, files_per_parti """ total_building = len(partition_df) if partition_columns: - bldg_id_list_df = ( - partition_df.reset_index() - .groupby(partition_columns)["building_id"] - .apply(list) - ) + bldg_id_list_df = partition_df.reset_index().groupby(partition_columns)["building_id"].apply(list) ngroups = len(bldg_id_list_df) bldg_id_list = bldg_id_list_df.sum() - nfiles_in_each_group = [ - nfiles for nfiles in bldg_id_list_df.map(lambda x: len(x)) - ] - files_groups = [ - split_into_groups(n, files_per_partition) for n in nfiles_in_each_group - ] - flat_groups = [ - n for group in files_groups for n in group - ] # flatten list of list into a list (maintain order) + nfiles_in_each_group = [nfiles for nfiles in bldg_id_list_df.map(lambda x: len(x))] + files_groups = [split_into_groups(n, files_per_partition) for n in nfiles_in_each_group] + flat_groups = [n for group in files_groups for n in group] # flatten list of list into a list (maintain order) else: # no partitioning by a column. Just put buildings into groups of files_per_partition ngroups = 1 @@ -414,9 +375,7 @@ def write_metadata_files(fs, parquet_root_dir, partition_columns): concat_files = fs.glob(glob_str) logger.info(f"Gathered {len(concat_files)} files. Now writing _metadata") parquet_root_dir = Path(parquet_root_dir).as_posix() - create_metadata_file( - concat_files, root_dir=parquet_root_dir, engine="pyarrow", fs=fs - ) + create_metadata_file(concat_files, root_dir=parquet_root_dir, engine="pyarrow", fs=fs) logger.info(f"_metadata file written to {parquet_root_dir}") @@ -450,9 +409,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): if not results_json_files: raise ValueError("No simulation results found to post-process.") - logger.info( - "Collecting all the columns and datatypes in results_job*.json.gz parquet files." - ) + logger.info("Collecting all the columns and datatypes in results_job*.json.gz parquet files.") all_schema_dict = ( db.from_sequence(results_json_files) .map(partial(get_schema_dict, fs)) @@ -461,13 +418,10 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ) logger.info(f"Got {len(all_schema_dict)} columns") all_results_cols = list(all_schema_dict.keys()) - all_schema_dict = { - to_camelcase(key): value for key, value in all_schema_dict.items() - } + all_schema_dict = {to_camelcase(key): value for key, value in all_schema_dict.items()} logger.info(f"Got this schema: {all_schema_dict}\n") delayed_results_dfs = [ - dask.delayed(partial(read_results_json, fs, all_cols=all_results_cols))(x) - for x in results_json_files + dask.delayed(partial(read_results_json, fs, all_cols=all_results_cols))(x) for x in results_json_files ] results_df = dd.from_delayed(delayed_results_dfs, verify_meta=False) @@ -480,25 +434,15 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ts_filenames = fs.ls(upgrade_folder) if ts_filenames: do_timeseries = True - logger.info( - f"Found {len(ts_filenames)} files for upgrade {Path(upgrade_folder).name}." 
- ) + logger.info(f"Found {len(ts_filenames)} files for upgrade {Path(upgrade_folder).name}.") files_bag = db.from_sequence(ts_filenames, partition_size=100) - all_ts_cols |= ( - files_bag.map(partial(get_cols, fs)) - .fold(lambda x, y: x.union(y)) - .compute() - ) + all_ts_cols |= files_bag.map(partial(get_cols, fs)).fold(lambda x, y: x.union(y)).compute() logger.info("Collected all the columns") else: - logger.info( - f"There are no timeseries files for upgrade {Path(upgrade_folder).name}." - ) + logger.info(f"There are no timeseries files for upgrade {Path(upgrade_folder).name}.") # Sort the columns - all_ts_cols_sorted = ["building_id"] + sorted( - x for x in all_ts_cols if x.startswith("time") - ) + all_ts_cols_sorted = ["building_id"] + sorted(x for x in all_ts_cols if x.startswith("time")) all_ts_cols.difference_update(all_ts_cols_sorted) all_ts_cols_sorted.extend(sorted(x for x in all_ts_cols if not x.endswith("]"))) all_ts_cols.difference_update(all_ts_cols_sorted) @@ -515,9 +459,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): df_partition_columns = [f"build_existing_model.{c}" for c in partition_columns] missing_cols = set(df_partition_columns) - set(all_schema_dict.keys()) if missing_cols: - raise ValueError( - f"The following partitioning columns are not found in results.json: {missing_cols}" - ) + raise ValueError(f"The following partitioning columns are not found in results.json: {missing_cols}") if partition_columns: logger.info(f"The timeseries files will be partitioned by {partition_columns}.") @@ -534,16 +476,12 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): schema = None partition_df = df[df_partition_columns].copy() partition_df.rename( - columns={ - df_c: c for df_c, c in zip(df_partition_columns, partition_columns) - }, + columns={df_c: c for df_c, c in zip(df_partition_columns, partition_columns)}, inplace=True, ) if upgrade_id > 0: # Remove building characteristics for upgrade scenarios. - cols_to_keep = list( - filter(lambda x: not x.startswith("build_existing_model."), df.columns) - ) + cols_to_keep = list(filter(lambda x: not x.startswith("build_existing_model."), df.columns)) df = df[cols_to_keep] null_cols = get_null_cols(df) # If certain column datatype is null (happens when it doesn't have any data), the datatype @@ -552,13 +490,9 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): logger.info(f"Upgrade {upgrade_id} has null cols: {null_cols}") schema, unresolved = correct_schema(all_schema_dict, df) if unresolved: - logger.info( - f"The types for {unresolved} columns couldn't be determined." - ) + logger.info(f"The types for {unresolved} columns couldn't be determined.") else: - logger.info( - "All columns were successfully assigned a datatype based on other upgrades." 
- ) + logger.info("All columns were successfully assigned a datatype based on other upgrades.") # Write CSV csv_filename = f"{results_csvs_dir}/results_up{upgrade_id:02d}.csv.gz" logger.info(f"Writing {csv_filename}") @@ -575,53 +509,30 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): fs.makedirs(results_parquet_dir) parquet_filename = f"{results_parquet_dir}/results_up{upgrade_id:02d}.parquet" logger.info(f"Writing {parquet_filename}") - write_dataframe_as_parquet( - df.reset_index(), fs, parquet_filename, schema=schema - ) + write_dataframe_as_parquet(df.reset_index(), fs, parquet_filename, schema=schema) if do_timeseries: # Get the names of the timeseries file for each simulation in this upgrade ts_upgrade_path = f"{ts_in_dir}/up{upgrade_id:02d}" - ts_filenames = [ - ts_upgrade_path + ts_filename for ts_filename in fs.ls(ts_upgrade_path) - ] - ts_bldg_ids = [ - int(re.search(r"bldg(\d+).parquet", flname).group(1)) - for flname in ts_filenames - ] + ts_filenames = [ts_upgrade_path + ts_filename for ts_filename in fs.ls(ts_upgrade_path)] + ts_bldg_ids = [int(re.search(r"bldg(\d+).parquet", flname).group(1)) for flname in ts_filenames] if not ts_filenames: - logger.warning( - f"There are no timeseries files for upgrade{upgrade_id}." - ) + logger.warning(f"There are no timeseries files for upgrade{upgrade_id}.") continue - logger.info( - f"There are {len(ts_filenames)} timeseries files for upgrade{upgrade_id}." - ) + logger.info(f"There are {len(ts_filenames)} timeseries files for upgrade{upgrade_id}.") # Calculate the mean and estimate the total memory usage - read_ts_parquet = partial( - read_enduse_timeseries_parquet, fs, all_ts_cols_sorted, ts_upgrade_path - ) - get_ts_mem_usage_d = dask.delayed( - lambda x: read_ts_parquet(x).memory_usage(deep=True).sum() - ) + read_ts_parquet = partial(read_enduse_timeseries_parquet, fs, all_ts_cols_sorted, ts_upgrade_path) + get_ts_mem_usage_d = dask.delayed(lambda x: read_ts_parquet(x).memory_usage(deep=True).sum()) sample_size = min(len(ts_bldg_ids), 36 * 3) - mean_mem = np.mean( - dask.compute( - map(get_ts_mem_usage_d, random.sample(ts_bldg_ids, sample_size)) - )[0] - ) + mean_mem = np.mean(dask.compute(map(get_ts_mem_usage_d, random.sample(ts_bldg_ids, sample_size)))[0]) # Determine how many files should be in each partition and group the files parquet_memory = int( - cfg.get("eagle", {}) - .get("postprocessing", {}) - .get("parquet_memory_mb", MAX_PARQUET_MEMORY) + cfg.get("eagle", {}).get("postprocessing", {}).get("parquet_memory_mb", MAX_PARQUET_MEMORY) ) logger.info(f"Max parquet memory: {parquet_memory} MB") - max_files_per_partition = max( - 1, math.floor(parquet_memory / (mean_mem / 1e6)) - ) + max_files_per_partition = max(1, math.floor(parquet_memory / (mean_mem / 1e6))) partition_df = partition_df.loc[ts_bldg_ids].copy() logger.info(f"partition_df for the upgrade has {len(partition_df)} rows.") bldg_id_groups, bldg_id_list, ngroup = get_partitioned_bldg_groups( @@ -640,9 +551,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ts_out_loc = f"s3://{ts_dir}/upgrade={upgrade_id}" fs.makedirs(ts_out_loc) - logger.info( - f"Created directory {ts_out_loc} for writing. Now concatenating ..." - ) + logger.info(f"Created directory {ts_out_loc} for writing. 
Now concatenating ...") src_path = f"{ts_in_dir}/up{upgrade_id:02d}" concat_partial = dask.delayed( @@ -656,11 +565,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ) ) partition_vals_list = [ - ( - list(partition_df.loc[bldg_id_list[0]].values) - if partition_columns - else [] - ) + (list(partition_df.loc[bldg_id_list[0]].values) if partition_columns else []) for bldg_id_list in bldg_id_groups ] @@ -680,9 +585,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): f"{results_dir}/dask_combine_report{upgrade_id}.html", ) - logger.info( - f"Finished combining and saving timeseries for upgrade{upgrade_id}." - ) + logger.info(f"Finished combining and saving timeseries for upgrade{upgrade_id}.") logger.info("All aggregation completed. ") if do_timeseries: logger.info("Writing timeseries metadata files") @@ -708,9 +611,7 @@ def upload_results(aws_conf, output_dir, results_dir, buildstock_csv_filename): parquet_dir = Path(results_dir).joinpath("parquet") ts_dir = parquet_dir / "timeseries" if not parquet_dir.is_dir(): - logger.error( - f"{parquet_dir} does not exist. Please make sure postprocessing has been done." - ) + logger.error(f"{parquet_dir} does not exist. Please make sure postprocessing has been done.") raise FileNotFoundError(parquet_dir) all_files = [] @@ -722,9 +623,7 @@ def upload_results(aws_conf, output_dir, results_dir, buildstock_csv_filename): s3_prefix = aws_conf.get("s3", {}).get("prefix", "").rstrip("/") s3_bucket = aws_conf.get("s3", {}).get("bucket", None) if not (s3_prefix and s3_bucket): - logger.error( - "YAML file missing postprocessing:aws:s3:prefix and/or bucket entry." - ) + logger.error("YAML file missing postprocessing:aws:s3:prefix and/or bucket entry.") return s3_prefix_output = s3_prefix + "/" + output_folder_name + "/" @@ -732,15 +631,11 @@ def upload_results(aws_conf, output_dir, results_dir, buildstock_csv_filename): bucket = s3.Bucket(s3_bucket) n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix_output))) if n_existing_files > 0: - logger.error( - f"There are already {n_existing_files} files in the s3 folder {s3_bucket}/{s3_prefix_output}." - ) + logger.error(f"There are already {n_existing_files} files in the s3 folder {s3_bucket}/{s3_prefix_output}.") raise FileExistsError(f"s3://{s3_bucket}/{s3_prefix_output}") def upload_file(filepath, s3key=None): - full_path = ( - filepath if filepath.is_absolute() else parquet_dir.joinpath(filepath) - ) + full_path = filepath if filepath.is_absolute() else parquet_dir.joinpath(filepath) s3 = boto3.resource("s3") bucket = s3.Bucket(s3_bucket) if s3key is None: @@ -760,9 +655,7 @@ def upload_file(filepath, s3key=None): else: logger.warning(f"{buildstock_csv_filename} doesn't exist, can't upload.") dask.compute(tasks) - logger.info( - f"Upload to S3 completed. The files are uploaded to: {s3_bucket}/{s3_prefix_output}" - ) + logger.info(f"Upload to S3 completed. 
The files are uploaded to: {s3_bucket}/{s3_prefix_output}") return s3_bucket, s3_prefix_output @@ -771,9 +664,7 @@ def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix): region_name = aws_conf.get("region_name", "us-west-2") db_name = aws_conf.get("athena", {}).get("database_name", None) - role = aws_conf.get("athena", {}).get( - "glue_service_role", "service-role/AWSGlueServiceRole-default" - ) + role = aws_conf.get("athena", {}).get("glue_service_role", "service-role/AWSGlueServiceRole-default") max_crawling_time = aws_conf.get("athena", {}).get("max_crawling_time", 600) assert db_name, "athena:database_name not supplied" @@ -783,17 +674,11 @@ def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix): s3_path = f"s3://{s3_bucket}/{s3_prefix}" n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix))) if n_existing_files == 0: - logger.warning( - f"There are no files in {s3_path}, Athena tables will not be created as intended" - ) + logger.warning(f"There are no files in {s3_path}, Athena tables will not be created as intended") return glueClient = boto3.client("glue", region_name=region_name) - crawlTarget = { - "S3Targets": [ - {"Path": s3_path, "Exclusions": ["**_metadata", "**_common_metadata"]} - ] - } + crawlTarget = {"S3Targets": [{"Path": s3_path, "Exclusions": ["**_metadata", "**_common_metadata"]}]} crawler_name = db_name + "_" + tbl_prefix tbl_prefix = tbl_prefix + "_" @@ -811,26 +696,18 @@ def create_crawler(): except glueClient.exceptions.AlreadyExistsException: logger.info(f"Deleting existing crawler: {crawler_name}. And creating new one.") glueClient.delete_crawler(Name=crawler_name) - time.sleep( - 1 - ) # A small delay after deleting is required to prevent AlreadyExistsException again + time.sleep(1) # A small delay after deleting is required to prevent AlreadyExistsException again create_crawler() try: - existing_tables = [ - x["Name"] for x in glueClient.get_tables(DatabaseName=db_name)["TableList"] - ] + existing_tables = [x["Name"] for x in glueClient.get_tables(DatabaseName=db_name)["TableList"]] except glueClient.exceptions.EntityNotFoundException: existing_tables = [] to_be_deleted_tables = [x for x in existing_tables if x.startswith(tbl_prefix)] if to_be_deleted_tables: - logger.info( - f"Deleting existing tables in db {db_name}: {to_be_deleted_tables}. And creating new ones." - ) - glueClient.batch_delete_table( - DatabaseName=db_name, TablesToDelete=to_be_deleted_tables - ) + logger.info(f"Deleting existing tables in db {db_name}: {to_be_deleted_tables}. And creating new ones.") + glueClient.batch_delete_table(DatabaseName=db_name, TablesToDelete=to_be_deleted_tables) glueClient.start_crawler(Name=crawler_name) logger.info("Crawler started") @@ -838,9 +715,7 @@ def create_crawler(): t = time.time() while time.time() - t < (3 * max_crawling_time): crawler_state = glueClient.get_crawler(Name=crawler_name)["Crawler"]["State"] - metrics = glueClient.get_crawler_metrics(CrawlerNameList=[crawler_name])[ - "CrawlerMetricsList" - ][0] + metrics = glueClient.get_crawler_metrics(CrawlerNameList=[crawler_name])["CrawlerMetricsList"][0] if is_crawler_running and crawler_state != "RUNNING": is_crawler_running = False logger.info(f"Crawler has completed running. 
It is {crawler_state}.") diff --git a/buildstockbatch/sampler/base.py b/buildstockbatch/sampler/base.py index 7d460246..8628aa7e 100644 --- a/buildstockbatch/sampler/base.py +++ b/buildstockbatch/sampler/base.py @@ -47,20 +47,14 @@ def __init__(self, parent): :param parent: The BuildStockBatchBase object that owns this sampler. """ - self.parent = weakref.ref( - parent - ) # This removes circular references and allows garbage collection to work. + self.parent = weakref.ref(parent) # This removes circular references and allows garbage collection to work. if self.container_runtime in ( ContainerRuntime.DOCKER, ContainerRuntime.LOCAL_OPENSTUDIO, ): - self.csv_path = os.path.join( - self.project_dir, "housing_characteristics", "buildstock.csv" - ) + self.csv_path = os.path.join(self.project_dir, "housing_characteristics", "buildstock.csv") elif self.container_runtime == ContainerRuntime.SINGULARITY: - self.csv_path = os.path.join( - self.parent().output_dir, "housing_characteristics", "buildstock.csv" - ) + self.csv_path = os.path.join(self.parent().output_dir, "housing_characteristics", "buildstock.csv") else: self.csv_path = None diff --git a/buildstockbatch/sampler/commercial_sobol.py b/buildstockbatch/sampler/commercial_sobol.py index 2cdfbb98..7bd47979 100644 --- a/buildstockbatch/sampler/commercial_sobol.py +++ b/buildstockbatch/sampler/commercial_sobol.py @@ -63,10 +63,7 @@ def validate_args(cls, project_filename, **kw): else: raise ValidationError(f"Unknown argument for sampler: {k}") if len(expected_args) > 0: - raise ValidationError( - "The following sampler arguments are required: " - + ", ".join(expected_args) - ) + raise ValidationError("The following sampler arguments are required: " + ", ".join(expected_args)) return True def run_sampling(self): @@ -88,15 +85,11 @@ def run_sampling(self): for tsv_file in os.listdir(self.buildstock_dir): if ".tsv" in tsv_file: tsv_df = read_csv(os.path.join(self.buildstock_dir, tsv_file), sep="\t") - dependency_columns = [ - item for item in list(tsv_df) if "Dependency=" in item - ] + dependency_columns = [item for item in list(tsv_df) if "Dependency=" in item] tsv_df[dependency_columns] = tsv_df[dependency_columns].astype("str") tsv_hash[tsv_file.replace(".tsv", "")] = tsv_df dependency_hash, attr_order = self._com_order_tsvs(tsv_hash) - sample_matrix = self._com_execute_sobol_sampling( - attr_order.__len__(), sample_number - ) + sample_matrix = self._com_execute_sobol_sampling(attr_order.__len__(), sample_number) csv_path = self.csv_path header = "Building," for item in attr_order: @@ -132,9 +125,7 @@ def _com_execute_sobol_sampling(n_dims, n_samples): :param n_samples: Number of samples to calculate :return: Pandas DataFrame object which contains the low discrepancy result of the sobol algorithm """ - return pd.DataFrame(i4_sobol_generate(n_dims, n_samples, 0)).replace( - 1.0, 0.999999 - ) + return pd.DataFrame(i4_sobol_generate(n_dims, n_samples, 0)).replace(1.0, 0.999999) @staticmethod def _com_order_tsvs(tsv_hash): @@ -147,9 +138,7 @@ def _com_order_tsvs(tsv_hash): dependency_hash = {} for attr in tsv_hash.keys(): dependency_hash[attr] = [ - item.replace("Dependency=", "") - for item in list(tsv_hash[attr]) - if "Dependency=" in item + item.replace("Dependency=", "") for item in list(tsv_hash[attr]) if "Dependency=" in item ] attr_order = [] for attr in dependency_hash.keys(): @@ -171,9 +160,7 @@ def _com_order_tsvs(tsv_hash): elif max_iterations > 0: max_iterations -= 1 else: - raise RuntimeError( - "Unable to resolve the dependency tree 
within the set iteration limit" - ) + raise RuntimeError("Unable to resolve the dependency tree within the set iteration limit") return dependency_hash, attr_order @staticmethod @@ -207,8 +194,7 @@ def _com_execute_sample( tsv_dist_val = sample_vector[attr_index] for dependency in sample_dependency_hash[attr]: tsv_lkup = tsv_lkup.loc[ - tsv_lkup.loc[:, "Dependency=" + dependency] - == sample_dependency_hash[dependency] + tsv_lkup.loc[:, "Dependency=" + dependency] == sample_dependency_hash[dependency] ] tsv_lkup = tsv_lkup.drop("Dependency=" + dependency, axis=1) if tsv_lkup.shape[0] == 0: @@ -219,17 +205,9 @@ def _com_execute_sample( ) return if tsv_lkup.shape[0] != 1: - raise RuntimeError( - "Unable to reduce tsv for {} to 1 row, index {}".format( - attr, sample_index - ) - ) + raise RuntimeError("Unable to reduce tsv for {} to 1 row, index {}".format(attr, sample_index)) tsv_lkup_cdf = tsv_lkup.values.cumsum() > tsv_dist_val - option_values = [ - item.replace("Option=", "") - for item in list(tsv_lkup) - if "Option=" in item - ] + option_values = [item.replace("Option=", "") for item in list(tsv_lkup) if "Option=" in item] attr_result = list(compress(option_values, tsv_lkup_cdf))[0] sample_dependency_hash[attr] = attr_result result_vector.append(attr_result) diff --git a/buildstockbatch/sampler/downselect.py b/buildstockbatch/sampler/downselect.py index 9390d7a4..72529ff6 100644 --- a/buildstockbatch/sampler/downselect.py +++ b/buildstockbatch/sampler/downselect.py @@ -44,11 +44,7 @@ def __init__(self, parent, n_datapoints, logic, resample=True, **kw): """ super().__init__(parent) self.validate_args( - self.parent().project_filename, - n_datapoints=n_datapoints, - logic=logic, - resample=resample, - **kw + self.parent().project_filename, n_datapoints=n_datapoints, logic=logic, resample=resample, **kw ) self.logic = logic self.resample = resample @@ -71,10 +67,7 @@ def validate_args(cls, project_filename, **kw): else: extra_kw[k] = v if len(expected_args) > 0: - raise ValidationError( - "The following sampler arguments are required: " - + ", ".join(expected_args) - ) + raise ValidationError("The following sampler arguments are required: " + ", ".join(expected_args)) cls.SUB_SAMPLER_CLASS.validate_args(project_filename, **extra_kw) return True @@ -107,31 +100,21 @@ def downselect_logic(cls, df, logic): def run_sampling(self): if self.resample: - logger.debug( - "Performing initial sampling to figure out number of samples for downselect" - ) + logger.debug("Performing initial sampling to figure out number of samples for downselect") n_samples_init = 350000 - init_sampler = self.SUB_SAMPLER_CLASS( - self.parent(), n_datapoints=n_samples_init, **self.sub_kw - ) + init_sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples_init, **self.sub_kw) buildstock_csv_filename = init_sampler.run_sampling() df = read_csv(buildstock_csv_filename, index_col=0, dtype=str) df_new = df[self.downselect_logic(df, self.logic)] downselected_n_samples_init = df_new.shape[0] - n_samples = math.ceil( - self.n_datapoints * n_samples_init / downselected_n_samples_init - ) + n_samples = math.ceil(self.n_datapoints * n_samples_init / downselected_n_samples_init) os.remove(buildstock_csv_filename) del init_sampler else: n_samples = self.n_datapoints - sampler = self.SUB_SAMPLER_CLASS( - self.parent(), n_datapoints=n_samples, **self.sub_kw - ) + sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples, **self.sub_kw) buildstock_csv_filename = sampler.run_sampling() - with gzip.open( - 
os.path.splitext(buildstock_csv_filename)[0] + "_orig.csv.gz", "wb" - ) as f_out: + with gzip.open(os.path.splitext(buildstock_csv_filename)[0] + "_orig.csv.gz", "wb") as f_out: with open(buildstock_csv_filename, "rb") as f_in: shutil.copyfileobj(f_in, f_out) df = read_csv(buildstock_csv_filename, index_col=0, dtype="str") diff --git a/buildstockbatch/sampler/residential_quota.py b/buildstockbatch/sampler/residential_quota.py index c94f264a..35e97522 100644 --- a/buildstockbatch/sampler/residential_quota.py +++ b/buildstockbatch/sampler/residential_quota.py @@ -51,10 +51,7 @@ def validate_args(cls, project_filename, **kw): else: raise ValidationError(f"Unknown argument for sampler: {k}") if len(expected_args) > 0: - raise ValidationError( - "The following sampler arguments are required: " - + ", ".join(expected_args) - ) + raise ValidationError("The following sampler arguments are required: " + ", ".join(expected_args)) return True def _run_sampling_docker(self): @@ -76,9 +73,7 @@ def _run_sampling_docker(self): "buildstock.csv", ], remove=True, - volumes={ - self.buildstock_dir: {"bind": "/var/simdata/openstudio", "mode": "rw"} - }, + volumes={self.buildstock_dir: {"bind": "/var/simdata/openstudio", "mode": "rw"}}, name="buildstock_sampling", **extra_kws, ) diff --git a/buildstockbatch/test/conftest.py b/buildstockbatch/test/conftest.py index 54a50d37..554c688a 100644 --- a/buildstockbatch/test/conftest.py +++ b/buildstockbatch/test/conftest.py @@ -36,22 +36,14 @@ def _basic_residential_project_file(update_args={}, raw=False): ) # move the job*.json file to appropriate location - if os.path.exists( - os.path.join(output_directory, "simulation_output", "job0.json") - ): + if os.path.exists(os.path.join(output_directory, "simulation_output", "job0.json")): shutil.move( os.path.join(output_directory, "simulation_output", "job0.json"), - os.path.join( - output_directory, "simulation_output", "..", "..", "job0.json" - ), + os.path.join(output_directory, "simulation_output", "..", "..", "job0.json"), ) os.mkdir(os.path.join(output_directory, "housing_characteristics")) - os.mkdir( - os.path.join( - buildstock_directory, project_directory, "housing_characteristics" - ) - ) + os.mkdir(os.path.join(buildstock_directory, project_directory, "housing_characteristics")) cfg = { "buildstock_directory": buildstock_directory, "project_directory": project_directory, diff --git a/buildstockbatch/test/shared_testing_stuff.py b/buildstockbatch/test/shared_testing_stuff.py index 4e33ac43..6edc738f 100644 --- a/buildstockbatch/test/shared_testing_stuff.py +++ b/buildstockbatch/test/shared_testing_stuff.py @@ -9,6 +9,4 @@ pathlib.Path(__file__).resolve().parent.parent.parent.parent / "resstock", ) ) -resstock_required = pytest.mark.skipif( - not resstock_directory.exists(), reason="ResStock checkout is not found" -) +resstock_required = pytest.mark.skipif(not resstock_directory.exists(), reason="ResStock checkout is not found") diff --git a/buildstockbatch/test/test_base.py b/buildstockbatch/test/test_base.py index 7ae03173..a2a2d6b7 100644 --- a/buildstockbatch/test/test_base.py +++ b/buildstockbatch/test/test_base.py @@ -45,25 +45,16 @@ def test_reference_scenario(basic_residential_project_file): with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( BuildStockBatchBase, "get_dask_client" - ) as get_dask_client_mock, patch.object( - BuildStockBatchBase, "results_dir", results_dir - ): + ) as get_dask_client_mock, patch.object(BuildStockBatchBase, "results_dir", results_dir): bsb = 
BuildStockBatchBase(project_filename) bsb.process_results() get_dask_client_mock.assert_called_once() # test results.csv files test_path = os.path.join(results_dir, "results_csvs") - test_csv = ( - read_csv(os.path.join(test_path, "results_up01.csv.gz")) - .set_index("building_id") - .sort_index() - ) + test_csv = read_csv(os.path.join(test_path, "results_up01.csv.gz")).set_index("building_id").sort_index() assert len(test_csv["apply_upgrade.reference_scenario"].unique()) == 1 - assert ( - test_csv["apply_upgrade.reference_scenario"].iloc[0] - == "example_reference_scenario" - ) + assert test_csv["apply_upgrade.reference_scenario"].iloc[0] == "example_reference_scenario" def test_downselect_integer_options(basic_residential_project_file, mocker): @@ -80,9 +71,7 @@ def test_downselect_integer_options(basic_residential_project_file, mocker): col_idx = row.index("Days Shifted") else: # Convert values from "Day1" to "1.10" so we hit the bug - row[col_idx] = "{0}.{0}0".format( - re.search(r"Day(\d+)", row[col_idx]).group(1) - ) + row[col_idx] = "{0}.{0}0".format(re.search(r"Day(\d+)", row[col_idx]).group(1)) valid_option_values.add(row[col_idx]) cf_out.writerow(row) @@ -100,9 +89,7 @@ def test_downselect_integer_options(basic_residential_project_file, mocker): ) mocker.patch.object(BuildStockBatchBase, "weather_dir", None) mocker.patch.object(BuildStockBatchBase, "results_dir", results_dir) - sampler_property_mock = mocker.patch.object( - BuildStockBatchBase, "sampler", new_callable=PropertyMock - ) + sampler_property_mock = mocker.patch.object(BuildStockBatchBase, "sampler", new_callable=PropertyMock) sampler_mock = mocker.MagicMock() sampler_property_mock.return_value = sampler_mock sampler_mock.run_sampling = MagicMock(return_value=buildstock_csv) @@ -141,9 +128,7 @@ def test_upload_files(mocked_boto3, basic_residential_project_file): } } mocked_glueclient = MagicMock() - mocked_glueclient.get_crawler = MagicMock( - return_value={"Crawler": {"State": "READY"}} - ) + mocked_glueclient.get_crawler = MagicMock(return_value={"Crawler": {"State": "READY"}}) mocked_boto3.client = MagicMock(return_value=mocked_glueclient) mocked_boto3.resource().Bucket().objects.filter.side_effect = [[], ["a", "b", "c"]] project_filename, results_dir = basic_residential_project_file(upload_config) @@ -155,17 +140,12 @@ def test_upload_files(mocked_boto3, basic_residential_project_file): / "buildstock.csv" ) # noqa: E501 shutil.copy2( - Path(__file__).parent - / "test_results" - / "housing_characteristics" - / "buildstock.csv", + Path(__file__).parent / "test_results" / "housing_characteristics" / "buildstock.csv", buildstock_csv_path, ) with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( BuildStockBatchBase, "output_dir", results_dir - ), patch.object( - BuildStockBatchBase, "get_dask_client" - ) as get_dask_client_mock, patch.object( + ), patch.object(BuildStockBatchBase, "get_dask_client") as get_dask_client_mock, patch.object( BuildStockBatchBase, "results_dir", results_dir ), patch.object( BuildStockBatchBase, "CONTAINER_RUNTIME", ContainerRuntime.LOCAL_OPENSTUDIO @@ -190,25 +170,13 @@ def test_upload_files(mocked_boto3, basic_residential_project_file): if call_function == "create_crawler": crawler_para = call[2] # 2 is for the keyword arguments crawler_created = True - assert ( - crawler_para["DatabaseName"] - == upload_config["postprocessing"]["aws"]["athena"]["database_name"] - ) - assert ( - crawler_para["Role"] - == 
upload_config["postprocessing"]["aws"]["athena"]["glue_service_role"] - ) + assert crawler_para["DatabaseName"] == upload_config["postprocessing"]["aws"]["athena"]["database_name"] + assert crawler_para["Role"] == upload_config["postprocessing"]["aws"]["athena"]["glue_service_role"] assert crawler_para["TablePrefix"] == OUTPUT_FOLDER_NAME + "_" assert crawler_para["Name"] == db_name + "_" + OUTPUT_FOLDER_NAME assert ( crawler_para["Targets"]["S3Targets"][0]["Path"] - == "s3://" - + s3_bucket - + "/" - + s3_prefix - + "/" - + OUTPUT_FOLDER_NAME - + "/" + == "s3://" + s3_bucket + "/" + s3_prefix + "/" + OUTPUT_FOLDER_NAME + "/" ) if call_function == "start_crawler": assert crawler_created, "crawler attempted to start before creating" @@ -228,23 +196,17 @@ def test_upload_files(mocked_boto3, basic_residential_project_file): files_uploaded.remove((source_file_path, s3_file_path)) s3_file_path = s3_path + "upgrades/upgrade=1/results_up01.parquet" - source_file_path = os.path.join( - source_path, "upgrades", "upgrade=1", "results_up01.parquet" - ) + source_file_path = os.path.join(source_path, "upgrades", "upgrade=1", "results_up01.parquet") assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) s3_file_path = s3_path + "timeseries/upgrade=0/group0.parquet" - source_file_path = os.path.join( - source_path, "timeseries", "upgrade=0", "group0.parquet" - ) + source_file_path = os.path.join(source_path, "timeseries", "upgrade=0", "group0.parquet") assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) s3_file_path = s3_path + "timeseries/upgrade=1/group0.parquet" - source_file_path = os.path.join( - source_path, "timeseries", "upgrade=1", "group0.parquet" - ) + source_file_path = os.path.join(source_path, "timeseries", "upgrade=1", "group0.parquet") assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) @@ -263,9 +225,7 @@ def test_upload_files(mocked_boto3, basic_residential_project_file): assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) - assert ( - len(files_uploaded) == 0 - ), f"These files shouldn't have been uploaded: {files_uploaded}" + assert len(files_uploaded) == 0, f"These files shouldn't have been uploaded: {files_uploaded}" def test_write_parquet_no_index(): @@ -286,9 +246,7 @@ def test_skipping_baseline(basic_residential_project_file): ) sim_output_path = os.path.join(results_dir, "simulation_output") - shutil.rmtree( - os.path.join(sim_output_path, "timeseries", "up00") - ) # remove timeseries results for baseline + shutil.rmtree(os.path.join(sim_output_path, "timeseries", "up00")) # remove timeseries results for baseline # remove results.csv data for baseline from results_jobx.json.gz results_json_filename = os.path.join(sim_output_path, "results_job0.json.gz") @@ -308,22 +266,16 @@ def test_skipping_baseline(basic_residential_project_file): # run postprocessing with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( BuildStockBatchBase, "get_dask_client" - ) as get_dask_client_mock, patch.object( - BuildStockBatchBase, "results_dir", results_dir - ): + ) as get_dask_client_mock, patch.object(BuildStockBatchBase, "results_dir", results_dir): bsb = BuildStockBatchBase(project_filename) bsb.process_results() get_dask_client_mock.assert_called_once() - up00_parquet = os.path.join( - results_dir, "parquet", "baseline", 
"results_up00.parquet" - ) + up00_parquet = os.path.join(results_dir, "parquet", "baseline", "results_up00.parquet") assert not os.path.exists(up00_parquet) - up01_parquet = os.path.join( - results_dir, "parquet", "upgrades", "upgrade=1", "results_up01.parquet" - ) + up01_parquet = os.path.join(results_dir, "parquet", "upgrades", "upgrade=1", "results_up01.parquet") assert os.path.exists(up01_parquet) up00_csv_gz = os.path.join(results_dir, "results_csvs", "results_up00.csv.gz") @@ -346,9 +298,7 @@ def test_provide_buildstock_csv(basic_residential_project_file, mocker): sampling_output_csv = bsb.sampler.run_sampling() df2 = read_csv(sampling_output_csv, dtype=str) pd.testing.assert_frame_equal(df, df2) - assert ( - df["Geometry Shared Walls"] == "None" - ).all() # Verify None is being read properly + assert (df["Geometry Shared Walls"] == "None").all() # Verify None is being read properly # Test file missing with open(project_filename, "r") as f: cfg = yaml.safe_load(f) diff --git a/buildstockbatch/test/test_eagle.py b/buildstockbatch/test/test_eagle.py index c33d8f1e..822e1297 100644 --- a/buildstockbatch/test/test_eagle.py +++ b/buildstockbatch/test/test_eagle.py @@ -19,15 +19,10 @@ def test_hpc_run_building(mock_subprocess, monkeypatch, basic_residential_project_file): tar_filename = ( - pathlib.Path(__file__).resolve().parent - / "test_results" - / "simulation_output" - / "simulations_job0.tar.gz" + pathlib.Path(__file__).resolve().parent / "test_results" / "simulation_output" / "simulations_job0.tar.gz" ) # noqa E501 with tarfile.open(tar_filename, "r") as tarf: - osw_dict = json.loads( - tarf.extractfile("up00/bldg0000001/in.osw").read().decode("utf-8") - ) + osw_dict = json.loads(tarf.extractfile("up00/bldg0000001/in.osw").read().decode("utf-8")) project_filename, results_dir = basic_residential_project_file() tmp_path = pathlib.Path(results_dir).parent @@ -38,9 +33,7 @@ def test_hpc_run_building(mock_subprocess, monkeypatch, basic_residential_projec with patch.object(EagleBatch, "weather_dir", None), patch.object( EagleBatch, "create_osw", return_value=osw_dict - ), patch.object( - EagleBatch, "make_sim_dir", return_value=("bldg0000001up00", sim_path) - ), patch.object( + ), patch.object(EagleBatch, "make_sim_dir", return_value=("bldg0000001up00", sim_path)), patch.object( EagleBatch, "local_scratch", tmp_path ): @@ -123,17 +116,12 @@ def test_user_cli( argv = [project_filename] user_cli(argv) mock_subprocess.run.assert_called_once() - eagle_sh = os.path.abspath( - os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "eagle.sh") - ) + eagle_sh = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "eagle.sh")) assert mock_subprocess.run.call_args[0][0][-1] == eagle_sh assert "--time=20" in mock_subprocess.run.call_args[0][0] assert "--account=testaccount" in mock_subprocess.run.call_args[0][0] assert "--nodes=1" in mock_subprocess.run.call_args[0][0] - assert ( - "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" - in mock_subprocess.run.call_args[0][0] - ) + assert "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" in mock_subprocess.run.call_args[0][0] assert "--output=sampling.out" in mock_subprocess.run.call_args[0][0] assert "--qos=high" not in mock_subprocess.run.call_args[0][0] assert "0" == mock_subprocess.run.call_args[1]["env"]["MEASURESONLY"] @@ -146,10 +134,7 @@ def test_user_cli( assert "--time=20" in mock_subprocess.run.call_args[0][0] assert "--account=testaccount" in mock_subprocess.run.call_args[0][0] assert 
"--nodes=1" in mock_subprocess.run.call_args[0][0] - assert ( - "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" - in mock_subprocess.run.call_args[0][0] - ) + assert "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" in mock_subprocess.run.call_args[0][0] assert "--output=sampling.out" in mock_subprocess.run.call_args[0][0] assert "--qos=high" in mock_subprocess.run.call_args[0][0] assert "0" == mock_subprocess.run.call_args[1]["env"]["MEASURESONLY"] @@ -163,10 +148,7 @@ def test_user_cli( assert "--time=20" in mock_subprocess.run.call_args[0][0] assert "--account=testaccount" in mock_subprocess.run.call_args[0][0] assert "--nodes=1" in mock_subprocess.run.call_args[0][0] - assert ( - "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" - in mock_subprocess.run.call_args[0][0] - ) + assert "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" in mock_subprocess.run.call_args[0][0] assert "--output=sampling.out" in mock_subprocess.run.call_args[0][0] assert "--qos=high" not in mock_subprocess.run.call_args[0][0] assert "1" == mock_subprocess.run.call_args[1]["env"]["MEASURESONLY"] @@ -180,10 +162,7 @@ def test_user_cli( assert "--time=20" in mock_subprocess.run.call_args[0][0] assert "--account=testaccount" in mock_subprocess.run.call_args[0][0] assert "--nodes=1" in mock_subprocess.run.call_args[0][0] - assert ( - "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" - in mock_subprocess.run.call_args[0][0] - ) + assert "--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY" in mock_subprocess.run.call_args[0][0] assert "--output=sampling.out" in mock_subprocess.run.call_args[0][0] assert "--qos=high" not in mock_subprocess.run.call_args[0][0] assert "1" == mock_subprocess.run.call_args[1]["env"]["SAMPLINGONLY"] @@ -191,9 +170,7 @@ def test_user_cli( @patch("buildstockbatch.eagle.subprocess") -def test_qos_high_job_submit( - mock_subprocess, basic_residential_project_file, monkeypatch -): +def test_qos_high_job_submit(mock_subprocess, basic_residential_project_file, monkeypatch): mock_subprocess.run.return_value.stdout = "Submitted batch job 1\n" mock_subprocess.PIPE = None project_filename, results_dir = basic_residential_project_file() @@ -222,9 +199,7 @@ def test_qos_high_job_submit( assert "--qos=high" in mock_subprocess.run.call_args[0][0] -def test_queue_jobs_minutes_per_sim( - mocker, basic_residential_project_file, monkeypatch -): +def test_queue_jobs_minutes_per_sim(mocker, basic_residential_project_file, monkeypatch): mock_subprocess = mocker.patch("buildstockbatch.eagle.subprocess") mocker.patch.object(EagleBatch, "weather_dir", None) mock_subprocess.run.return_value.stdout = "Submitted batch job 1\n" @@ -272,14 +247,10 @@ def test_run_building_process(mocker, basic_residential_project_file): with open(results_dir / "job001.json", "w") as f: json.dump(job_json, f) - sample_buildstock_csv = pd.DataFrame.from_records( - [{"Building": i, "Dummy Column": i * i} for i in range(10)] - ) + sample_buildstock_csv = pd.DataFrame.from_records([{"Building": i, "Dummy Column": i * i} for i in range(10)]) os.makedirs(results_dir / "housing_characteristics", exist_ok=True) os.makedirs(results_dir / "weather", exist_ok=True) - sample_buildstock_csv.to_csv( - results_dir / "housing_characteristics" / "buildstock.csv", index=False - ) + sample_buildstock_csv.to_csv(results_dir / "housing_characteristics" / "buildstock.csv", index=False) def sequential_parallel(**kwargs): kw2 = kwargs.copy() @@ -290,12 +261,8 @@ def 
sequential_parallel(**kwargs): mocker.patch("buildstockbatch.eagle.Parallel", sequential_parallel) mocker.patch("buildstockbatch.eagle.subprocess") - mocker.patch.object( - EagleBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir" - ) - mocker.patch.object( - EagleBatch, "local_weather_dir", results_dir / "local_weather_dir" - ) + mocker.patch.object(EagleBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir") + mocker.patch.object(EagleBatch, "local_weather_dir", results_dir / "local_weather_dir") mocker.patch.object(EagleBatch, "local_output_dir", results_dir) mocker.patch.object( EagleBatch, @@ -308,20 +275,14 @@ def sequential_parallel(**kwargs): def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=False): real_upgrade_idx = 0 if upgrade_idx is None else upgrade_idx + 1 sim_id = f"bldg{building_id:07d}up{real_upgrade_idx:02d}" - sim_dir = os.path.join( - base_dir, f"up{real_upgrade_idx:02d}", f"bldg{building_id:07d}" - ) + sim_dir = os.path.join(base_dir, f"up{real_upgrade_idx:02d}", f"bldg{building_id:07d}") return sim_id, sim_dir mocker.patch.object(EagleBatch, "make_sim_dir", make_sim_dir_mock) - sampler_prop_mock = mocker.patch.object( - EagleBatch, "sampler", new_callable=mocker.PropertyMock - ) + sampler_prop_mock = mocker.patch.object(EagleBatch, "sampler", new_callable=mocker.PropertyMock) sampler_mock = mocker.MagicMock() sampler_prop_mock.return_value = sampler_mock - sampler_mock.csv_path = ( - results_dir.parent / "housing_characteristic2" / "buildstock.csv" - ) + sampler_mock.csv_path = results_dir.parent / "housing_characteristic2" / "buildstock.csv" sampler_mock.run_sampling = mocker.MagicMock(return_value="buildstock.csv") b = EagleBatch(project_filename) @@ -330,19 +291,11 @@ def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=Fal b.run_job_batch(1) # check results job-json - refrence_path = ( - pathlib.Path(__file__).resolve().parent / "test_results" / "reference_files" - ) + refrence_path = pathlib.Path(__file__).resolve().parent / "test_results" / "reference_files" - refrence_list = json.loads( - gzip.open(refrence_path / "results_job1.json.gz", "r").read() - ) + refrence_list = json.loads(gzip.open(refrence_path / "results_job1.json.gz", "r").read()) - output_list = json.loads( - gzip.open( - results_dir / "simulation_output" / "results_job1.json.gz", "r" - ).read() - ) + output_list = json.loads(gzip.open(results_dir / "simulation_output" / "results_job1.json.gz", "r").read()) refrence_list = [json.dumps(d) for d in refrence_list] output_list = [json.dumps(d) for d in output_list] @@ -352,35 +305,16 @@ def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=Fal ts_files = list(refrence_path.glob("**/*.parquet")) def compare_ts_parquets(source, dst): - test_pq = ( - pd.read_parquet(source) - .reset_index() - .drop(columns=["index"]) - .rename(columns=str.lower) - ) - reference_pq = ( - pd.read_parquet(dst) - .reset_index() - .drop(columns=["index"]) - .rename(columns=str.lower) - ) + test_pq = pd.read_parquet(source).reset_index().drop(columns=["index"]).rename(columns=str.lower) + reference_pq = pd.read_parquet(dst).reset_index().drop(columns=["index"]).rename(columns=str.lower) pd.testing.assert_frame_equal(test_pq, reference_pq) for file in ts_files: - results_file = ( - results_dir - / "results" - / "simulation_output" - / "timeseries" - / file.parent.name - / file.name - ) + results_file = results_dir / "results" / "simulation_output" / "timeseries" / 
file.parent.name / file.name compare_ts_parquets(file, results_file) # Check that buildstock.csv was trimmed properly - local_buildstock_df = read_csv( - results_dir / "local_housing_characteristics_dir" / "buildstock.csv", dtype=str - ) + local_buildstock_df = read_csv(results_dir / "local_housing_characteristics_dir" / "buildstock.csv", dtype=str) unique_buildings = {str(x[0]) for x in job_json["batch"]} assert len(unique_buildings) == len(local_buildstock_df) assert unique_buildings == set(local_buildstock_df["Building"]) @@ -395,15 +329,11 @@ def test_run_building_error_caught(mocker, basic_residential_project_file): with open(results_dir / "job001.json", "w") as f: json.dump(job_json, f) - sample_buildstock_csv = pd.DataFrame.from_records( - [{"Building": i, "Dummy Column": i * i} for i in range(10)] - ) + sample_buildstock_csv = pd.DataFrame.from_records([{"Building": i, "Dummy Column": i * i} for i in range(10)]) os.makedirs(results_dir / "housing_characteristics", exist_ok=True) os.makedirs(results_dir / "local_housing_characteristics", exist_ok=True) os.makedirs(results_dir / "weather", exist_ok=True) - sample_buildstock_csv.to_csv( - results_dir / "housing_characteristics" / "buildstock.csv", index=False - ) + sample_buildstock_csv.to_csv(results_dir / "housing_characteristics" / "buildstock.csv", index=False) def raise_error(*args, **kwargs): raise RuntimeError("A problem happened") @@ -420,12 +350,8 @@ def sequential_parallel(**kwargs): mocker.patch.object(EagleBatch, "run_building", raise_error) mocker.patch.object(EagleBatch, "local_output_dir", results_dir) mocker.patch.object(EagleBatch, "results_dir", results_dir) - mocker.patch.object( - EagleBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir" - ) - mocker.patch.object( - EagleBatch, "local_weather_dir", results_dir / "local_weather_dir" - ) + mocker.patch.object(EagleBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir") + mocker.patch.object(EagleBatch, "local_weather_dir", results_dir / "local_weather_dir") mocker.patch.object( EagleBatch, "local_housing_characteristics_dir", @@ -450,9 +376,7 @@ def test_rerun_failed_jobs(mocker, basic_residential_project_file): mocker.patch.object(EagleBatch, "results_dir", results_dir) process_results_mocker = mocker.patch.object(BuildStockBatchBase, "process_results") queue_jobs_mocker = mocker.patch.object(EagleBatch, "queue_jobs", return_value=[42]) - queue_post_processing_mocker = mocker.patch.object( - EagleBatch, "queue_post_processing" - ) + queue_post_processing_mocker = mocker.patch.object(EagleBatch, "queue_post_processing") b = EagleBatch(project_filename) diff --git a/buildstockbatch/test/test_local.py b/buildstockbatch/test/test_local.py index 5bc7d5ce..eec858af 100644 --- a/buildstockbatch/test/test_local.py +++ b/buildstockbatch/test/test_local.py @@ -44,11 +44,7 @@ def test_resstock_local_batch(project_filename): n_datapoints = 2 batch.cfg["sampler"]["args"]["n_datapoints"] = n_datapoints - local_weather_file = ( - resstock_directory.parent - / "weather" - / batch.cfg["weather_files_url"].split("/")[-1] - ) + local_weather_file = resstock_directory.parent / "weather" / batch.cfg["weather_files_url"].split("/")[-1] if local_weather_file.exists(): del batch.cfg["weather_files_url"] batch.cfg["weather_files_path"] = str(local_weather_file) @@ -63,12 +59,7 @@ def test_resstock_local_batch(project_filename): for upgrade_id in range(0, n_upgrades + 1): for bldg_id in range(1, n_datapoints + 1): - assert ( - simout_path - / "timeseries" - / 
f"up{upgrade_id:02d}" - / f"bldg{bldg_id:07d}.parquet" - ).exists() + assert (simout_path / "timeseries" / f"up{upgrade_id:02d}" / f"bldg{bldg_id:07d}.parquet").exists() batch.process_results() @@ -83,17 +74,9 @@ def test_resstock_local_batch(project_filename): ts_pq_path = out_path / "parquet" / "timeseries" for upgrade_id in range(0, n_upgrades + 1): assert (ts_pq_path / f"upgrade={upgrade_id}" / "group0.parquet").exists() - assert ( - out_path / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz" - ).exists() + assert (out_path / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz").exists() if upgrade_id >= 1: - upg_pq = ( - out_path - / "parquet" - / "upgrades" - / f"upgrade={upgrade_id}" - / f"results_up{upgrade_id:02d}.parquet" - ) + upg_pq = out_path / "parquet" / "upgrades" / f"upgrade={upgrade_id}" / f"results_up{upgrade_id:02d}.parquet" assert upg_pq.exists() upg = pd.read_parquet(upg_pq, columns=["completed_status"]) assert (upg["completed_status"] == "Success").all() @@ -114,9 +97,7 @@ def mocked_subprocess_run(run_cmd, **kwargs): mocker.patch("buildstockbatch.local.subprocess.run", mocked_subprocess_run) sleep_mock = mocker.patch("buildstockbatch.local.time.sleep") - cfg = get_project_configuration( - resstock_directory / "project_national" / "national_baseline.yml" - ) + cfg = get_project_configuration(resstock_directory / "project_national" / "national_baseline.yml") cfg["max_minutes_per_sim"] = 5 with tempfile.TemporaryDirectory() as tmpdir: @@ -145,9 +126,7 @@ def mocked_subprocess_run(run_cmd, **kwargs): assert out_osw["completed_status"] == "Fail" assert msg_re.search(out_osw["timeout"]) - err_log_re = re.compile( - r"\[\d\d:\d\d:\d\d ERROR\] Terminated \w+ after reaching max time" - ) + err_log_re = re.compile(r"\[\d\d:\d\d:\d\d ERROR\] Terminated \w+ after reaching max time") with open(sim_path / "run" / "run.log", "r") as run_log: err_log_re.search(run_log.read()) with open(sim_path / "run" / "failed.job", "r") as failed_job: diff --git a/buildstockbatch/test/test_postprocessing.py b/buildstockbatch/test/test_postprocessing.py index d11fdb86..667faa7f 100644 --- a/buildstockbatch/test/test_postprocessing.py +++ b/buildstockbatch/test/test_postprocessing.py @@ -19,9 +19,7 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): reporting_measures = ["ReportingMeasure1", "ReportingMeasure2"] - project_filename, results_dir = basic_residential_project_file( - {"reporting_measures": reporting_measures} - ) + project_filename, results_dir = basic_residential_project_file({"reporting_measures": reporting_measures}) fs = LocalFileSystem() @@ -42,11 +40,7 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): sim_dir = str(filename.parent.parent) upgrade_id = int(re.search(r"up(\d+)", sim_dir).group(1)) building_id = int(re.search(r"bldg(\d+)", sim_dir).group(1)) - dpouts2.append( - postprocessing.read_simulation_outputs( - fs, reporting_measures, sim_dir, upgrade_id, building_id - ) - ) + dpouts2.append(postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, building_id)) with gzip.open(sim_out_dir / "results_job0.json.gz", "wt", encoding="utf-8") as f: json.dump(dpouts2, f) @@ -56,9 +50,7 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False) for upgrade_id in (0, 1): - df = read_csv( - str(results_dir / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz") - ) + df = 
read_csv(str(results_dir / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz")) assert (df["reporting_measure1.column_1"] == 1).all() assert (df["reporting_measure1.column_2"] == 2).all() assert (df["reporting_measure2.column_3"] == 3).all() @@ -74,9 +66,7 @@ def test_empty_results_assertion(basic_residential_project_file, capsys): shutil.rmtree(sim_out_dir) # no results cfg = get_project_configuration(project_filename) - with pytest.raises( - ValueError, match=r"No simulation results found to post-process" - ): + with pytest.raises(ValueError, match=r"No simulation results found to post-process"): assert postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False) @@ -96,9 +86,7 @@ def test_large_parquet_combine(basic_residential_project_file): @pytest.mark.parametrize("keep_individual_timeseries", [True, False]) -def test_keep_individual_timeseries( - keep_individual_timeseries, basic_residential_project_file, mocker -): +def test_keep_individual_timeseries(keep_individual_timeseries, basic_residential_project_file, mocker): project_filename, results_dir = basic_residential_project_file( {"postprocessing": {"keep_individual_timeseries": keep_individual_timeseries}} ) @@ -122,9 +110,7 @@ def test_upgrade_missing_ts(basic_residential_project_file, mocker, caplog): project_filename, results_dir = basic_residential_project_file() results_path = pathlib.Path(results_dir) - for filename in (results_path / "simulation_output" / "timeseries" / "up01").glob( - "*.parquet" - ): + for filename in (results_path / "simulation_output" / "timeseries" / "up01").glob("*.parquet"): os.remove(filename) mocker.patch.object(BuildStockBatchBase, "weather_dir", None) diff --git a/buildstockbatch/test/test_validation.py b/buildstockbatch/test/test_validation.py index e2c849e6..531f423a 100644 --- a/buildstockbatch/test/test_validation.py +++ b/buildstockbatch/test/test_validation.py @@ -34,9 +34,7 @@ here = os.path.dirname(os.path.abspath(__file__)) example_yml_dir = os.path.join(here, "test_inputs") -resources_dir = os.path.join( - here, "test_inputs", "test_openstudio_buildstock", "resources" -) +resources_dir = os.path.join(here, "test_inputs", "test_openstudio_buildstock", "resources") def filter_logs(logs, level): @@ -69,15 +67,11 @@ def test_aws_batch_validation_is_static(): def test_complete_schema_passes_validation(): - assert BuildStockBatchBase.validate_project_schema( - os.path.join(example_yml_dir, "complete-schema.yml") - ) + assert BuildStockBatchBase.validate_project_schema(os.path.join(example_yml_dir, "complete-schema.yml")) def test_minimal_schema_passes_validation(): - assert BuildStockBatchBase.validate_project_schema( - os.path.join(example_yml_dir, "minimal-schema.yml") - ) + assert BuildStockBatchBase.validate_project_schema(os.path.join(example_yml_dir, "minimal-schema.yml")) @pytest.mark.parametrize( @@ -135,13 +129,9 @@ def test_xor_violations_fail(project_file, expected): ) def test_validation_integration(project_file, base_expected, eagle_expected): # patch the validate_options_lookup function to always return true for this case - with patch.object( - BuildStockBatchBase, "validate_options_lookup", lambda _: True - ), patch.object( + with patch.object(BuildStockBatchBase, "validate_options_lookup", lambda _: True), patch.object( BuildStockBatchBase, "validate_measure_references", lambda _: True - ), patch.object( - BuildStockBatchBase, "validate_workflow_generator", lambda _: True - ), patch.object( + ), patch.object(BuildStockBatchBase, "validate_workflow_generator", 
lambda _: True), patch.object( BuildStockBatchBase, "validate_postprocessing_spec", lambda _: True ), patch.object( EagleBatch, "validate_singularity_image_eagle", lambda _: True @@ -194,14 +184,10 @@ def test_bad_measures(project_file): except (ValidationError, YamaleError) as er: er = str(er) assert "'1.5' is not a int" in er - assert ( - "'huorly' not in ('none', 'timestep', 'hourly', 'daily', 'monthly')" - in er - ) + assert "'huorly' not in ('none', 'timestep', 'hourly', 'daily', 'monthly')" in er else: raise Exception( - "measures_and_arguments was supposed to raise ValidationError for" - " enforce-validate-measures-bad.yml" + "measures_and_arguments was supposed to raise ValidationError for" " enforce-validate-measures-bad.yml" ) @@ -209,9 +195,7 @@ def test_bad_measures(project_file): "project_file", [ os.path.join(example_yml_dir, "enforce-validate-measures-good-2.yml"), - os.path.join( - example_yml_dir, "enforce-validate-measures-good-2-with-anchors.yml" - ), + os.path.join(example_yml_dir, "enforce-validate-measures-good-2-with-anchors.yml"), ], ) def test_good_measures(project_file): @@ -276,9 +260,7 @@ def test_bad_options_validation(project_file): assert "Floor Insulation: '*' cannot be mixed with other options" in er else: - raise Exception( - "validate_options was supposed to raise ValueError for enforce-validate-options-bad.yml" - ) + raise Exception("validate_options was supposed to raise ValueError for enforce-validate-options-bad.yml") @pytest.mark.parametrize( @@ -309,8 +291,7 @@ def test_bad_measures_validation(project_file): else: raise Exception( - "validate_measure_references was supposed to raise ValueError for " - "enforce-validate-measures-bad.yml" + "validate_measure_references was supposed to raise ValueError for " "enforce-validate-measures-bad.yml" ) @@ -327,14 +308,10 @@ def test_bad_postprocessing_spec_validation(project_file): er = str(er) assert "bad_partition_column" in er else: - raise Exception( - "validate_options was supposed to raise ValidationError for enforce-validate-options-bad-2.yml" - ) + raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-bad-2.yml") -@pytest.mark.parametrize( - "project_file", [os.path.join(example_yml_dir, "enforce-validate-options-good.yml")] -) +@pytest.mark.parametrize("project_file", [os.path.join(example_yml_dir, "enforce-validate-options-good.yml")]) def test_logic_validation_fail(project_file): try: BuildStockBatchBase.validate_logic(project_file) @@ -344,9 +321,7 @@ def test_logic_validation_fail(project_file): assert "'Vintage' occurs 2 times in a 'and' block" in er assert "'Vintage' occurs 2 times in a '&&' block" in er else: - raise Exception( - "validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml" - ) + raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml") @pytest.mark.parametrize( @@ -362,9 +337,7 @@ def test_number_of_options_apply_upgrade(): proj_filename = resstock_directory / "project_national" / "national_upgrades.yml" cfg = get_project_configuration(str(proj_filename)) cfg["upgrades"][-1]["options"] = cfg["upgrades"][-1]["options"] * 10 - cfg["upgrades"][0]["options"][0]["costs"] = ( - cfg["upgrades"][0]["options"][0]["costs"] * 5 - ) + cfg["upgrades"][0]["options"][0]["costs"] = cfg["upgrades"][0]["options"][0]["costs"] * 5 with tempfile.TemporaryDirectory() as tmpdir: tmppath = pathlib.Path(tmpdir) new_proj_filename = tmppath / "project.yml" @@ -454,11 
+427,7 @@ def test_validate_sampler_good_buildstock(basic_residential_project_file): { "sampler": { "type": "precomputed", - "args": { - "sample_file": str( - os.path.join(resources_dir, "buildstock_good.csv") - ) - }, + "args": {"sample_file": str(os.path.join(resources_dir, "buildstock_good.csv"))}, } } ) @@ -470,11 +439,7 @@ def test_validate_sampler_bad_buildstock(basic_residential_project_file): { "sampler": { "type": "precomputed", - "args": { - "sample_file": str( - os.path.join(resources_dir, "buildstock_bad.csv") - ) - }, + "args": {"sample_file": str(os.path.join(resources_dir, "buildstock_bad.csv"))}, } } ) @@ -482,27 +447,10 @@ def test_validate_sampler_bad_buildstock(basic_residential_project_file): BuildStockBatchBase.validate_sampler(project_filename) except ValidationError as er: er = str(er) - assert ( - "Option 1940-1950 in column Vintage of buildstock_csv is not available in options_lookup.tsv" - in er - ) - assert ( - "Option TX in column State of buildstock_csv is not available in options_lookup.tsv" - in er - ) - assert ( - "Option nan in column Insulation Wall of buildstock_csv is not available in options_lookup.tsv" - in er - ) - assert ( - "Column Insulation in buildstock_csv is not available in options_lookup.tsv" - in er - ) - assert ( - "Column ZipPlusCode in buildstock_csv is not available in options_lookup.tsv" - in er - ) + assert "Option 1940-1950 in column Vintage of buildstock_csv is not available in options_lookup.tsv" in er + assert "Option TX in column State of buildstock_csv is not available in options_lookup.tsv" in er + assert "Option nan in column Insulation Wall of buildstock_csv is not available in options_lookup.tsv" in er + assert "Column Insulation in buildstock_csv is not available in options_lookup.tsv" in er + assert "Column ZipPlusCode in buildstock_csv is not available in options_lookup.tsv" in er else: - raise Exception( - "validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml" - ) + raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml") diff --git a/buildstockbatch/utils.py b/buildstockbatch/utils.py index 848ceb79..c74f6521 100644 --- a/buildstockbatch/utils.py +++ b/buildstockbatch/utils.py @@ -43,16 +43,12 @@ def get_project_configuration(project_file): raise err # Set absolute paths - cfg["buildstock_directory"] = path_rel_to_file( - project_file, cfg["buildstock_directory"] - ) + cfg["buildstock_directory"] = path_rel_to_file(project_file, cfg["buildstock_directory"]) # if 'precomputed_sample' in cfg.get('baseline', {}): # cfg['baseline']['precomputed_sample'] = \ # path_rel_to_file(project_file, cfg['baseline']['precomputed_sample']) if "weather_files_path" in cfg: - cfg["weather_files_path"] = path_rel_to_file( - project_file, cfg["weather_files_path"] - ) + cfg["weather_files_path"] = path_rel_to_file(project_file, cfg["weather_files_path"]) return cfg @@ -66,35 +62,20 @@ def _str_repr(obj, list_max=20, dict_max=20, string_max=100): elif type(obj) in [int, float]: return _str_repr(str(obj), list_max, dict_max, string_max) elif type(obj) is list: - txt = "[" + ",".join( - [ - _str_repr(item, list_max, dict_max, string_max) - for item in obj[0:list_max] - ] - ) + txt = "[" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:list_max]]) if len(obj) > list_max: txt += f" ...{len(obj)}" txt += "]" return txt elif type(obj) is tuple: - txt = "(" + ",".join( - [ - _str_repr(item, list_max, dict_max, string_max) - for 
item in obj[0:list_max] - ] - ) + txt = "(" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:list_max]]) if len(obj) > list_max: txt += f" ...{len(obj)}" txt += ")" return txt elif type(obj) is set: obj = list(obj) - txt = "{" + ",".join( - [ - _str_repr(item, list_max, dict_max, string_max) - for item in obj[0:dict_max] - ] - ) + txt = "{" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:dict_max]]) if len(obj) > dict_max: txt += f" ...{len(obj)}" txt += "}" diff --git a/buildstockbatch/workflow_generator/commercial.py b/buildstockbatch/workflow_generator/commercial.py index c2676c8d..f453cfca 100644 --- a/buildstockbatch/workflow_generator/commercial.py +++ b/buildstockbatch/workflow_generator/commercial.py @@ -50,9 +50,7 @@ def validate(cls, cfg): workflow_generator_args = cfg["workflow_generator"]["args"] schema_yml = re.sub(r"^ {8}", "", schema_yml, flags=re.MULTILINE) schema = yamale.make_schema(content=schema_yml, parser="ruamel") - data = yamale.make_data( - content=json.dumps(workflow_generator_args), parser="ruamel" - ) + data = yamale.make_data(content=json.dumps(workflow_generator_args), parser="ruamel") return yamale.validate(schema, data, strict=True) def reporting_measures(self): @@ -117,31 +115,25 @@ def create_osw(self, sim_id, building_id, upgrade_idx): "arguments": {"run_measure": 1}, } if "upgrade_name" in measure_d: - apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d[ - "upgrade_name" - ] + apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d["upgrade_name"] for opt_num, option in enumerate(measure_d["options"], 1): - apply_upgrade_measure["arguments"]["option_{}".format(opt_num)] = ( - option["option"] - ) + apply_upgrade_measure["arguments"]["option_{}".format(opt_num)] = option["option"] if "lifetime" in option: - apply_upgrade_measure["arguments"][ - "option_{}_lifetime".format(opt_num) - ] = option["lifetime"] + apply_upgrade_measure["arguments"]["option_{}_lifetime".format(opt_num)] = option["lifetime"] if "apply_logic" in option: - apply_upgrade_measure["arguments"][ - "option_{}_apply_logic".format(opt_num) - ] = self.make_apply_logic_arg(option["apply_logic"]) + apply_upgrade_measure["arguments"]["option_{}_apply_logic".format(opt_num)] = ( + self.make_apply_logic_arg(option["apply_logic"]) + ) for cost_num, cost in enumerate(option.get("costs", []), 1): for arg in ("value", "multiplier"): if arg not in cost: continue - apply_upgrade_measure["arguments"][ - "option_{}_cost_{}_{}".format(opt_num, cost_num, arg) - ] = cost[arg] + apply_upgrade_measure["arguments"]["option_{}_cost_{}_{}".format(opt_num, cost_num, arg)] = ( + cost[arg] + ) if "package_apply_logic" in measure_d: - apply_upgrade_measure["arguments"]["package_apply_logic"] = ( - self.make_apply_logic_arg(measure_d["package_apply_logic"]) + apply_upgrade_measure["arguments"]["package_apply_logic"] = self.make_apply_logic_arg( + measure_d["package_apply_logic"] ) build_existing_model_idx = list( diff --git a/buildstockbatch/workflow_generator/residential_hpxml.py b/buildstockbatch/workflow_generator/residential_hpxml.py index 4456be5b..c48d98f8 100644 --- a/buildstockbatch/workflow_generator/residential_hpxml.py +++ b/buildstockbatch/workflow_generator/residential_hpxml.py @@ -146,18 +146,14 @@ def validate(cls, cfg): workflow_generator_args = cfg["workflow_generator"]["args"] schema_yml = re.sub(r"^ {8}", "", schema_yml, flags=re.MULTILINE) schema = yamale.make_schema(content=schema_yml, parser="ruamel") - data = 
yamale.make_data( - content=json.dumps(workflow_generator_args), parser="ruamel" - ) + data = yamale.make_data(content=json.dumps(workflow_generator_args), parser="ruamel") yamale.validate(schema, data, strict=True) return cls.validate_measures_and_arguments(cfg) def reporting_measures(self): """Return a list of reporting measures to include in outputs""" workflow_args = self.cfg["workflow_generator"].get("args", {}) - return [ - x["measure_dir_name"] for x in workflow_args.get("reporting_measures", []) - ] + return [x["measure_dir_name"] for x in workflow_args.get("reporting_measures", [])] @staticmethod def validate_measures_and_arguments(cfg): @@ -197,9 +193,7 @@ def get_cfg_path(cfg_path): workflow_args = cfg["workflow_generator"].get("args", {}) if "reporting_measures" in workflow_args.keys(): for reporting_measure in workflow_args["reporting_measures"]: - measure_names[reporting_measure["measure_dir_name"]] = ( - "workflow_generator.args.reporting_measures" - ) + measure_names[reporting_measure["measure_dir_name"]] = "workflow_generator.args.reporting_measures" error_msgs = "" warning_msgs = "" @@ -232,9 +226,7 @@ def get_cfg_path(cfg_path): error_msgs += "* The following multipliers values are invalid: \n" for multiplier, count in invalid_multipliers.items(): error_msgs += f" '{multiplier}' - Used {count} times \n" - error_msgs += ( - f" The list of valid multipliers are {valid_multipliers}.\n" - ) + error_msgs += f" The list of valid multipliers are {valid_multipliers}.\n" if warning_msgs: logger.warning(warning_msgs) @@ -276,8 +268,7 @@ def create_osw(self, sim_id, building_id, upgrade_idx): bld_exist_model_args = { "building_id": building_id, - "sample_weight": self.cfg["baseline"]["n_buildings_represented"] - / self.n_datapoints, + "sample_weight": self.cfg["baseline"]["n_buildings_represented"] / self.n_datapoints, } bld_exist_model_args.update(sim_ctl_args) @@ -300,16 +291,12 @@ def create_osw(self, sim_id, building_id, upgrade_idx): ["emissions_wood_values", "wood_value"], ] for arg, item in emissions_map: - bld_exist_model_args[arg] = ",".join( - [str(s.get(item, "")) for s in emissions] - ) + bld_exist_model_args[arg] = ",".join([str(s.get(item, "")) for s in emissions]) buildstock_dir = self.cfg["buildstock_directory"] measures_dir = os.path.join(buildstock_dir, "measures") measure_path = os.path.join(measures_dir, "BuildExistingModel") - bld_exist_model_args_avail = get_measure_arguments( - os.path.join(measure_path, "measure.xml") - ) + bld_exist_model_args_avail = get_measure_arguments(os.path.join(measure_path, "measure.xml")) if "utility_bills" in workflow_args: utility_bills = workflow_args["utility_bills"] @@ -348,9 +335,7 @@ def create_osw(self, sim_id, building_id, upgrade_idx): ] for arg, item in utility_bills_map: if arg in bld_exist_model_args_avail: - bld_exist_model_args[arg] = ",".join( - [str(s.get(item, "")) for s in utility_bills] - ) + bld_exist_model_args[arg] = ",".join([str(s.get(item, "")) for s in utility_bills]) sim_out_rep_args = { "timeseries_frequency": "none", @@ -373,9 +358,7 @@ def create_osw(self, sim_id, building_id, upgrade_idx): measures_dir = os.path.join(buildstock_dir, "resources/hpxml-measures") measure_path = os.path.join(measures_dir, "ReportSimulationOutput") - sim_out_rep_args_avail = get_measure_arguments( - os.path.join(measure_path, "measure.xml") - ) + sim_out_rep_args_avail = get_measure_arguments(os.path.join(measure_path, "measure.xml")) if "include_annual_total_consumptions" in sim_out_rep_args_avail: 
sim_out_rep_args["include_annual_total_consumptions"] = True @@ -438,18 +421,14 @@ def create_osw(self, sim_id, building_id, upgrade_idx): if "output_variables" in sim_out_rep_args: output_variables = sim_out_rep_args["output_variables"] - sim_out_rep_args["user_output_variables"] = ",".join( - [str(s.get("name")) for s in output_variables] - ) + sim_out_rep_args["user_output_variables"] = ",".join([str(s.get("name")) for s in output_variables]) sim_out_rep_args.pop("output_variables") util_bills_rep_args = {} measures_dir = os.path.join(buildstock_dir, "resources/hpxml-measures") measure_path = os.path.join(measures_dir, "ReportUtilityBills") - util_bills_rep_args_avail = get_measure_arguments( - os.path.join(measure_path, "measure.xml") - ) + util_bills_rep_args_avail = get_measure_arguments(os.path.join(measure_path, "measure.xml")) if "include_annual_bills" in util_bills_rep_args_avail: util_bills_rep_args["include_annual_bills"] = True @@ -540,36 +519,28 @@ def create_osw(self, sim_id, building_id, upgrade_idx): "arguments": {"run_measure": 1}, } if "upgrade_name" in measure_d: - apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d[ - "upgrade_name" - ] + apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d["upgrade_name"] for opt_num, option in enumerate(measure_d["options"], 1): - apply_upgrade_measure["arguments"]["option_{}".format(opt_num)] = ( - option["option"] - ) + apply_upgrade_measure["arguments"]["option_{}".format(opt_num)] = option["option"] if "lifetime" in option: - apply_upgrade_measure["arguments"][ - "option_{}_lifetime".format(opt_num) - ] = option["lifetime"] + apply_upgrade_measure["arguments"]["option_{}_lifetime".format(opt_num)] = option["lifetime"] if "apply_logic" in option: - apply_upgrade_measure["arguments"][ - "option_{}_apply_logic".format(opt_num) - ] = self.make_apply_logic_arg(option["apply_logic"]) + apply_upgrade_measure["arguments"]["option_{}_apply_logic".format(opt_num)] = ( + self.make_apply_logic_arg(option["apply_logic"]) + ) for cost_num, cost in enumerate(option.get("costs", []), 1): for arg in ("value", "multiplier"): if arg not in cost: continue - apply_upgrade_measure["arguments"][ - "option_{}_cost_{}_{}".format(opt_num, cost_num, arg) - ] = cost[arg] + apply_upgrade_measure["arguments"]["option_{}_cost_{}_{}".format(opt_num, cost_num, arg)] = ( + cost[arg] + ) if "package_apply_logic" in measure_d: - apply_upgrade_measure["arguments"]["package_apply_logic"] = ( - self.make_apply_logic_arg(measure_d["package_apply_logic"]) + apply_upgrade_measure["arguments"]["package_apply_logic"] = self.make_apply_logic_arg( + measure_d["package_apply_logic"] ) - build_existing_model_idx = [ - x["measure_dir_name"] == "BuildExistingModel" for x in osw["steps"] - ].index(True) + build_existing_model_idx = [x["measure_dir_name"] == "BuildExistingModel" for x in osw["steps"]].index(True) osw["steps"].insert(build_existing_model_idx + 1, apply_upgrade_measure) if "reporting_measures" in workflow_args: @@ -577,8 +548,6 @@ def create_osw(self, sim_id, building_id, upgrade_idx): if "arguments" not in reporting_measure: reporting_measure["arguments"] = {} reporting_measure["measure_type"] = "ReportingMeasure" - osw["steps"].insert( - -1, reporting_measure - ) # right before ServerDirectoryCleanup + osw["steps"].insert(-1, reporting_measure) # right before ServerDirectoryCleanup return osw diff --git a/buildstockbatch/workflow_generator/test_workflow_generator.py b/buildstockbatch/workflow_generator/test_workflow_generator.py index 
113c7965..2205e370 100644 --- a/buildstockbatch/workflow_generator/test_workflow_generator.py +++ b/buildstockbatch/workflow_generator/test_workflow_generator.py @@ -13,14 +13,10 @@ def test_apply_logic_recursion(): apply_logic = WorkflowGeneratorBase.make_apply_logic_arg(["one", "two", "three"]) assert apply_logic == "(one&&two&&three)" - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg( - {"and": ["one", "two", "three"]} - ) + apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({"and": ["one", "two", "three"]}) assert apply_logic == "(one&&two&&three)" - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg( - {"or": ["four", "five", "six"]} - ) + apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({"or": ["four", "five", "six"]}) assert apply_logic == "(four||five||six)" apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({"not": "seven"}) @@ -77,36 +73,11 @@ def test_residential_hpxml(mocker): build_existing_model_step = steps[0] assert build_existing_model_step["measure_dir_name"] == "BuildExistingModel" - assert ( - build_existing_model_step["arguments"][ - "simulation_control_run_period_begin_month" - ] - == 2 - ) - assert ( - build_existing_model_step["arguments"][ - "simulation_control_run_period_begin_day_of_month" - ] - == 1 - ) - assert ( - build_existing_model_step["arguments"][ - "simulation_control_run_period_end_month" - ] - == 2 - ) - assert ( - build_existing_model_step["arguments"][ - "simulation_control_run_period_end_day_of_month" - ] - == 28 - ) - assert ( - build_existing_model_step["arguments"][ - "simulation_control_run_period_calendar_year" - ] - == 2010 - ) + assert build_existing_model_step["arguments"]["simulation_control_run_period_begin_month"] == 2 + assert build_existing_model_step["arguments"]["simulation_control_run_period_begin_day_of_month"] == 1 + assert build_existing_model_step["arguments"]["simulation_control_run_period_end_month"] == 2 + assert build_existing_model_step["arguments"]["simulation_control_run_period_end_day_of_month"] == 28 + assert build_existing_model_step["arguments"]["simulation_control_run_period_calendar_year"] == 2010 apply_upgrade_step = steps[1] assert apply_upgrade_step["measure_dir_name"] == "ApplyUpgrade" @@ -117,25 +88,13 @@ def test_residential_hpxml(mocker): simulation_output_step = steps[3] assert simulation_output_step["measure_dir_name"] == "ReportSimulationOutput" assert simulation_output_step["arguments"]["timeseries_frequency"] == "hourly" - assert ( - simulation_output_step["arguments"]["include_annual_total_consumptions"] is True - ) - assert ( - simulation_output_step["arguments"]["include_annual_fuel_consumptions"] is True - ) - assert ( - simulation_output_step["arguments"]["include_annual_end_use_consumptions"] - is True - ) - assert ( - simulation_output_step["arguments"]["include_annual_system_use_consumptions"] - is False - ) + assert simulation_output_step["arguments"]["include_annual_total_consumptions"] is True + assert simulation_output_step["arguments"]["include_annual_fuel_consumptions"] is True + assert simulation_output_step["arguments"]["include_annual_end_use_consumptions"] is True + assert simulation_output_step["arguments"]["include_annual_system_use_consumptions"] is False assert simulation_output_step["arguments"]["include_annual_emissions"] is True assert simulation_output_step["arguments"]["include_annual_emission_fuels"] is True - assert ( - simulation_output_step["arguments"]["include_annual_emission_end_uses"] is True - ) + assert 
simulation_output_step["arguments"]["include_annual_emission_end_uses"] is True assert simulation_output_step["arguments"]["include_annual_total_loads"] is True assert simulation_output_step["arguments"]["include_annual_unmet_hours"] is True assert simulation_output_step["arguments"]["include_annual_peak_fuels"] is True @@ -144,55 +103,22 @@ def test_residential_hpxml(mocker): assert simulation_output_step["arguments"]["include_annual_hot_water_uses"] is True assert simulation_output_step["arguments"]["include_annual_hvac_summary"] is True assert simulation_output_step["arguments"]["include_annual_resilience"] is True - assert ( - simulation_output_step["arguments"]["include_timeseries_total_consumptions"] - is True - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_fuel_consumptions"] - is False - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_end_use_consumptions"] - is True - ) - assert ( - simulation_output_step["arguments"][ - "include_timeseries_system_use_consumptions" - ] - is False - ) + assert simulation_output_step["arguments"]["include_timeseries_total_consumptions"] is True + assert simulation_output_step["arguments"]["include_timeseries_fuel_consumptions"] is False + assert simulation_output_step["arguments"]["include_timeseries_end_use_consumptions"] is True + assert simulation_output_step["arguments"]["include_timeseries_system_use_consumptions"] is False assert simulation_output_step["arguments"]["include_timeseries_emissions"] is False - assert ( - simulation_output_step["arguments"]["include_timeseries_emission_fuels"] - is False - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_emission_end_uses"] - is False - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_hot_water_uses"] - is False - ) + assert simulation_output_step["arguments"]["include_timeseries_emission_fuels"] is False + assert simulation_output_step["arguments"]["include_timeseries_emission_end_uses"] is False + assert simulation_output_step["arguments"]["include_timeseries_hot_water_uses"] is False assert simulation_output_step["arguments"]["include_timeseries_total_loads"] is True - assert ( - simulation_output_step["arguments"]["include_timeseries_component_loads"] - is False - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_unmet_hours"] is False - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_zone_temperatures"] - is False - ) + assert simulation_output_step["arguments"]["include_timeseries_component_loads"] is False + assert simulation_output_step["arguments"]["include_timeseries_unmet_hours"] is False + assert simulation_output_step["arguments"]["include_timeseries_zone_temperatures"] is False assert simulation_output_step["arguments"]["include_timeseries_airflows"] is False assert simulation_output_step["arguments"]["include_timeseries_weather"] is False assert simulation_output_step["arguments"]["include_timeseries_resilience"] is False - assert ( - simulation_output_step["arguments"]["timeseries_timestamp_convention"] == "end" - ) + assert simulation_output_step["arguments"]["timeseries_timestamp_convention"] == "end" assert simulation_output_step["arguments"]["timeseries_num_decimal_places"] == 3 assert simulation_output_step["arguments"]["add_timeseries_dst_column"] is True assert simulation_output_step["arguments"]["add_timeseries_utc_column"] is True @@ -334,9 +260,7 @@ def test_com_default_workflow_generator_extended(mocker): assert 
reporting_measure_step["measure_type"] == "ReportingMeasure" assert reporting_measure_step["arguments"] == {} # Should only be one instance of SimulationOutputReport - assert [ - d["measure_dir_name"] == "SimulationOutputReport" for d in osw["steps"] - ].count(True) == 1 + assert [d["measure_dir_name"] == "SimulationOutputReport" for d in osw["steps"]].count(True) == 1 # Should get TimeseriesCSVExport if included in args reporting_measure_step = osw["steps"][1] assert reporting_measure_step["measure_dir_name"] == "TimeseriesCSVExport" @@ -345,10 +269,7 @@ def test_com_default_workflow_generator_extended(mocker): assert reporting_measure_step["arguments"]["inc_output_variables"] == "true" # Should have the openstudio report reporting_measure_step = osw["steps"][2] - assert ( - reporting_measure_step["measure_dir_name"] - == "f8e23017-894d-4bdf-977f-37e3961e6f42" - ) + assert reporting_measure_step["measure_dir_name"] == "f8e23017-894d-4bdf-977f-37e3961e6f42" assert reporting_measure_step["measure_type"] == "ReportingMeasure" assert reporting_measure_step["arguments"]["building_summary_section"] == "true" assert reporting_measure_step["arguments"]["schedules_overview_section"] == "true" diff --git a/docs/conf.py b/docs/conf.py index 45c44c52..94ca7931 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,9 +20,7 @@ here = os.path.abspath(os.path.dirname(__file__)) metadata = {} -with open( - os.path.join(here, "..", "buildstockbatch", "__version__.py"), "r", encoding="utf-8" -) as f: +with open(os.path.join(here, "..", "buildstockbatch", "__version__.py"), "r", encoding="utf-8") as f: exec(f.read(), metadata) # -- Project information ----------------------------------------------------- @@ -75,9 +73,7 @@ # how to render changelog links changelog_render_ticket = "http://www.github.com/nrel/buildstockbatch/issues/%s" -changelog_render_pullreq = { - "default": "https://www.github.com/nrel/buildstockbatch/pull/%s" -} +changelog_render_pullreq = {"default": "https://www.github.com/nrel/buildstockbatch/pull/%s"} # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -180,9 +176,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
-man_pages = [ - (master_doc, "buildstockbatch", "BuildStock Batch Documentation", [author], 1) -] +man_pages = [(master_doc, "buildstockbatch", "BuildStock Batch Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- diff --git a/setup.py b/setup.py index 5e45e2f3..ce2e4d63 100644 --- a/setup.py +++ b/setup.py @@ -8,9 +8,7 @@ here = os.path.abspath(os.path.dirname(__file__)) metadata = {} -with open( - os.path.join(here, "buildstockbatch", "__version__.py"), "r", encoding="utf-8" -) as f: +with open(os.path.join(here, "buildstockbatch", "__version__.py"), "r", encoding="utf-8") as f: exec(f.read(), metadata) with open("README.md", "r", "utf-8") as f: From cec7e02b93e48494bd67cb26b658a71589a85c82 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Tue, 6 Feb 2024 17:12:29 +0000 Subject: [PATCH 35/53] running postprocessing from within a docker container to fix version mismatch issues --- buildstockbatch/aws/aws.py | 74 +++++++++++++++++++++++----- buildstockbatch/cloud/docker_base.py | 5 +- buildstockbatch/hpc.py | 5 +- buildstockbatch/postprocessing.py | 2 +- buildstockbatch/utils.py | 4 ++ 5 files changed, 73 insertions(+), 17 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index e261460d..ea0a5e2a 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -13,7 +13,7 @@ import base64 import boto3 from botocore.exceptions import ClientError -import collections +from copy import deepcopy import csv from dask.distributed import Client from dask_cloudprovider.aws import FargateCluster @@ -29,18 +29,21 @@ from s3fs import S3FileSystem import tarfile import re +import tempfile import time import tqdm import io +import yaml from buildstockbatch.base import ValidationError, BuildStockBatchBase from buildstockbatch.aws.awsbase import AwsJobBase, boto_client_config +from buildstockbatch.cloud.docker_base import DockerBatchBase from buildstockbatch import postprocessing from buildstockbatch.utils import ( ContainerRuntime, log_error_details, get_project_configuration, - read_csv, + get_bool_env_var, ) logger = logging.getLogger(__name__) @@ -278,7 +281,6 @@ def create_vpc(self): # Create and elastic IP for the NAT Gateway try: - ip_response = self.ec2.allocate_address(Domain="vpc") self.nat_ip_allocation = ip_response["AllocationId"] @@ -743,7 +745,6 @@ def submit_job(self, array_size=4): return resp except Exception as e: - if "not in VALID state" in str(e): # Need to wait a second for the compute environment to complete registration logger.warning("5 second sleep initiated to wait for job queue creation due to error: " + str(e)) @@ -789,7 +790,6 @@ def clean(self): response = dsg.revoke_egress(IpPermissions=dsg.ip_permissions_egress) try: - self.batch.update_job_queue(jobQueue=self.batch_job_queue_name, state="DISABLED") while True: @@ -1180,9 +1180,6 @@ def clean(self): batch_env = AwsBatchEnv(self.job_identifier, self.cfg["aws"], self.boto3_session) batch_env.clean() - sns_env = AwsSNS(self.job_identifier, self.cfg["aws"], self.boto3_session) - sns_env.clean() - def upload_batch_files_to_cloud(self, tmppath): """Implements :func:`DockerBase.upload_batch_files_to_cloud`""" logger.debug("Uploading Batch files to S3") @@ -1245,11 +1242,11 @@ def start_batch_job(self, batch_info): ) # start job - job_info = batch_env.submit_job(array_size=array_size) + job_info = batch_env.submit_job(array_size=self.batch_array_size) # Monitor job status n_succeeded_last_time = 0 - with tqdm.tqdm(desc="Running 
Simulations", total=array_size) as progress_bar: + with tqdm.tqdm(desc="Running Simulations", total=self.batch_array_size) as progress_bar: job_status = None while job_status not in ("SUCCEEDED", "FAILED"): time.sleep(10) @@ -1352,7 +1349,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): logger.debug("Extracting {}".format(epw_filename)) f_out.write(gzip.decompress(f_gz.getvalue())) - cls.run_simulations(cfg, jobs_d, job_id, sim_dir, S3FileSystem(), f"{bucket}/{prefix}") + cls.run_simulations(cfg, job_id, jobs_d, sim_dir, S3FileSystem(), f"{bucket}/{prefix}") def get_fs(self): return S3FileSystem() @@ -1386,6 +1383,55 @@ def upload_results(self, *args, **kwargs): """Do nothing because the results are already on S3""" return self.s3_bucket, self.s3_bucket_prefix + "/results/parquet" + def process_results(self, *args, **kwargs): + with tempfile.TemporaryDirectory() as tmpdir: + tmppath = pathlib.Path(tmpdir) + container_workpath = pathlib.PurePosixPath("/var/simdata/openstudio") + + cfg = deepcopy(self.cfg) + container_buildstock_dir = str(container_workpath / "buildstock") + cfg["buildstock_directory"] = container_buildstock_dir + cfg["project_directory"] = str(pathlib.Path(self.project_dir).relative_to(self.buildstock_dir)) + + with open(tmppath / "project_config.yml", "w") as f: + f.write(yaml.dump(cfg, Dumper=yaml.SafeDumper)) + container_cfg_path = str(container_workpath / "project_config.yml") + + with open(tmppath / "args.json", "w") as f: + json.dump([args, kwargs], f) + + credentials = boto3.Session().get_credentials().get_frozen_credentials() + env = { + "AWS_ACCESS_KEY_ID": credentials.access_key, + "AWS_SECRET_ACCESS_KEY": credentials.secret_key, + } + if credentials.token: + env["AWS_SESSION_TOKEN"] = credentials.token + env["POSTPROCESSING_INSIDE_DOCKER_CONTAINER"] = "true" + + logger.info("Starting container for postprocessing") + container = self.docker_client.containers.run( + self.image_url, + ["python3", "-m", "buildstockbatch.aws.aws", container_cfg_path], + volumes={ + tmpdir: {"bind": str(container_workpath), "mode": "rw"}, + self.buildstock_dir: {"bind": container_buildstock_dir, "mode": "ro"}, + }, + environment=env, + name="bsb_post", + auto_remove=True, + detach=True, + ) + for msg in container.logs(stream=True): + logger.debug(msg) + + def _process_results_inside_container(self): + with open("/var/simdata/openstudio/args.json", "r") as f: + args, kwargs = json.load(f) + + logger.info("Running postprocessing in container") + super().process_results(*args, **kwargs) + @log_error_details() def main(): @@ -1429,6 +1475,12 @@ def main(): job_name = os.environ["JOB_NAME"] region = os.environ["REGION"] AwsBatch.run_job(job_id, s3_bucket, s3_prefix, job_name, region) + elif get_bool_env_var("POSTPROCESSING_INSIDE_DOCKER_CONTAINER"): + parser = argparse.ArgumentParser() + parser.add_argument("project_filename") + args = parser.parse_args() + batch = AwsBatch(args.project_filename) + batch._process_results_inside_container() else: parser = argparse.ArgumentParser() parser.add_argument("project_filename") diff --git a/buildstockbatch/cloud/docker_base.py b/buildstockbatch/cloud/docker_base.py index 2386c838..3176a23a 100644 --- a/buildstockbatch/cloud/docker_base.py +++ b/buildstockbatch/cloud/docker_base.py @@ -30,7 +30,7 @@ from buildstockbatch import postprocessing from buildstockbatch.base import BuildStockBatchBase -from buildstockbatch.utils import ContainerRuntime, calc_hash_for_file, compress_file, read_csv +from buildstockbatch.utils import 
ContainerRuntime, calc_hash_for_file, compress_file, read_csv, get_bool_env_var logger = logging.getLogger(__name__) @@ -57,6 +57,9 @@ class BatchInfo: def __init__(self, project_filename): super().__init__(project_filename) + if get_bool_env_var("POSTPROCESSING_INSIDE_DOCKER_CONTAINER"): + return + self.docker_client = docker.DockerClient.from_env() try: self.docker_client.ping() diff --git a/buildstockbatch/hpc.py b/buildstockbatch/hpc.py index dab768dd..ade93702 100644 --- a/buildstockbatch/hpc.py +++ b/buildstockbatch/hpc.py @@ -41,6 +41,7 @@ path_rel_to_file, get_project_configuration, read_csv, + get_bool_env_var, ) from buildstockbatch import postprocessing from buildstockbatch.__version__ import __version__ as bsb_version @@ -49,10 +50,6 @@ logger = logging.getLogger(__name__) -def get_bool_env_var(varname): - return os.environ.get(varname, "0").lower() in ("true", "t", "1", "y", "yes") - - class SlurmBatch(BuildStockBatchBase): DEFAULT_SYS_IMAGE_DIR = None HPC_NAME = None diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py index 18c60e11..0937185c 100644 --- a/buildstockbatch/postprocessing.py +++ b/buildstockbatch/postprocessing.py @@ -226,7 +226,7 @@ def read_results_json(fs, filename, all_cols=None): for missing_col in set(all_cols).difference(df.columns.values): df[missing_col] = None # Sorting is needed to ensure all dfs have same column order. Dask will fail otherwise. - df = df.reindex(sorted(df.columns), axis=1).convert_dtypes(dtype_backend='pyarrow') + df = df.reindex(sorted(df.columns), axis=1).convert_dtypes(dtype_backend="pyarrow") return df diff --git a/buildstockbatch/utils.py b/buildstockbatch/utils.py index ea4b503c..e9453b38 100644 --- a/buildstockbatch/utils.py +++ b/buildstockbatch/utils.py @@ -139,3 +139,7 @@ def compress_file(in_filename, out_filename): def calc_hash_for_file(filename): with open(filename, "rb") as f: return hashlib.sha256(f.read()).hexdigest() + + +def get_bool_env_var(varname): + return os.environ.get(varname, "0").lower() in ("true", "t", "1", "y", "yes") From 38f3235ced91b5096fcf0b9f053cc758f0d53b3d Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Tue, 6 Feb 2024 10:22:36 -0700 Subject: [PATCH 36/53] updating ci to use black for linting --- .github/workflows/ci.yml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d49d2a0f..59159e8a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,13 +43,10 @@ jobs: cd buildstockbatch python -m pip install --progress-bar off --upgrade pip pip install .[dev,aws] --progress-bar off - - name: Linting - run: | - cd buildstockbatch - # stop the build if there are Python syntax errors or undefined names - flake8 buildstockbatch --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. 
-          flake8 buildstockbatch --count --statistics --exit-zero
+      - name: Black
+        uses: psf/black@stable
+        with:
+          src: "./buildstockbatch"
       - name: Run PyTest and Coverage
         run: |
           cd buildstockbatch
@@ -84,5 +81,3 @@ jobs:
         with:
           name: documentation
           path: buildstockbatch/docs/_build/html/
-      - uses: pre-commit-ci/lite-action@v1.0.1
-        if: always()

From 96dfd3ef47c074405da94e2e8820d35101aca0aa Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Tue, 6 Feb 2024 12:05:52 -0700
Subject: [PATCH 37/53] running pre-commit

---
 .devcontainer/devcontainer.json               |  2 +-
 .flake8                                       |  2 +-
 .readthedocs.yml                              |  2 +-
 LICENSE.md                                    |  2 +-
 buildstockbatch/aws/awsbase.py                |  2 -
 buildstockbatch/schemas/v0.1.yaml             |  2 +-
 .../enforce-validate-options-good.yml         |  2 +-
 .../resources/Gemfile                         |  2 +-
 .../resources/options_lookup.tsv              | 42 +++++++++----------
 .../simulations_job0/up00/bldg0000001/in.osw  |  2 +-
 .../up00/bldg0000001/run/data_point_out.json  |  2 +-
 .../simulations_job0/up00/bldg0000002/in.osw  |  2 +-
 .../up00/bldg0000002/run/failed.job           |  2 +-
 .../simulations_job0/up00/bldg0000003/in.osw  |  2 +-
 .../up00/bldg0000003/run/data_point_out.json  |  2 +-
 .../simulations_job0/up00/bldg0000004/in.osw  |  2 +-
 .../up00/bldg0000004/run/data_point_out.json  |  2 +-
 .../simulations_job0/up01/bldg0000001/in.osw  |  2 +-
 .../up01/bldg0000001/run/data_point_out.json  |  2 +-
 .../simulations_job0/up01/bldg0000002/in.osw  |  2 +-
 .../up01/bldg0000002/run/data_point_out.json  |  2 +-
 .../simulations_job0/up01/bldg0000003/in.osw  |  2 +-
 .../up01/bldg0000003/run/data_point_out.json  |  2 +-
 .../simulations_job0/up01/bldg0000004/in.osw  |  2 +-
 .../up01/bldg0000004/run/data_point_out.json  |  2 +-
 docs/Makefile                                 |  2 +-
 docs/changelog/changelog_0_18.rst             |  2 +-
 docs/changelog/changelog_0_19.rst             |  6 +--
 docs/changelog/changelog_0_20.rst             |  2 +-
 docs/changelog/changelog_0_21.rst             |  6 +--
 docs/changelog/changelog_2023_01_0.rst        |  2 +-
 docs/changelog/changelog_2023_10_0.rst        |  2 +-
 docs/changelog/changelog_dev.rst              |  2 +-
 docs/changelog/migration_0_19.rst             |  2 +-
 docs/changelog/migration_0_20.rst             |  2 +-
 docs/changelog/migration_0_21.rst             |  2 +-
 docs/changelog/migration_2022_10_0.rst        |  2 +-
 docs/changelog/migration_2022_12_0.rst        |  2 +-
 docs/changelog/migration_2023_01_0.rst        |  2 +-
 docs/changelog/migration_2023_05_0.rst        |  2 +-
 docs/samplers/precomputed.rst                 |  4 +-
 docs/samplers/residential_quota.rst           |  2 +-
 .../samplers/residential_quota_downselect.rst |  4 +-
 docs/workflow_generators/index.rst            |  2 +-
 .../workflow_generators/residential_hpxml.rst |  2 +-
 45 files changed, 70 insertions(+), 72 deletions(-)

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index ce9313b7..2f5c6ae4 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -10,7 +10,7 @@
 	"dockerFile": "../Dockerfile",
 
 	// Set *default* container specific settings.json values on container create.
-	"settings": { 
+	"settings": {
 		"terminal.integrated.shell.linux": null
 	},
 
diff --git a/.flake8 b/.flake8
index 79a16af7..6deafc26 100644
--- a/.flake8
+++ b/.flake8
@@ -1,2 +1,2 @@
 [flake8]
-max-line-length = 120
\ No newline at end of file
+max-line-length = 120
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 2e6437be..58e565ff 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -13,4 +13,4 @@ python:
     - method: pip
       path: .
      extra_requirements:
-        - dev
\ No newline at end of file
+        - dev
diff --git a/LICENSE.md b/LICENSE.md
index ac905635..70d64662 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -8,4 +8,4 @@ Redistribution and use in source and binary forms, with or without modification,
 4. Redistribution of this software, without modification, must refer to the software by the same designation. Redistribution of a modified version of this software (i) may not refer to the modified version by the same designation, or by any confusingly similar designation, and (ii) must refer to the underlying software originally provided by Alliance as “ResStock”. Except to comply with the foregoing, the term “ResStock”, or any confusingly similar designation may not be used to refer to any modified version of this software or any modified version of the underlying software originally provided by Alliance without the prior written consent of Alliance.
 5. The name of the copyright holder(s), any contributors, the United States Government, the United States Department of Energy, or any of their employees may not be used to endorse or promote products derived from this software without specific prior written permission from the respective party.
 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND ANY CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S), ANY CONTRIBUTORS, THE UNITED STATES GOVERNMENT, OR THE UNITED STATES DEPARTMENT OF ENERGY, NOR ANY OF THEIR EMPLOYEES, BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND ANY CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S), ANY CONTRIBUTORS, THE UNITED STATES GOVERNMENT, OR THE UNITED STATES DEPARTMENT OF ENERGY, NOR ANY OF THEIR EMPLOYEES, BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
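Nearly every hunk in this patch is whitespace-only: trailing spaces disappear and files gain the final newline they were missing, which is why so many hunks differ only by a "\ No newline at end of file" marker. The sketch below is illustrative only; it assumes the standard hooks from the pre-commit-hooks project (trailing-whitespace and end-of-file-fixer) are what produced these changes, since the repository's actual .pre-commit-config.yaml does not appear in this patch series.

    # Illustrative only, not part of the patch: mimics the combined effect of
    # pre-commit's trailing-whitespace and end-of-file-fixer hooks.
    import sys
    from pathlib import Path


    def normalize_whitespace(path: Path) -> bool:
        """Strip trailing whitespace and leave exactly one newline at EOF."""
        original = path.read_text(encoding="utf-8")
        if not original:
            return False  # leave truly empty files alone
        lines = [line.rstrip() for line in original.splitlines()]
        while lines and lines[-1] == "":
            lines.pop()  # drop extra blank lines at the end of the file
        fixed = "\n".join(lines) + "\n"
        if fixed != original:
            path.write_text(fixed, encoding="utf-8")
            return True
        return False


    if __name__ == "__main__":
        changed = [p for p in map(Path, sys.argv[1:]) if normalize_whitespace(p)]
        print(f"fixed {len(changed)} file(s)")

Running "pre-commit run --all-files" applies the real hooks across the whole tree and produces hunks of exactly the shape seen in this patch.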
diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index bcb9a3bf..7ecbf097 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -67,7 +67,6 @@ def role_stitcher( p_counter = p_counter + 1 for managed_policy_arn in managed_policie_arns: - response = self.iam.attach_role_policy(PolicyArn=managed_policy_arn, RoleName=role_name) logger.info(f"Role {role_name} created") @@ -138,7 +137,6 @@ def remove_role_from_instance_profile(self, instance_profile_name): class AwsJobBase: - logger.propagate = False def __init__(self, job_identifier, aws_config, boto3_session): diff --git a/buildstockbatch/schemas/v0.1.yaml b/buildstockbatch/schemas/v0.1.yaml index 2b72910c..df66609e 100644 --- a/buildstockbatch/schemas/v0.1.yaml +++ b/buildstockbatch/schemas/v0.1.yaml @@ -124,4 +124,4 @@ s3-aws-postprocessing-spec: athena-aws-postprocessing-spec: glue_service_role: str(required=False) database_name: str(required=True) - max_crawling_time: num(requried=False) \ No newline at end of file + max_crawling_time: num(requried=False) diff --git a/buildstockbatch/test/test_inputs/enforce-validate-options-good.yml b/buildstockbatch/test/test_inputs/enforce-validate-options-good.yml index 7a26570d..6b0dc1d8 100644 --- a/buildstockbatch/test/test_inputs/enforce-validate-options-good.yml +++ b/buildstockbatch/test/test_inputs/enforce-validate-options-good.yml @@ -12,7 +12,7 @@ upgrades: - or: - Insulation Slab|Good Option - Insulation Slab|None - - not: + - not: - Insulation Wall|Good Option - Insulation Wall|Good Option # Two Insulation Wall under 'not'. Should be caught by logic validator - and: diff --git a/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/Gemfile b/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/Gemfile index ffadc77f..4e30acd4 100644 --- a/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/Gemfile +++ b/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/Gemfile @@ -28,4 +28,4 @@ group :native_ext do end # leave this line in for now as we may try to get nokogiri to compile correctly on windows -# gem 'nokogiri', '= 1.11.0.rc1.20200331222433', :github => 'jmarrec/nokogiri', :ref => 'MSVC_support' # master of 2020-03-31 + gemspec commit \ No newline at end of file +# gem 'nokogiri', '= 1.11.0.rc1.20200331222433', :github => 'jmarrec/nokogiri', :ref => 'MSVC_support' # master of 2020-03-31 + gemspec commit diff --git a/buildstockbatch/test/test_inputs/test_openstudio_buildstock_bad/resources/options_lookup.tsv b/buildstockbatch/test/test_inputs/test_openstudio_buildstock_bad/resources/options_lookup.tsv index 1237f8fb..d8569f79 100644 --- a/buildstockbatch/test/test_inputs/test_openstudio_buildstock_bad/resources/options_lookup.tsv +++ b/buildstockbatch/test/test_inputs/test_openstudio_buildstock_bad/resources/options_lookup.tsv @@ -1,29 +1,29 @@ -Parameter Name Option Name Measure Dir Measure Arg 1 Measure Arg 2 ... -Location AL_Birmingham.Muni.AP.722280 -Location AL_Huntsville.Intl.AP-Jones.Field.723230 -Location AL_Mobile-Rgnl.AP.722230 -Vintage <1940 -Vintage 1940s -Vintage <1950 -Vintage 1950s -Vintage 1960s -Vintage 1970s -Vintage 1980s -Vintage 1990s -Vintage 2000s -Vintage 2010s +Parameter Name Option Name Measure Dir Measure Arg 1 Measure Arg 2 ... 
+Location AL_Birmingham.Muni.AP.722280 +Location AL_Huntsville.Intl.AP-Jones.Field.723230 +Location AL_Mobile-Rgnl.AP.722230 +Vintage <1940 +Vintage 1940s +Vintage <1950 +Vintage 1950s +Vintage 1960s +Vintage 1970s +Vintage 1980s +Vintage 1990s +Vintage 2000s +Vintage 2010s Insulation Slab None Insulation Slab(Good) Option ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=0 gap_r=0 exterior_r=0 exterior_depth=0 -Insulation Slab Missing Argument ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=10 gap_r=5 exterior_r=0 +Insulation Slab Missing Argument ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=10 gap_r=5 exterior_r=0 Insulation Unfinished&Basement None -Insulation Unfinished Basement Good Option ResidentialConstructionsUnfinishedBasement wall_ins_height=0 wall_cavity_r=0 wall_install_grade=1 wall_cavity_depth_in=0 wall_filled_cavity=true wall_framing_factor=0 wall_rigid_r=0 wall_drywall_thick_in=0.5 ceiling_cavity_r=0 ceiling_install_grade=1 ceiling_framing_factor=0.13 ceiling_joist_height_in=9.25 slab_whole_r=0 +Insulation Unfinished Basement Good Option ResidentialConstructionsUnfinishedBasement wall_ins_height=0 wall_cavity_r=0 wall_install_grade=1 wall_cavity_depth_in=0 wall_filled_cavity=true wall_framing_factor=0 wall_rigid_r=0 wall_drywall_thick_in=0.5 ceiling_cavity_r=0 ceiling_install_grade=1 ceiling_framing_factor=0.13 ceiling_joist_height_in=9.25 slab_whole_r=0 Insulation Unfinished Basement Extra Argument ResidentialConstructionsUnfinishedBasement wall_ins_height=0 wall_cavity_r=0 wall_install_grade=1 wall_cavity_depth_in=0 wall_filled_cavity=true wall_framing_factor=0 wall_rigid_r=0 wall_drywall_thick_in=0.5 ceiling_cavity_r=13 ceiling_install_grade=1 ceiling_framing_factor=0.13 ceiling_joist_height_in=9.25 slab_whole_r=0 extra_arg=1 Insulation Finished|Basement None -Insulation Finished Basement Good Option ResidentialConstructionsFinishedBasement wall_ins_height=0 wall_cavity_r=0 wall_install_grade=1 wall_cavity_depth_in=0 wall_filled_cavity=true wall_framing_factor=0 wall_rigid_r=0 wall_drywall_thick_in=0.5 slab_whole_r=0 -Insulation Finished Basement Bad Value ResidentialConstructionsFinishedBasement wall_ins_height=4 wall_cavity_r=0 wall_install_grade=1 wall_cavity_depth_in=0 wall_filled_cavity=1.5 wall_framing_factor=0 wall_rigid_r=5 wall_drywall_thick_in=0.5 slab_whole_r=0 -Insulation Wall Good Option ResidentialConstructionsWallsWoodStud cavity_r=0 install_grade=1 cavity_depth_in=3.5 cavity_filled=false framing_factor=0.25 drywall_thick_in=0.5 osb_thick_in=0.5 rigid_r=0.0 "exterior_finish=Vinyl, Light" -Insulation Wall Missing Measure ResidentialConstructionsWallsWoodStud cavity_r=0 install_grade=1 cavity_depth_in=3.5 cavity_filled=false framing_factor=0.25 drywall_thick_in=0.5 osb_thick_in=0.5 rigid_r=0.0 "exterior_finish=Vinyl, Light" - ResidentialMissingMeasure +Insulation Finished Basement Good Option ResidentialConstructionsFinishedBasement wall_ins_height=0 wall_cavity_r=0 wall_install_grade=1 wall_cavity_depth_in=0 wall_filled_cavity=true wall_framing_factor=0 wall_rigid_r=0 wall_drywall_thick_in=0.5 slab_whole_r=0 +Insulation Finished Basement Bad Value ResidentialConstructionsFinishedBasement wall_ins_height=4 wall_cavity_r=0 wall_install_grade=1 wall_cavity_depth_in=0 wall_filled_cavity=1.5 wall_framing_factor=0 wall_rigid_r=5 wall_drywall_thick_in=0.5 slab_whole_r=0 +Insulation Wall Good Option ResidentialConstructionsWallsWoodStud cavity_r=0 install_grade=1 cavity_depth_in=3.5 
cavity_filled=false framing_factor=0.25 drywall_thick_in=0.5 osb_thick_in=0.5 rigid_r=0.0 "exterior_finish=Vinyl, Light" +Insulation Wall Missing Measure ResidentialConstructionsWallsWoodStud cavity_r=0 install_grade=1 cavity_depth_in=3.5 cavity_filled=false framing_factor=0.25 drywall_thick_in=0.5 osb_thick_in=0.5 rigid_r=0.0 "exterior_finish=Vinyl, Light" + ResidentialMissingMeasure Wall Insulation * perimeter_r=0 perimeter_width=0 whole_r=0 gap_r=0 exterior_r=0 exterior_depth=0 Wall Insulation R1 perimeter_r=0 perimeter_width=0 whole_r=0 gap_r=0 exterior_r=0 exterior_depth=0 Ceiling Insulation R1 perimeter_r=0 perimeter_width=0 whole_r=0 gap_r=0 exterior_r=0 exterior_depth=0 diff --git a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000001/in.osw b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000001/in.osw index 0ef9cac7..e1f38b4f 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000001/in.osw +++ b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000001/in.osw @@ -41,4 +41,4 @@ "measure_paths": [ "measures" ] -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000001/run/data_point_out.json b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000001/run/data_point_out.json index f10e0f6f..0ba7d9bf 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000001/run/data_point_out.json +++ b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000001/run/data_point_out.json @@ -222,4 +222,4 @@ "ServerDirectoryCleanup": { "applicable": true } -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000002/in.osw b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000002/in.osw index 8b04a99b..5778226e 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000002/in.osw +++ b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000002/in.osw @@ -41,4 +41,4 @@ "measure_paths": [ "measures" ] -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000002/run/failed.job b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000002/run/failed.job index 7c386d74..bb197a33 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000002/run/failed.job +++ b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000002/run/failed.job @@ -1 +1 @@ -Failed Workflow 2020-10-08 02:13:18 +0000 \ No newline at end of file +Failed Workflow 2020-10-08 02:13:18 +0000 diff --git a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000003/in.osw b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000003/in.osw index 46889feb..8556d6e3 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000003/in.osw +++ b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000003/in.osw @@ -41,4 +41,4 @@ "measure_paths": [ "measures" ] -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000003/run/data_point_out.json b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000003/run/data_point_out.json index b5f0978a..ddcb1175 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000003/run/data_point_out.json +++ b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000003/run/data_point_out.json @@ -222,4 +222,4 @@ "ServerDirectoryCleanup": { "applicable": 
true } -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000004/in.osw b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000004/in.osw index f5a527d8..c21616c0 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000004/in.osw +++ b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000004/in.osw @@ -41,4 +41,4 @@ "measure_paths": [ "measures" ] -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000004/run/data_point_out.json b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000004/run/data_point_out.json index 819dee83..bdc0600d 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000004/run/data_point_out.json +++ b/buildstockbatch/test/test_results/simulations_job0/up00/bldg0000004/run/data_point_out.json @@ -222,4 +222,4 @@ "ServerDirectoryCleanup": { "applicable": true } -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000001/in.osw b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000001/in.osw index 89f87b84..8be4b1d6 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000001/in.osw +++ b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000001/in.osw @@ -49,4 +49,4 @@ "measure_paths": [ "measures" ] -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000001/run/data_point_out.json b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000001/run/data_point_out.json index 95e549c3..3b35317e 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000001/run/data_point_out.json +++ b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000001/run/data_point_out.json @@ -283,4 +283,4 @@ "ServerDirectoryCleanup": { "applicable": true } -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000002/in.osw b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000002/in.osw index 4deb0505..f9a219a8 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000002/in.osw +++ b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000002/in.osw @@ -49,4 +49,4 @@ "measure_paths": [ "measures" ] -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000002/run/data_point_out.json b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000002/run/data_point_out.json index ff478bb4..86514c13 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000002/run/data_point_out.json +++ b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000002/run/data_point_out.json @@ -283,4 +283,4 @@ "ServerDirectoryCleanup": { "applicable": true } -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000003/in.osw b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000003/in.osw index 49414dc4..382a1e2f 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000003/in.osw +++ b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000003/in.osw @@ -49,4 +49,4 @@ "measure_paths": [ "measures" ] -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000003/run/data_point_out.json 
b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000003/run/data_point_out.json index eed3d0c3..742a8921 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000003/run/data_point_out.json +++ b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000003/run/data_point_out.json @@ -283,4 +283,4 @@ "ServerDirectoryCleanup": { "applicable": true } -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000004/in.osw b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000004/in.osw index 813af34a..169e94ea 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000004/in.osw +++ b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000004/in.osw @@ -49,4 +49,4 @@ "measure_paths": [ "measures" ] -} \ No newline at end of file +} diff --git a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000004/run/data_point_out.json b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000004/run/data_point_out.json index 9cb23c69..b12249db 100644 --- a/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000004/run/data_point_out.json +++ b/buildstockbatch/test/test_results/simulations_job0/up01/bldg0000004/run/data_point_out.json @@ -283,4 +283,4 @@ "ServerDirectoryCleanup": { "applicable": true } -} \ No newline at end of file +} diff --git a/docs/Makefile b/docs/Makefile index 169e2fe7..d057e3f6 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -19,4 +19,4 @@ livehtml: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/changelog/changelog_0_18.rst b/docs/changelog/changelog_0_18.rst index 6b09ca7e..09fd510d 100644 --- a/docs/changelog/changelog_0_18.rst +++ b/docs/changelog/changelog_0_18.rst @@ -110,4 +110,4 @@ :tags: comstock, workflow, feature :pullreq: 65 - Added :class:`~.workflow_generator.CommercialDefaultWorkflowGenerator` \ No newline at end of file + Added :class:`~.workflow_generator.CommercialDefaultWorkflowGenerator` diff --git a/docs/changelog/changelog_0_19.rst b/docs/changelog/changelog_0_19.rst index d0633c29..3df99bb8 100644 --- a/docs/changelog/changelog_0_19.rst +++ b/docs/changelog/changelog_0_19.rst @@ -47,8 +47,8 @@ :pullreq: 179 :tickets: 178 - Fixing dependency conflict in installation. - + Fixing dependency conflict in installation. + .. change:: :tags: schema, change :pullreq: 177 @@ -78,7 +78,7 @@ :tags: aws, bugfix :pullreq: 163 - Fix bug in AWS postprocessing. + Fix bug in AWS postprocessing. .. change:: :tags: bugfix, aws diff --git a/docs/changelog/changelog_0_20.rst b/docs/changelog/changelog_0_20.rst index ac1a6d3f..13757933 100644 --- a/docs/changelog/changelog_0_20.rst +++ b/docs/changelog/changelog_0_20.rst @@ -18,7 +18,7 @@ v0.20 Changelog :tags: documentation :pullreq: 203 - https://github.com/NREL/OpenStudio-BuildStock was renamed to https://github.com/NREL/resstock so references to + https://github.com/NREL/OpenStudio-BuildStock was renamed to https://github.com/NREL/resstock so references to OpenStudio-BuildStock in docs were replaced with references to resstock and/or comstock. .. 
change:: diff --git a/docs/changelog/changelog_0_21.rst b/docs/changelog/changelog_0_21.rst index 2fc6869b..ecd5f000 100644 --- a/docs/changelog/changelog_0_21.rst +++ b/docs/changelog/changelog_0_21.rst @@ -9,19 +9,19 @@ v0.21 Changelog .. change:: :tags: bugfix :pullreq: 232 - :tickets: + :tickets: There was a few days there when the version of some sublibrary (click) of dask was incompatible with the latest version of dask. We temporarily pinned the sublibrary so that new installs would work. They have fixed - that problem now, so this removes the restriction on that library. + that problem now, so this removes the restriction on that library. .. change:: :tags: bugfix :pullreq: 234 :tickets: - For ResStock the ``build_existing_model.sample_weight`` was inverse to what we would expect. The bug was + For ResStock the ``build_existing_model.sample_weight`` was inverse to what we would expect. The bug was identified in the residential workflow generator. .. change:: diff --git a/docs/changelog/changelog_2023_01_0.rst b/docs/changelog/changelog_2023_01_0.rst index 6d7fdd13..29b4439b 100644 --- a/docs/changelog/changelog_2023_01_0.rst +++ b/docs/changelog/changelog_2023_01_0.rst @@ -15,7 +15,7 @@ v2023.01.0 Changelog Adds ``include_timeseries_unmet_hours`` and ``timeseries_num_decimal_places`` arguments to the Residential HPXML Workflow Generator. - + .. change:: :tags: bugfix, validation :pulreq: 342 diff --git a/docs/changelog/changelog_2023_10_0.rst b/docs/changelog/changelog_2023_10_0.rst index df9584f2..42b8abec 100644 --- a/docs/changelog/changelog_2023_10_0.rst +++ b/docs/changelog/changelog_2023_10_0.rst @@ -18,7 +18,7 @@ v2023.10.0 Changelog :pullreq: 384 Introduce '*' as a valid option name in options_lookup.tsv to indicate a - parameter that can take any option and don't need to pass arguments to + parameter that can take any option and don't need to pass arguments to measures. Enables buildstock.csv validation for ComStock without blowing up the size of the options_lookup.tsv file. diff --git a/docs/changelog/changelog_dev.rst b/docs/changelog/changelog_dev.rst index 5195d010..bd619d14 100644 --- a/docs/changelog/changelog_dev.rst +++ b/docs/changelog/changelog_dev.rst @@ -34,4 +34,4 @@ Development Changelog :tags: general, bugfix :pullreq: 426 - A bugfix for gracefully handling empty data_point_out.json files. \ No newline at end of file + A bugfix for gracefully handling empty data_point_out.json files. diff --git a/docs/changelog/migration_0_19.rst b/docs/changelog/migration_0_19.rst index 2e765707..d6cc956e 100644 --- a/docs/changelog/migration_0_19.rst +++ b/docs/changelog/migration_0_19.rst @@ -10,7 +10,7 @@ General ======= Changes between these versions focused on bugfixes, performance improvements, -and documentation updates. See :doc:`changelog_0_19` for details. +and documentation updates. See :doc:`changelog_0_19` for details. Schema Updates ============== diff --git a/docs/changelog/migration_0_20.rst b/docs/changelog/migration_0_20.rst index 00fc5a34..f3e03e06 100644 --- a/docs/changelog/migration_0_20.rst +++ b/docs/changelog/migration_0_20.rst @@ -173,7 +173,7 @@ Reporting Measures in Workflows ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The ``reporting_measures`` configuration key that now resides under ``workflow_generator.args`` -allows measure arguments to be passed to reporting measures. +allows measure arguments to be passed to reporting measures. 
Old Spec: diff --git a/docs/changelog/migration_0_21.rst b/docs/changelog/migration_0_21.rst index 0c996e9a..7bb195db 100644 --- a/docs/changelog/migration_0_21.rst +++ b/docs/changelog/migration_0_21.rst @@ -11,7 +11,7 @@ General ======= Changes between these versions focused on bugfixes, performance improvements, -and documentation updates. See :doc:`changelog_0_21` for details. +and documentation updates. See :doc:`changelog_0_21` for details. Schema Updates ============== diff --git a/docs/changelog/migration_2022_10_0.rst b/docs/changelog/migration_2022_10_0.rst index b0cbcfab..690f57c4 100644 --- a/docs/changelog/migration_2022_10_0.rst +++ b/docs/changelog/migration_2022_10_0.rst @@ -15,7 +15,7 @@ It has been a while since our last formal release. A lot has changed in both ResStock and ComStock in that time. This is the first release that formally supports the new HPXML based workflow for ResStock. It is also the first formal version that supports ComStock. (Prior to this ComStock was being run from a -separate branch.) +separate branch.) Most users have been using their own branches or environments built from ``develop`` to get the features they need. Hopefully this allows most users to diff --git a/docs/changelog/migration_2022_12_0.rst b/docs/changelog/migration_2022_12_0.rst index 713d08fa..cd4dd4b7 100644 --- a/docs/changelog/migration_2022_12_0.rst +++ b/docs/changelog/migration_2022_12_0.rst @@ -24,4 +24,4 @@ See :doc:`changelog_2022_12_0` for details of changes since the last version. Schema Updates ============== -No schema updates this time. +No schema updates this time. diff --git a/docs/changelog/migration_2023_01_0.rst b/docs/changelog/migration_2023_01_0.rst index d6878df3..65d177e6 100644 --- a/docs/changelog/migration_2023_01_0.rst +++ b/docs/changelog/migration_2023_01_0.rst @@ -27,4 +27,4 @@ Generator: ``timeseries_num_decimal_places`` and Schema Updates ============== -No schema updates this time. +No schema updates this time. diff --git a/docs/changelog/migration_2023_05_0.rst b/docs/changelog/migration_2023_05_0.rst index e98febc5..1abe872f 100644 --- a/docs/changelog/migration_2023_05_0.rst +++ b/docs/changelog/migration_2023_05_0.rst @@ -49,4 +49,4 @@ A ``references`` section can now be included in the project yaml file. This can be used to define other parts of the file that you want to reuse using yaml anchors and references. -``eagle.minutes_per_sim`` can now be extended up to 8 hours. \ No newline at end of file +``eagle.minutes_per_sim`` can now be extended up to 8 hours. diff --git a/docs/samplers/precomputed.rst b/docs/samplers/precomputed.rst index 3d589288..bbd29a2d 100644 --- a/docs/samplers/precomputed.rst +++ b/docs/samplers/precomputed.rst @@ -3,7 +3,7 @@ Precomputed Sampler The Precomputed Sampler provides a way to directly provide buildstockbatch a sample of buildings to simulate. This can be useful for a variety of cases, including where you previously ran sampling for ResStock or ComStock and want to rerun the same set of buildings with a different set of upgrades. -This sampler cannot be used with a downselect (i.e. there is no precomputed downselect sampler). To downselect the buildings in a precomputed sample, simply remove the buildings you don't want to run from the sample file (buildstock.csv). +This sampler cannot be used with a downselect (i.e. there is no precomputed downselect sampler). To downselect the buildings in a precomputed sample, simply remove the buildings you don't want to run from the sample file (buildstock.csv). 
Configuration Example ~~~~~~~~~~~~~~~~~~~~~ @@ -18,4 +18,4 @@ Configuration Example Arguments ~~~~~~~~~ -- ``sample_file``: A csv file containing the building sample--one row per building. The format is that the first column is the building_id, usually starting at one and incrementing from there, and following columns each represent a building characteristic. The characteristic columns expected depend on the workflow generator to be used (ResStock or ComStock). \ No newline at end of file +- ``sample_file``: A csv file containing the building sample--one row per building. The format is that the first column is the building_id, usually starting at one and incrementing from there, and following columns each represent a building characteristic. The characteristic columns expected depend on the workflow generator to be used (ResStock or ComStock). diff --git a/docs/samplers/residential_quota.rst b/docs/samplers/residential_quota.rst index e91f2aed..ffa8e6ed 100644 --- a/docs/samplers/residential_quota.rst +++ b/docs/samplers/residential_quota.rst @@ -1,7 +1,7 @@ Residential Quota Sampler ------------------------- -The Residential Quota sampler utilizes a `quota-based sampling method `_ to determine the buildings to simulate. It is the primary sampling algorithm used in ResStock. +The Residential Quota sampler utilizes a `quota-based sampling method `_ to determine the buildings to simulate. It is the primary sampling algorithm used in ResStock. Configuration Example ~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/samplers/residential_quota_downselect.rst b/docs/samplers/residential_quota_downselect.rst index d0f9b267..fb0daa81 100644 --- a/docs/samplers/residential_quota_downselect.rst +++ b/docs/samplers/residential_quota_downselect.rst @@ -6,7 +6,7 @@ included in a project. For instance one might want to run the simulation only in one climate region or for certain vintages. However, it can be a considerable effort to create a new project and modify the housing characteristic distributions. The Residential Quota Downselect sampler adds a downselection -capability to the :doc:`residential_quota`. +capability to the :doc:`residential_quota`. Downselecting can be performed in one of two ways: with and without resampling. Downselecting with resampling samples twice, once to determine how much smaller @@ -16,7 +16,7 @@ specified in ``n_datapoints``. Downselecting without resampling skips that step. In this case the total sampled buildings returned will be the number left over after sampling the entire stock -and then filtering down to the buildings that meet the criteria. +and then filtering down to the buildings that meet the criteria. diff --git a/docs/workflow_generators/index.rst b/docs/workflow_generators/index.rst index ba0a50c7..d23eacc7 100644 --- a/docs/workflow_generators/index.rst +++ b/docs/workflow_generators/index.rst @@ -11,4 +11,4 @@ Workflow Generators The ``residential_default`` workflow generator has been deprecated and removed. We recommend upgrading your project to the use latest - ``residential_hpxml`` workflow generator. \ No newline at end of file + ``residential_hpxml`` workflow generator. 
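The precomputed sampler documentation above pins down the expected sample_file layout: a building_id column that usually starts at 1 and increments, followed by one column per housing characteristic. A minimal sketch of building such a file follows; the characteristic columns are illustrative, borrowed from the options_lookup.tsv shown earlier in this patch, and a real buildstock.csv needs whatever characteristics the project's workflow generator (ResStock or ComStock) expects.

    # Hypothetical buildstock.csv for the precomputed sampler. Only the
    # building_id column is fixed by the documented format; the other
    # columns here are illustrative housing characteristics.
    import pandas as pd

    sample = pd.DataFrame(
        {
            "building_id": [1, 2, 3],  # usually starts at 1 and increments
            "Location": [
                "AL_Birmingham.Muni.AP.722280",
                "AL_Huntsville.Intl.AP-Jones.Field.723230",
                "AL_Mobile-Rgnl.AP.722230",
            ],
            "Vintage": ["1950s", "1980s", "2000s"],
        }
    )
    sample.to_csv("buildstock.csv", index=False)

Per the note in precomputed.rst, downselecting a precomputed sample is then just a matter of filtering rows out of this file before pointing the sampler's sample_file argument at it.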
diff --git a/docs/workflow_generators/residential_hpxml.rst b/docs/workflow_generators/residential_hpxml.rst
index e535f1b6..e5cefd25 100644
--- a/docs/workflow_generators/residential_hpxml.rst
+++ b/docs/workflow_generators/residential_hpxml.rst
@@ -107,7 +107,7 @@ Arguments
   - ``include_timeseries_total_loads``: Generates timeseries total heating, cooling, and hot water loads (in kBtu) for the building.
   - ``include_timeseries_component_loads``: Generates timeseries heating and cooling loads (in kBtu) disaggregated by component type (e.g., Walls, Windows, Infiltration, Ducts, etc.).
   - ``include_timeseries_unmet_hours``: Generates timeseries unmet hours for heating and cooling.
-  - ``include_timeseries_zone_temperatures``: Generates timeseries average temperatures (in deg-F) for each space modeled (e.g., living space, attic, garage, basement, crawlspace, etc.). 
+  - ``include_timeseries_zone_temperatures``: Generates timeseries average temperatures (in deg-F) for each space modeled (e.g., living space, attic, garage, basement, crawlspace, etc.).
   - ``include_timeseries_airflows``: Generates timeseries airflow rates (in cfm) for infiltration, mechanical ventilation (including clothes dryer exhaust), natural ventilation, whole house fans.
   - ``include_timeseries_weather``: Generates timeseries weather file data including outdoor temperatures, relative humidity, wind speed, and solar.
   - ``include_timeseries_resilience``: Generates timeseries resilience outputs.

From 1945e3f6b32eb2e02eb5adddefc26bad8f668254 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Tue, 6 Feb 2024 14:15:11 -0700
Subject: [PATCH 38/53] running black again

---
 buildstockbatch/test/test_docker_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/buildstockbatch/test/test_docker_base.py b/buildstockbatch/test/test_docker_base.py
index a6829da4..88e70f1a 100644
--- a/buildstockbatch/test/test_docker_base.py
+++ b/buildstockbatch/test/test_docker_base.py
@@ -1,4 +1,5 @@
 """Tests for the DockerBatchBase class."""
+
 from fsspec.implementations.local import LocalFileSystem
 import gzip
 import json

From f695f116d77935b86e7759c8e49feb42f273bb75 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Tue, 6 Feb 2024 16:42:41 -0700
Subject: [PATCH 39/53] removing black config in pyproject.toml

---
 pyproject.toml | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 55ec8d78..00000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,2 +0,0 @@
-[tool.black]
-line-length = 120

From 511d733adcbfb4d2453f0f52c346ce5365a227b6 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Tue, 6 Feb 2024 16:45:31 -0700
Subject: [PATCH 40/53] switching back to the default line length

---
 buildstockbatch/__version__.py                |   4 +-
 buildstockbatch/aws/aws.py                    | 284 +++++++++++++-----
 buildstockbatch/aws/awsbase.py                |  38 ++-
 buildstockbatch/base.py                       | 260 ++++++++++++----
 buildstockbatch/cloud/docker_base.py          |  49 ++-
 buildstockbatch/hpc.py                        | 178 ++++++++---
 buildstockbatch/local.py                      |  75 +++--
 buildstockbatch/postprocessing.py             | 238 +++++++++++----
 buildstockbatch/sampler/base.py               |  12 +-
 buildstockbatch/sampler/commercial_sobol.py   |  40 ++-
 buildstockbatch/sampler/downselect.py         |  31 +-
 buildstockbatch/sampler/residential_quota.py  |   9 +-
 buildstockbatch/test/conftest.py              |  18 +-
 buildstockbatch/test/shared_testing_stuff.py  |   8 +-
 buildstockbatch/test/test_base.py             |  90 ++++--
 buildstockbatch/test/test_docker_base.py      |  44 ++-
 buildstockbatch/test/test_hpc.py              | 150 +++++++--
buildstockbatch/test/test_local.py | 37 ++- buildstockbatch/test/test_postprocessing.py | 26 +- buildstockbatch/test/test_validation.py | 104 +++++-- buildstockbatch/utils.py | 29 +- .../workflow_generator/commercial.py | 22 +- .../workflow_generator/residential_hpxml.py | 69 +++-- .../test_workflow_generator.py | 129 ++++++-- docs/conf.py | 12 +- setup.py | 4 +- 26 files changed, 1502 insertions(+), 458 deletions(-) diff --git a/buildstockbatch/__version__.py b/buildstockbatch/__version__.py index 552e75f2..d32e165c 100644 --- a/buildstockbatch/__version__.py +++ b/buildstockbatch/__version__.py @@ -9,4 +9,6 @@ __author__ = "Noel Merket" __author_email__ = "noel.merket@nrel.gov" __license__ = "BSD-3" -__copyright__ = "Copyright {} The Alliance for Sustainable Energy".format(dt.date.today().year) +__copyright__ = "Copyright {} The Alliance for Sustainable Energy".format( + dt.date.today().year +) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index ea0a5e2a..a812d29f 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -62,7 +62,9 @@ def backoff(thefunc, *args, **kwargs): caught_error = False for pat in error_patterns: if re.search(pat, error_code): - logger.debug(f"{error_code}: Waiting and retrying in {delay} seconds") + logger.debug( + f"{error_code}: Waiting and retrying in {delay} seconds" + ) caught_error = True time.sleep(delay) delay *= backoff_mult @@ -88,7 +90,9 @@ def filename_generator(): if filename.startswith("."): continue local_filepath = pathlib.Path(dirpath, filename) - s3_key = pathlib.PurePosixPath(prefix, local_filepath.relative_to(local_dir_abs)) + s3_key = pathlib.PurePosixPath( + prefix, local_filepath.relative_to(local_dir_abs) + ) yield local_filepath, s3_key logger.debug("Uploading {} => {}/{}".format(local_dir_abs, bucket, prefix)) @@ -131,7 +135,9 @@ def __init__(self, job_name, aws_config, boto3_session): self.batch = self.session.client("batch", config=boto_client_config) self.ec2 = self.session.client("ec2", config=boto_client_config) self.ec2r = self.session.resource("ec2", config=boto_client_config) - self.step_functions = self.session.client("stepfunctions", config=boto_client_config) + self.step_functions = self.session.client( + "stepfunctions", config=boto_client_config + ) self.aws_lambda = self.session.client("lambda", config=boto_client_config) self.s3 = self.session.client("s3", config=boto_client_config) self.s3_res = self.session.resource("s3", config=boto_client_config) @@ -266,7 +272,9 @@ def create_vpc(self): # Create the public subnet - pub_response = self.ec2.create_subnet(CidrBlock=self.pub_subnet_cidr, VpcId=self.vpc_id) + pub_response = self.ec2.create_subnet( + CidrBlock=self.pub_subnet_cidr, VpcId=self.vpc_id + ) logger.info("EIP allocated.") @@ -298,7 +306,9 @@ def create_vpc(self): # Create an internet gateway - self.ec2.attach_internet_gateway(InternetGatewayId=self.internet_gateway_id, VpcId=self.vpc_id) + self.ec2.attach_internet_gateway( + InternetGatewayId=self.internet_gateway_id, VpcId=self.vpc_id + ) logger.info("Internet Gateway attached.") @@ -333,7 +343,9 @@ def create_vpc(self): # Create a NAT Gateway - nat_response = self.ec2.create_nat_gateway(AllocationId=self.nat_ip_allocation, SubnetId=self.pub_vpc_subnet_id) + nat_response = self.ec2.create_nat_gateway( + AllocationId=self.nat_ip_allocation, SubnetId=self.pub_vpc_subnet_id + ) self.nat_gateway_id = nat_response["NatGateway"]["NatGatewayId"] @@ -361,10 +373,14 @@ def create_vpc(self): # Associate the private route to the 
private subnet - self.ec2.associate_route_table(RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_1) + self.ec2.associate_route_table( + RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_1 + ) logger.info("Route table associated with subnet.") - self.ec2.associate_route_table(RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_2) + self.ec2.associate_route_table( + RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_2 + ) logger.info("Route table associated with subnet.") # Associate the NAT gateway with the private route @@ -423,7 +439,9 @@ def create_batch_service_roles(self): self.batch_service_role_name, "batch", f"Service role for Batch environment {self.job_identifier}", - managed_policie_arns=["arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole"], + managed_policie_arns=[ + "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole" + ], ) # Instance Role for Batch compute environment @@ -432,13 +450,17 @@ def create_batch_service_roles(self): self.batch_instance_role_name, "ec2", f"Instance role for Batch compute environment {self.job_identifier}", - managed_policie_arns=["arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"], + managed_policie_arns=[ + "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role" + ], ) # Instance Profile try: - response = self.iam.create_instance_profile(InstanceProfileName=self.batch_instance_profile_name) + response = self.iam.create_instance_profile( + InstanceProfileName=self.batch_instance_profile_name + ) self.instance_profile_arn = response["InstanceProfile"]["Arn"] @@ -452,7 +474,9 @@ def create_batch_service_roles(self): except Exception as e: if "EntityAlreadyExists" in str(e): logger.info("ECS Instance Profile not created - already exists") - response = self.iam.get_instance_profile(InstanceProfileName=self.batch_instance_profile_name) + response = self.iam.get_instance_profile( + InstanceProfileName=self.batch_instance_profile_name + ) self.instance_profile_arn = response["InstanceProfile"]["Arn"] # ECS Task Policy @@ -557,7 +581,9 @@ def create_batch_service_roles(self): self.batch_spot_service_role_name, "spotfleet", f"Spot Fleet role for Batch compute environment {self.job_identifier}", - managed_policie_arns=["arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole"], + managed_policie_arns=[ + "arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole" + ], ) def create_compute_environment(self, maxCPUs=10000): @@ -582,13 +608,18 @@ def create_compute_environment(self, maxCPUs=10000): }, ) except ClientError as error: - if error.response["Error"]["Code"] == "InvalidLaunchTemplateName.AlreadyExistsException": + if ( + error.response["Error"]["Code"] + == "InvalidLaunchTemplateName.AlreadyExistsException" + ): logger.debug("Launch template exists, skipping creation") else: raise error while True: - lt_resp = self.ec2.describe_launch_templates(LaunchTemplateNames=[self.launch_template_name]) + lt_resp = self.ec2.describe_launch_templates( + LaunchTemplateNames=[self.launch_template_name] + ) launch_templates = lt_resp["LaunchTemplates"] next_token = lt_resp.get("NextToken") while next_token: @@ -599,9 +630,13 @@ def create_compute_environment(self, maxCPUs=10000): launch_templates.extend(lt_resp["LaunchTemplates"]) next_token = lt_resp.get("NextToken") n_launch_templates = len(launch_templates) - assert n_launch_templates <= 1, f"There are {n_launch_templates} launch templates, this shouldn't 
happen." + assert ( + n_launch_templates <= 1 + ), f"There are {n_launch_templates} launch templates, this shouldn't happen." if n_launch_templates == 0: - logger.debug(f"Waiting for the launch template {self.launch_template_name} to be created") + logger.debug( + f"Waiting for the launch template {self.launch_template_name} to be created" + ) time.sleep(5) if n_launch_templates == 1: break @@ -633,7 +668,9 @@ def create_compute_environment(self, maxCPUs=10000): else: compute_resources["type"] = "EC2" - compute_resources["tags"] = self.get_tags(Name=f"{self.job_identifier} batch instance") + compute_resources["tags"] = self.get_tags( + Name=f"{self.job_identifier} batch instance" + ) self.batch.create_compute_environment( computeEnvironmentName=self.batch_compute_environment_name, @@ -644,11 +681,15 @@ def create_compute_environment(self, maxCPUs=10000): tags=self.get_tags(), ) - logger.info(f"Compute environment {self.batch_compute_environment_name} created.") + logger.info( + f"Compute environment {self.batch_compute_environment_name} created." + ) except Exception as e: if "Object already exists" in str(e): - logger.info(f"Compute environment {self.batch_compute_environment_name} not created - already exists") + logger.info( + f"Compute environment {self.batch_compute_environment_name} not created - already exists" + ) else: raise @@ -680,7 +721,9 @@ def create_job_queue(self): except Exception as e: if "Object already exists" in str(e): - logger.info(f"Job queue {self.batch_job_queue_name} not created - already exists") + logger.info( + f"Job queue {self.batch_job_queue_name} not created - already exists" + ) response = self.batch.describe_job_queues( jobQueues=[ self.batch_job_queue_name, @@ -691,7 +734,10 @@ def create_job_queue(self): elif "is not valid" in str(e): # Need to wait a second for the compute environment to complete registration - logger.warning("wating a few seconds for compute environment creation: " + str(e)) + logger.warning( + "wating a few seconds for compute environment creation: " + + str(e) + ) time.sleep(5) else: @@ -747,7 +793,10 @@ def submit_job(self, array_size=4): except Exception as e: if "not in VALID state" in str(e): # Need to wait a second for the compute environment to complete registration - logger.warning("5 second sleep initiated to wait for job queue creation due to error: " + str(e)) + logger.warning( + "5 second sleep initiated to wait for job queue creation due to error: " + + str(e) + ) time.sleep(5) else: raise @@ -787,25 +836,35 @@ def clean(self): default_group_id = group["GroupId"] dsg = self.ec2r.SecurityGroup(default_group_id) if len(dsg.ip_permissions_egress): - response = dsg.revoke_egress(IpPermissions=dsg.ip_permissions_egress) + response = dsg.revoke_egress( + IpPermissions=dsg.ip_permissions_egress + ) try: - self.batch.update_job_queue(jobQueue=self.batch_job_queue_name, state="DISABLED") + self.batch.update_job_queue( + jobQueue=self.batch_job_queue_name, state="DISABLED" + ) while True: try: - response = self.batch.delete_job_queue(jobQueue=self.batch_job_queue_name) + response = self.batch.delete_job_queue( + jobQueue=self.batch_job_queue_name + ) logger.info(f"Job queue {self.batch_job_queue_name} deleted.") break except Exception as e: if "Cannot delete, resource is being modified" in str(e): - logger.info("Job queue being modified - sleeping until ready...") + logger.info( + "Job queue being modified - sleeping until ready..." 
+ ) time.sleep(5) else: raise except Exception as e: if "does not exist" in str(e): - logger.info(f"Job queue {self.batch_job_queue_name} missing, skipping...") + logger.info( + f"Job queue {self.batch_job_queue_name} missing, skipping..." + ) # Delete compute enviornment @@ -819,26 +878,38 @@ def clean(self): response = self.batch.delete_compute_environment( computeEnvironment=self.batch_compute_environment_name ) - logger.info(f"Compute environment {self.batch_compute_environment_name} deleted.") + logger.info( + f"Compute environment {self.batch_compute_environment_name} deleted." + ) break except Exception as e: - if "Cannot delete, resource is being modified" in str(e) or "found existing JobQueue" in str(e): - logger.info("Compute environment being modified - sleeping until ready...") + if "Cannot delete, resource is being modified" in str( + e + ) or "found existing JobQueue" in str(e): + logger.info( + "Compute environment being modified - sleeping until ready..." + ) time.sleep(5) else: raise except Exception as e: if "does not exist" in str(e): - logger.info(f"Compute environment {self.batch_compute_environment_name} missing, skipping...") + logger.info( + f"Compute environment {self.batch_compute_environment_name} missing, skipping..." + ) else: raise # Delete Launch Template try: - self.ec2.delete_launch_template(LaunchTemplateName=self.launch_template_name) + self.ec2.delete_launch_template( + LaunchTemplateName=self.launch_template_name + ) except Exception as e: if "does not exist" in str(e): - logger.info(f"Launch template {self.launch_template_name} does not exist, skipping...") + logger.info( + f"Launch template {self.launch_template_name} does not exist, skipping..." + ) else: raise @@ -846,7 +917,9 @@ def clean(self): self.iam_helper.delete_role(self.batch_spot_service_role_name) self.iam_helper.delete_role(self.batch_ecs_task_role_name) # Instance profile order of removal - self.iam_helper.remove_role_from_instance_profile(self.batch_instance_profile_name) + self.iam_helper.remove_role_from_instance_profile( + self.batch_instance_profile_name + ) self.iam_helper.delete_role(self.batch_instance_role_name) self.iam_helper.delete_instance_profile(self.batch_instance_profile_name) @@ -866,7 +939,9 @@ def clean(self): for vpc in response["Vpcs"]: this_vpc = vpc["VpcId"] - s3gw_response = self.ec2.describe_vpc_endpoints(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) + s3gw_response = self.ec2.describe_vpc_endpoints( + Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] + ) for s3gw in s3gw_response["VpcEndpoints"]: this_s3gw = s3gw["VpcEndpointId"] @@ -874,7 +949,9 @@ def clean(self): if s3gw["State"] != "deleted": self.ec2.delete_vpc_endpoints(VpcEndpointIds=[this_s3gw]) - ng_response = self.ec2.describe_nat_gateways(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) + ng_response = self.ec2.describe_nat_gateways( + Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] + ) for natgw in ng_response["NatGateways"]: this_natgw = natgw["NatGatewayId"] @@ -882,7 +959,9 @@ def clean(self): if natgw["State"] != "deleted": self.ec2.delete_nat_gateway(NatGatewayId=this_natgw) - rtas_response = self.ec2.describe_route_tables(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) + rtas_response = self.ec2.describe_route_tables( + Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] + ) for route_table in rtas_response["RouteTables"]: route_table_id = route_table["RouteTableId"] @@ -894,7 +973,9 @@ def clean(self): rt_counter = 10 while rt_counter: try: - response = 
self.ec2.delete_route_table(RouteTableId=route_table_id) + response = self.ec2.delete_route_table( + RouteTableId=route_table_id + ) logger.info("Route table removed.") break except Exception as e: @@ -918,14 +999,20 @@ def clean(self): try: try: self.ec2.detach_internet_gateway( - InternetGatewayId=internet_gateway["InternetGatewayId"], + InternetGatewayId=internet_gateway[ + "InternetGatewayId" + ], VpcId=attachment["VpcId"], ) except Exception as e: - logger.info(f"Error on Internet Gateway disassociation - ignoring... {str(e)}") + logger.info( + f"Error on Internet Gateway disassociation - ignoring... {str(e)}" + ) self.ec2.delete_internet_gateway( - InternetGatewayId=internet_gateway["InternetGatewayId"] + InternetGatewayId=internet_gateway[ + "InternetGatewayId" + ] ) logger.info("Internet Gateway deleted.") break @@ -939,7 +1026,9 @@ def clean(self): else: raise - subn_response = self.ec2.describe_subnets(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) + subn_response = self.ec2.describe_subnets( + Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] + ) for subnet in subn_response["Subnets"]: while True: @@ -948,7 +1037,9 @@ def clean(self): break except Exception as e: if "DependencyViolation" in str(e): - logger.info("Subnet cannot be deleted as dependencies are still being deleted. Sleeping...") + logger.info( + "Subnet cannot be deleted as dependencies are still being deleted. Sleeping..." + ) time.sleep(10) else: raise @@ -984,11 +1075,15 @@ class AwsBatch(DockerBatchBase): def __init__(self, project_filename): super().__init__(project_filename) - self.job_identifier = re.sub("[^0-9a-zA-Z]+", "_", self.cfg["aws"]["job_identifier"])[:10] + self.job_identifier = re.sub( + "[^0-9a-zA-Z]+", "_", self.cfg["aws"]["job_identifier"] + )[:10] self.project_filename = project_filename self.region = self.cfg["aws"]["region"] - self.ecr = boto3.client("ecr", region_name=self.region, config=boto_client_config) + self.ecr = boto3.client( + "ecr", region_name=self.region, config=boto_client_config + ) self.s3 = boto3.client("s3", region_name=self.region, config=boto_client_config) self.s3_bucket = self.cfg["aws"]["s3"]["bucket"] self.s3_bucket_prefix = self.cfg["aws"]["s3"]["prefix"].rstrip("/") @@ -1000,7 +1095,9 @@ def __init__(self, project_filename): def validate_dask_settings(project_file): cfg = get_project_configuration(project_file) if "emr" in cfg["aws"]: - logger.warning("The `aws.emr` configuration is no longer used and is ignored. Recommend removing.") + logger.warning( + "The `aws.emr` configuration is no longer used and is ignored. Recommend removing." + ) dask_cfg = cfg["aws"]["dask"] errors = [] mem_rules = { @@ -1013,16 +1110,22 @@ def validate_dask_settings(project_file): for node_type in ("scheduler", "worker"): mem = dask_cfg.get(f"{node_type}_memory", 8 * 1024) if mem % 1024 != 0: - errors.append(f"`aws.dask.{node_type}_memory` = {mem}, needs to be a multiple of 1024.") + errors.append( + f"`aws.dask.{node_type}_memory` = {mem}, needs to be a multiple of 1024." + ) mem_gb = mem // 1024 - min_gb, max_gb, incr_gb = mem_rules[dask_cfg.get(f"{node_type}_cpu", 2 * 1024)] + min_gb, max_gb, incr_gb = mem_rules[ + dask_cfg.get(f"{node_type}_cpu", 2 * 1024) + ] if not (min_gb <= mem_gb <= max_gb and (mem_gb - min_gb) % incr_gb == 0): errors.append( f"`aws.dask.{node_type}_memory` = {mem}, " f"should be between {min_gb * 1024} and {max_gb * 1024} in a multiple of {incr_gb * 1024}." 
 )
 if errors:
- errors.append("See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html")
+ errors.append(
+ "See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html"
+ )
 raise ValidationError("\n".join(errors))

 return True
@@ -1071,22 +1174,32 @@ def build_image(self):
 """
 root_path = pathlib.Path(os.path.abspath(__file__)).parent.parent.parent
 if not (root_path / "Dockerfile").exists():
- raise RuntimeError(f"The needs to be run from the root of the repo, found {root_path}")
+ raise RuntimeError(
+ f"This needs to be run from the root of the repo, found {root_path}"
+ )

 # Make the buildstock/resources/.aws_docker_image dir to store logs
- local_log_dir = os.path.join(self.buildstock_dir, "resources", ".aws_docker_image")
+ local_log_dir = os.path.join(
+ self.buildstock_dir, "resources", ".aws_docker_image"
+ )
 if not os.path.exists(local_log_dir):
 os.makedirs(local_log_dir)

 # Determine whether or not to build the image with custom gems bundled in
 if self.cfg.get("baseline", dict()).get("custom_gems", False):
 # Ensure the custom Gemfile exists in the buildstock dir
- local_gemfile_path = os.path.join(self.buildstock_dir, "resources", "Gemfile")
+ local_gemfile_path = os.path.join(
+ self.buildstock_dir, "resources", "Gemfile"
+ )
 if not os.path.exists(local_gemfile_path):
- raise AttributeError(f"baseline:custom_gems = True, but did not find Gemfile at {local_gemfile_path}")
+ raise AttributeError(
+ f"baseline:custom_gems = True, but did not find Gemfile at {local_gemfile_path}"
+ )

 # Copy the custom Gemfile into the buildstockbatch repo
- bsb_root = os.path.join(os.path.abspath(__file__), os.pardir, os.pardir, os.pardir)
+ bsb_root = os.path.join(
+ os.path.abspath(__file__), os.pardir, os.pardir, os.pardir
+ )
 new_gemfile_path = os.path.join(bsb_root, "Gemfile")
 shutil.copyfile(local_gemfile_path, new_gemfile_path)
 logger.info(f"Copying custom Gemfile from {local_gemfile_path}")
@@ -1099,7 +1212,9 @@ def build_image(self):
 # which stops before bundling custom gems into the image
 stage = "buildstockbatch"

- logger.info(f"Building docker image stage: {stage} from OpenStudio {self.os_version}")
+ logger.info(
+ f"Building docker image stage: {stage} from OpenStudio {self.os_version}"
+ )
 img, build_logs = self.docker_client.images.build(
 path=str(root_path),
 tag=self.docker_image,
@@ -1151,16 +1266,22 @@ def push_image(self):
 """
 auth_token = self.ecr.get_authorization_token()
 dkr_user, dkr_pass = (
- base64.b64decode(auth_token["authorizationData"][0]["authorizationToken"]).decode("ascii").split(":")
+ base64.b64decode(auth_token["authorizationData"][0]["authorizationToken"])
+ .decode("ascii")
+ .split(":")
 )
 repo_url = self.container_repo["repositoryUri"]
 registry_url = "https://" + repo_url.split("/")[0]
- resp = self.docker_client.login(username=dkr_user, password=dkr_pass, registry=registry_url)
+ resp = self.docker_client.login(
+ username=dkr_user, password=dkr_pass, registry=registry_url
+ )
 logger.debug(resp)
 image = self.docker_client.images.get(self.docker_image)
 image.tag(repo_url, tag=self.job_identifier)
 last_status = None
- for x in self.docker_client.images.push(repo_url, tag=self.job_identifier, stream=True):
+ for x in self.docker_client.images.push(
+ repo_url, tag=self.job_identifier, stream=True
+ ):
 try:
 y = json.loads(x)
 except json.JSONDecodeError:
@@ -1177,7 +1298,9 @@ def clean(self):
 """
 logger.info("Beginning cleanup of AWS resources...")

- batch_env = AwsBatchEnv(self.job_identifier,
self.cfg["aws"], self.boto3_session) + batch_env = AwsBatchEnv( + self.job_identifier, self.cfg["aws"], self.boto3_session + ) batch_env.clean() def upload_batch_files_to_cloud(self, tmppath): @@ -1214,7 +1337,9 @@ def start_batch_job(self, batch_info): ) # Define the batch environment - batch_env = AwsBatchEnv(self.job_identifier, self.cfg["aws"], self.boto3_session) + batch_env = AwsBatchEnv( + self.job_identifier, self.cfg["aws"], self.boto3_session + ) logger.info( "Launching Batch environment - (resource configs will not be updated on subsequent executions, but new job revisions will be created):" # noqa 501 ) @@ -1246,14 +1371,18 @@ def start_batch_job(self, batch_info): # Monitor job status n_succeeded_last_time = 0 - with tqdm.tqdm(desc="Running Simulations", total=self.batch_array_size) as progress_bar: + with tqdm.tqdm( + desc="Running Simulations", total=self.batch_array_size + ) as progress_bar: job_status = None while job_status not in ("SUCCEEDED", "FAILED"): time.sleep(10) job_desc_resp = batch_env.batch.describe_jobs(jobs=[job_info["jobId"]]) job_status = job_desc_resp["jobs"][0]["status"] - jobs_resp = batch_env.batch.list_jobs(arrayJobId=job_info["jobId"], jobStatus="SUCCEEDED") + jobs_resp = batch_env.batch.list_jobs( + arrayJobId=job_info["jobId"], jobStatus="SUCCEEDED" + ) n_succeeded = len(jobs_resp["jobSummaryList"]) next_token = jobs_resp.get("nextToken") while next_token is not None: @@ -1302,7 +1431,9 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): jobs_file_path = sim_dir.parent / "jobs.tar.gz" s3.download_file(bucket, f"{prefix}/jobs.tar.gz", str(jobs_file_path)) with tarfile.open(jobs_file_path, "r") as tar_f: - jobs_d = json.load(tar_f.extractfile(f"jobs/job{job_id:05d}.json"), encoding="utf-8") + jobs_d = json.load( + tar_f.extractfile(f"jobs/job{job_id:05d}.json"), encoding="utf-8" + ) logger.debug("Number of simulations = {}".format(len(jobs_d["batch"]))) logger.debug("Getting weather files") @@ -1310,7 +1441,9 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): os.makedirs(weather_dir, exist_ok=True) # Make a lookup of which parameter points to the weather file from options_lookup.tsv - with open(sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8") as f: + with open( + sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8" + ) as f: tsv_reader = csv.reader(f, delimiter="\t") next(tsv_reader) # skip headers param_name = None @@ -1322,7 +1455,9 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): raise RuntimeError( f"The epw files are specified in options_lookup.tsv under more than one parameter type: {param_name}, {row[0]}" ) # noqa: E501 - epw_filename = row[row_has_epw.index(True) + 2].split("=")[1].split("/")[-1] + epw_filename = ( + row[row_has_epw.index(True) + 2].split("=")[1].split("/")[-1] + ) param_name = row[0] option_name = row[1] epws_by_option[option_name] = epw_filename @@ -1349,7 +1484,9 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): logger.debug("Extracting {}".format(epw_filename)) f_out.write(gzip.decompress(f_gz.getvalue())) - cls.run_simulations(cfg, job_id, jobs_d, sim_dir, S3FileSystem(), f"{bucket}/{prefix}") + cls.run_simulations( + cfg, job_id, jobs_d, sim_dir, S3FileSystem(), f"{bucket}/{prefix}" + ) def get_fs(self): return S3FileSystem() @@ -1357,7 +1494,9 @@ def get_fs(self): def get_dask_client(self): dask_cfg = self.cfg["aws"]["dask"] - batch_env = AwsBatchEnv(self.job_identifier, self.cfg["aws"], self.boto3_session) + batch_env = 
AwsBatchEnv( + self.job_identifier, self.cfg["aws"], self.boto3_session + ) m = 1024 self.dask_cluster = FargateCluster( region_name=self.region, @@ -1391,7 +1530,9 @@ def process_results(self, *args, **kwargs): cfg = deepcopy(self.cfg) container_buildstock_dir = str(container_workpath / "buildstock") cfg["buildstock_directory"] = container_buildstock_dir - cfg["project_directory"] = str(pathlib.Path(self.project_dir).relative_to(self.buildstock_dir)) + cfg["project_directory"] = str( + pathlib.Path(self.project_dir).relative_to(self.buildstock_dir) + ) with open(tmppath / "project_config.yml", "w") as f: f.write(yaml.dump(cfg, Dumper=yaml.SafeDumper)) @@ -1415,7 +1556,10 @@ def process_results(self, *args, **kwargs): ["python3", "-m", "buildstockbatch.aws.aws", container_cfg_path], volumes={ tmpdir: {"bind": str(container_workpath), "mode": "rw"}, - self.buildstock_dir: {"bind": container_buildstock_dir, "mode": "ro"}, + self.buildstock_dir: { + "bind": container_buildstock_dir, + "mode": "ro", + }, }, environment=env, name="bsb_post", diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index 7ecbf097..787edbff 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -67,7 +67,9 @@ def role_stitcher( p_counter = p_counter + 1 for managed_policy_arn in managed_policie_arns: - response = self.iam.attach_role_policy(PolicyArn=managed_policy_arn, RoleName=role_name) + response = self.iam.attach_role_policy( + PolicyArn=managed_policy_arn, RoleName=role_name + ) logger.info(f"Role {role_name} created") @@ -98,7 +100,9 @@ def delete_role(self, role_name): response = self.iam.list_attached_role_policies(RoleName=role_name) for policy in response["AttachedPolicies"]: - self.iam.detach_role_policy(RoleName=role_name, PolicyArn=policy["PolicyArn"]) + self.iam.detach_role_policy( + RoleName=role_name, PolicyArn=policy["PolicyArn"] + ) logger.info(f"Policies detached from role {role_name}.") @@ -116,13 +120,17 @@ def delete_instance_profile(self, instance_profile_name): logger.info(f"Instance profile {instance_profile_name} deleted.") except Exception as e: if "NoSuchEntity" in str(e): - logger.info(f"Instance profile {instance_profile_name} missing, skipping...") + logger.info( + f"Instance profile {instance_profile_name} missing, skipping..." + ) else: raise def remove_role_from_instance_profile(self, instance_profile_name): try: - response = self.iam.get_instance_profile(InstanceProfileName=instance_profile_name) + response = self.iam.get_instance_profile( + InstanceProfileName=instance_profile_name + ) for role in response["InstanceProfile"]["Roles"]: response = self.iam.remove_role_from_instance_profile( @@ -131,7 +139,9 @@ def remove_role_from_instance_profile(self, instance_profile_name): logger.info(f"Roles removed from instance profile {instance_profile_name}") except Exception as e: if "NoSuchEntity" in str(e): - logger.info(f"Instance profile {instance_profile_name} does not exist. Skipping...") + logger.info( + f"Instance profile {instance_profile_name} does not exist. Skipping..." 
+ ) else: raise @@ -146,7 +156,11 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.iam = self.iam_helper.iam self.s3 = self.session.client("s3", config=boto_client_config) self.job_identifier = job_identifier - self.account = self.session.client("sts", config=boto_client_config).get_caller_identity().get("Account") + self.account = ( + self.session.client("sts", config=boto_client_config) + .get_caller_identity() + .get("Account") + ) self.region = aws_config["region"] self.operator_email = aws_config["notifications_email"] @@ -154,8 +168,12 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.s3_bucket = aws_config["s3"]["bucket"] self.s3_bucket_arn = f"arn:aws:s3:::{self.s3_bucket}" self.s3_bucket_prefix = aws_config["s3"]["prefix"].rstrip("/") - self.s3_lambda_code_emr_cluster_key = f"{self.s3_bucket_prefix}/lambda_functions/emr_function.py.zip" - self.s3_lambda_emr_config_key = f"{self.s3_bucket_prefix}/lambda_functions/emr_config.json" + self.s3_lambda_code_emr_cluster_key = ( + f"{self.s3_bucket_prefix}/lambda_functions/emr_function.py.zip" + ) + self.s3_lambda_emr_config_key = ( + f"{self.s3_bucket_prefix}/lambda_functions/emr_config.json" + ) self.s3_emr_folder_name = "emr" # Batch @@ -164,7 +182,9 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.batch_job_queue_name = f"job_queue_{self.job_identifier}" self.batch_service_role_name = f"batch_service_role_{self.job_identifier}" self.batch_instance_role_name = f"batch_instance_role_{self.job_identifier}" - self.batch_instance_profile_name = f"batch_instance_profile_{self.job_identifier}" + self.batch_instance_profile_name = ( + f"batch_instance_profile_{self.job_identifier}" + ) self.batch_spot_service_role_name = f"spot_fleet_role_{self.job_identifier}" self.batch_ecs_task_role_name = f"ecs_task_role_{self.job_identifier}" self.batch_task_policy_name = f"ecs_task_policy_{self.job_identifier}" diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index b6ded0a6..edafa73f 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -62,26 +62,37 @@ def __init__(self, project_filename): self.buildstock_dir = self.cfg["buildstock_directory"] if not os.path.isdir(self.buildstock_dir): - raise FileNotFoundError(f"buildstock_directory = {self.buildstock_dir} is not a directory.") - self.project_dir = os.path.join(self.buildstock_dir, self.cfg["project_directory"]) + raise FileNotFoundError( + f"buildstock_directory = {self.buildstock_dir} is not a directory." + ) + self.project_dir = os.path.join( + self.buildstock_dir, self.cfg["project_directory"] + ) if not os.path.isdir(self.project_dir): - raise FileNotFoundError(f"project_directory = {self.project_dir} is not a directory.") + raise FileNotFoundError( + f"project_directory = {self.project_dir} is not a directory." + ) # Load in OS_VERSION and OS_SHA arguments if they exist in the YAML, # otherwise use defaults specified here. 
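 # A minimal illustration (hypothetical values, not part of this patch):
 # the corresponding project YAML keys would look like
 #     os_version: 3.5.0
 #     os_sha: f953b6fcaf
 # and must match the OpenStudio build actually installed.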
self.os_version = self.cfg.get("os_version", self.DEFAULT_OS_VERSION) self.os_sha = self.cfg.get("os_sha", self.DEFAULT_OS_SHA) - logger.debug(f"Using OpenStudio version: {self.os_version} with SHA: {self.os_sha}") + logger.debug( + f"Using OpenStudio version: {self.os_version} with SHA: {self.os_sha}" + ) @staticmethod def get_sampler_class(sampler_name): - sampler_class_name = "".join(x.capitalize() for x in sampler_name.strip().split("_")) + "Sampler" + sampler_class_name = ( + "".join(x.capitalize() for x in sampler_name.strip().split("_")) + "Sampler" + ) return getattr(sampler, sampler_class_name) @staticmethod def get_workflow_generator_class(workflow_generator_name): workflow_generator_class_name = ( - "".join(x.capitalize() for x in workflow_generator_name.strip().split("_")) + "WorkflowGenerator" + "".join(x.capitalize() for x in workflow_generator_name.strip().split("_")) + + "WorkflowGenerator" ) return getattr(workflow_generator, workflow_generator_class_name) @@ -114,7 +125,9 @@ def _get_weather_files(self): f.write(chunk) f.seek(0) with zipfile.ZipFile(f, "r") as zf: - logger.debug("Extracting weather files to: {}".format(self.weather_dir)) + logger.debug( + "Extracting weather files to: {}".format(self.weather_dir) + ) zf.extractall(self.weather_dir) @property @@ -136,8 +149,12 @@ def skip_baseline_sims(self): @classmethod def get_reporting_measures(cls, cfg): - WorkflowGenerator = cls.get_workflow_generator_class(cfg["workflow_generator"]["type"]) - wg = WorkflowGenerator(cfg, 1) # Number of datapoints doesn't really matter here + WorkflowGenerator = cls.get_workflow_generator_class( + cfg["workflow_generator"]["type"] + ) + wg = WorkflowGenerator( + cfg, 1 + ) # Number of datapoints doesn't really matter here return wg.reporting_measures() def run_batch(self): @@ -145,7 +162,9 @@ def run_batch(self): @classmethod def create_osw(cls, cfg, n_datapoints, *args, **kwargs): - WorkflowGenerator = cls.get_workflow_generator_class(cfg["workflow_generator"]["type"]) + WorkflowGenerator = cls.get_workflow_generator_class( + cfg["workflow_generator"]["type"] + ) osw_generator = WorkflowGenerator(cfg, n_datapoints) return osw_generator.create_osw(*args, **kwargs) @@ -168,7 +187,9 @@ def make_sim_dir(building_id, upgrade_idx, base_dir, overwrite_existing=False): sim_dir, ) elif os.path.exists(os.path.join(sim_dir, "run", "failed.job")): - raise SimulationExists("{} exists and failed".format(sim_id), sim_id, sim_dir) + raise SimulationExists( + "{} exists and failed".format(sim_id), sim_id, sim_dir + ) else: shutil.rmtree(sim_dir) @@ -214,13 +235,21 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id): if os.path.isfile(timeseries_filepath): # Find the time columns present in the enduse_timeseries file possible_time_cols = ["time", "Time", "TimeDST", "TimeUTC"] - cols = read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist() + cols = read_csv( + timeseries_filepath, index_col=False, nrows=0 + ).columns.tolist() actual_time_cols = [c for c in cols if c in possible_time_cols] if not actual_time_cols: - logger.error(f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.") - raise RuntimeError(f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.") + logger.error( + f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}." + ) + raise RuntimeError( + f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}." 
+ ) - tsdf = read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows) + tsdf = read_csv( + timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows + ) for col in actual_time_cols: tsdf[col] = tsdf[col].astype(pd.ArrowDtype(pa.timestamp("s"))) if os.path.isfile(schedules_filepath): @@ -290,7 +319,9 @@ def get_buildstock_dir(project_file, cfg): if os.path.isabs(buildstock_dir): return os.path.abspath(buildstock_dir) else: - return os.path.abspath(os.path.join(os.path.dirname(project_file), buildstock_dir)) + return os.path.abspath( + os.path.join(os.path.dirname(project_file), buildstock_dir) + ) @classmethod def validate_openstudio_path(cls, project_file): @@ -306,10 +337,14 @@ def validate_openstudio_path(cls, project_file): except FileNotFoundError: raise ValidationError(f"Cannot find openstudio at `{cls.openstudio_exe()}`") if proc_out.returncode != 0: - raise ValidationError(f"OpenStudio failed with the following error {proc_out.stderr}") + raise ValidationError( + f"OpenStudio failed with the following error {proc_out.stderr}" + ) actual_os_version, actual_os_sha = proc_out.stdout.strip().split("+") if os_version != actual_os_version: - raise ValidationError(f"OpenStudio version is {actual_os_version}, expected is {os_version}") + raise ValidationError( + f"OpenStudio version is {actual_os_version}, expected is {os_version}" + ) if os_sha != actual_os_sha: raise ValidationError( f"OpenStudio version is correct at {os_version}, but the shas don't match. " @@ -334,7 +369,9 @@ def validate_sampler(project_file): else: sample_file = os.path.abspath(sample_file) buildstock_df = read_csv(sample_file, dtype=str) - return BuildStockBatchBase.validate_buildstock_csv(project_file, buildstock_df) + return BuildStockBatchBase.validate_buildstock_csv( + project_file, buildstock_df + ) return True @staticmethod @@ -347,7 +384,9 @@ def validate_buildstock_csv(project_file, buildstock_df): if column in {"Building"}: continue if column not in param_option_dict: - errors.append(f"Column {column} in buildstock_csv is not available in options_lookup.tsv") + errors.append( + f"Column {column} in buildstock_csv is not available in options_lookup.tsv" + ) continue if "*" in param_option_dict[column]: continue # skip validating options when wildcard is present @@ -365,16 +404,22 @@ def validate_buildstock_csv(project_file, buildstock_df): @classmethod def validate_workflow_generator(cls, project_file): cfg = get_project_configuration(project_file) - WorkflowGenerator = cls.get_workflow_generator_class(cfg["workflow_generator"]["type"]) + WorkflowGenerator = cls.get_workflow_generator_class( + cfg["workflow_generator"]["type"] + ) return WorkflowGenerator.validate(cfg) @staticmethod def validate_project_schema(project_file): cfg = get_project_configuration(project_file) schema_version = cfg.get("schema_version") - version_schema = os.path.join(os.path.dirname(__file__), "schemas", f"v{schema_version}.yaml") + version_schema = os.path.join( + os.path.dirname(__file__), "schemas", f"v{schema_version}.yaml" + ) if not os.path.isfile(version_schema): - logger.error(f"Could not find validation schema for YAML version {schema_version}") + logger.error( + f"Could not find validation schema for YAML version {schema_version}" + ) raise FileNotFoundError(version_schema) schema = yamale.make_schema(version_schema) data = yamale.make_data(project_file, parser="ruamel") @@ -394,7 +439,9 @@ def validate_postprocessing_spec(project_file): partition_cols = cfg.get("postprocessing", 
{}).get("partition_columns", []) invalid_cols = [c for c in partition_cols if c not in param_option_dict.keys()] if invalid_cols: - raise ValidationError(f"The following partition columns are not valid: {invalid_cols}") + raise ValidationError( + f"The following partition columns are not valid: {invalid_cols}" + ) return True @staticmethod @@ -404,8 +451,12 @@ def validate_xor_nor_schema_keys(project_file): if int(major) >= 0: if int(minor) >= 0: # xor - if ("weather_files_url" in cfg.keys()) is ("weather_files_path" in cfg.keys()): - raise ValidationError("Both/neither weather_files_url and weather_files_path found in yaml root") + if ("weather_files_url" in cfg.keys()) is ( + "weather_files_path" in cfg.keys() + ): + raise ValidationError( + "Both/neither weather_files_url and weather_files_path found in yaml root" + ) return True @@ -420,7 +471,9 @@ def get_param_option_dict(project_file): try: with open(options_lookup_path, "r") as f: options = csv.DictReader(f, delimiter="\t") - invalid_options_lookup_str = "" # Holds option/parameter names with invalid characters + invalid_options_lookup_str = ( + "" # Holds option/parameter names with invalid characters + ) for row in options: for col in ["Parameter Name", "Option Name"]: invalid_chars = set(row[col]).intersection(set("|&()")) @@ -430,9 +483,16 @@ def get_param_option_dict(project_file): param_name, opt_name = row["Parameter Name"], row["Option Name"] param_option_dict[row["Parameter Name"]].add(row["Option Name"]) if opt_name == "*" and row["Measure Dir"]: - invalid_options_lookup_str += f"{param_name}: '*' cannot pass arguments to measure.\n" - if "*" in param_option_dict[param_name] and len(param_option_dict[param_name]) > 1: - invalid_options_lookup_str += f"{param_name}: '*' cannot be mixed with other options\n" + invalid_options_lookup_str += ( + f"{param_name}: '*' cannot pass arguments to measure.\n" + ) + if ( + "*" in param_option_dict[param_name] + and len(param_option_dict[param_name]) > 1 + ): + invalid_options_lookup_str += ( + f"{param_name}: '*' cannot be mixed with other options\n" + ) except FileNotFoundError as err: logger.error(f"Options lookup file not found at: '{options_lookup_path}'") raise err @@ -462,7 +522,9 @@ def get_errors(source_str, option_str): if not returns error message, close matches, and specifies where the error occurred (source_str) """ if "||" in option_str and "&&" in option_str: - invalid_option_spec_counter[(option_str, "has both || and && (not supported)")] += 1 + invalid_option_spec_counter[ + (option_str, "has both || and && (not supported)") + ] += 1 return "" if "||" in option_str or "&&" in option_str: @@ -470,7 +532,9 @@ def get_errors(source_str, option_str): errors = "" broken_options = option_str.split(splitter) if broken_options[-1] == "": - invalid_option_spec_counter[(option_str, "has trailing 'splitter'")] += 1 + invalid_option_spec_counter[ + (option_str, "has trailing 'splitter'") + ] += 1 return "" for broken_option_str in broken_options: new_source_str = source_str + f" in composite option '{option_str}'" @@ -492,15 +556,21 @@ def get_errors(source_str, option_str): return "" if parameter_name not in param_option_dict: - close_match = difflib.get_close_matches(parameter_name, param_option_dict.keys(), 1) + close_match = difflib.get_close_matches( + parameter_name, param_option_dict.keys(), 1 + ) close_match = close_match[0] if close_match else "" invalid_param_counter[(parameter_name, close_match)] += 1 return "" if not option_name or option_name not in 
param_option_dict[parameter_name]: - close_match = difflib.get_close_matches(option_name, list(param_option_dict[parameter_name]), 1) + close_match = difflib.get_close_matches( + option_name, list(param_option_dict[parameter_name]), 1 + ) close_match = close_match[0] if close_match else "" - invalid_option_counter_dict[parameter_name][(option_name, close_match)] += 1 + invalid_option_counter_dict[parameter_name][ + (option_name, close_match) + ] += 1 return "" return "" @@ -520,38 +590,62 @@ def get_all_option_str(source_str, inp): return [(source_str, inp)] elif type(inp) == list: return sum( - [get_all_option_str(source_str + f", in entry {count}", entry) for count, entry in enumerate(inp)], + [ + get_all_option_str(source_str + f", in entry {count}", entry) + for count, entry in enumerate(inp) + ], [], ) elif type(inp) == dict: if len(inp) > 1: - raise ValidationError(f"{source_str} the logic is malformed. Dict can't have more than one entry") + raise ValidationError( + f"{source_str} the logic is malformed. Dict can't have more than one entry" + ) source_str += f", in {list(inp.keys())[0]}" - return sum([get_all_option_str(source_str, i) for i in inp.values()], []) + return sum( + [get_all_option_str(source_str, i) for i in inp.values()], [] + ) # store all of the option_str in the project file as a list of (source_str, option_str) tuple source_option_str_list = [] if "upgrades" in cfg: for upgrade_count, upgrade in enumerate(cfg["upgrades"]): - upgrade_name = upgrade.get("upgrade_name", "") + f" (Upgrade Number: {upgrade_count})" + upgrade_name = ( + upgrade.get("upgrade_name", "") + + f" (Upgrade Number: {upgrade_count})" + ) source_str_upgrade = f"In upgrade '{upgrade_name}'" for option_count, option in enumerate(upgrade["options"]): - option_name = option.get("option", "") + f" (Option Number: {option_count})" - source_str_option = source_str_upgrade + f", in option '{option_name}'" - source_option_str_list.append((source_str_option, option.get("option"))) + option_name = ( + option.get("option", "") + f" (Option Number: {option_count})" + ) + source_str_option = ( + source_str_upgrade + f", in option '{option_name}'" + ) + source_option_str_list.append( + (source_str_option, option.get("option")) + ) if "apply_logic" in option: source_str_logic = source_str_option + ", in apply_logic" - source_option_str_list += get_all_option_str(source_str_logic, option["apply_logic"]) + source_option_str_list += get_all_option_str( + source_str_logic, option["apply_logic"] + ) if "package_apply_logic" in upgrade: source_str_package = source_str_upgrade + ", in package_apply_logic" - source_option_str_list += get_all_option_str(source_str_package, upgrade["package_apply_logic"]) + source_option_str_list += get_all_option_str( + source_str_package, upgrade["package_apply_logic"] + ) # TODO: refactor this into Sampler.validate_args if "downselect" in cfg or "downselect" in cfg.get("sampler", {}).get("type"): source_str = "In downselect" - logic = cfg["downselect"]["logic"] if "downselect" in cfg else cfg["sampler"]["args"]["logic"] + logic = ( + cfg["downselect"]["logic"] + if "downselect" in cfg + else cfg["sampler"]["args"]["logic"] + ) source_option_str_list += get_all_option_str(source_str, logic) # Gather all the errors in the option_str, if any @@ -560,7 +654,11 @@ def get_all_option_str(source_str, inp): error_message += get_errors(source_str, option_str) if error_message: - error_message = "Following option/parameter entries have problem:\n" + error_message + "\n" + error_message = ( + 
"Following option/parameter entries have problem:\n" + + error_message + + "\n" + ) if invalid_option_spec_counter: error_message += "* Following option/parameter entries have problem:\n" @@ -568,7 +666,9 @@ def get_all_option_str(source_str, inp): error_message += f" '{invalid_entry}' {error} - used '{count}' times\n" if invalid_param_counter: - error_message += "* Following parameters do not exist in options_lookup.tsv\n" + error_message += ( + "* Following parameters do not exist in options_lookup.tsv\n" + ) for (param, close_match), count in invalid_param_counter.items(): error_message += f" '{param}' - used '{count}' times." if close_match: @@ -640,7 +740,9 @@ def get_logic_problems(logic, parent=None): assert len(logic) == 1 for key, val in logic.items(): if key not in ["or", "and", "not"]: - raise ValidationError(f"Invalid key {key}. Only 'or', 'and' and 'not' is allowed.") + raise ValidationError( + f"Invalid key {key}. Only 'or', 'and' and 'not' is allowed." + ) return get_logic_problems(val, parent=key) elif isinstance(logic, str): if "&&" not in logic: @@ -648,19 +750,28 @@ def get_logic_problems(logic, parent=None): entries = logic.split("&&") return get_logic_problems(entries, parent="&&") else: - raise ValidationError(f"Invalid logic element {logic} with type {type(logic)}") + raise ValidationError( + f"Invalid logic element {logic} with type {type(logic)}" + ) all_problems = [] if "upgrades" in cfg: for upgrade_count, upgrade in enumerate(cfg["upgrades"]): upgrade_name = upgrade.get("upgrade_name", "") - source_str_upgrade = f"upgrade '{upgrade_name}' (Upgrade Number:{upgrade_count})" + source_str_upgrade = ( + f"upgrade '{upgrade_name}' (Upgrade Number:{upgrade_count})" + ) for option_count, option in enumerate(upgrade["options"]): option_name = option.get("option", "") - source_str_option = source_str_upgrade + f", option '{option_name}' (Option Number:{option_count})" + source_str_option = ( + source_str_upgrade + + f", option '{option_name}' (Option Number:{option_count})" + ) if "apply_logic" in option: if problems := get_logic_problems(option["apply_logic"]): - all_problems.append((source_str_option, problems, option["apply_logic"])) + all_problems.append( + (source_str_option, problems, option["apply_logic"]) + ) if "package_apply_logic" in upgrade: source_str_package = source_str_upgrade + ", in package_apply_logic" @@ -676,7 +787,11 @@ def get_logic_problems(logic, parent=None): # TODO: refactor this into Sampler.validate_args if "downselect" in cfg or "downselect" in cfg.get("sampler", {}).get("type"): source_str = "in downselect logic" - logic = cfg["downselect"]["logic"] if "downselect" in cfg else cfg["sampler"]["args"]["logic"] + logic = ( + cfg["downselect"]["logic"] + if "downselect" in cfg + else cfg["sampler"]["args"]["logic"] + ) if problems := get_logic_problems(logic): all_problems.append((source_str, problems, logic)) @@ -724,7 +839,10 @@ def get_errors(source_str, measure_str): """ if measure_str not in measure_dirs: closest = difflib.get_close_matches(measure_str, list(measure_dirs)) - return f"Measure directory {measure_str} not found. Closest matches: {closest}" f" {source_str}\n" + return ( + f"Measure directory {measure_str} not found. Closest matches: {closest}" + f" {source_str}\n" + ) return "" source_measures_str_list = [] @@ -741,7 +859,9 @@ def get_errors(source_str, measure_str): if not error_message: return True else: - error_message = "Measure name(s)/directory(ies) is(are) invalid. 
\n" + error_message + error_message = ( + "Measure name(s)/directory(ies) is(are) invalid. \n" + error_message + ) logger.error(error_message) raise ValidationError(error_message) @@ -784,7 +904,9 @@ def validate_resstock_or_comstock_version(project_file): """ cfg = get_project_configuration(project_file) - buildstock_rb = os.path.join(cfg["buildstock_directory"], "resources/buildstock.rb") + buildstock_rb = os.path.join( + cfg["buildstock_directory"], "resources/buildstock.rb" + ) if os.path.exists(buildstock_rb): with open(buildstock_rb, "r") as f: versions = dict( @@ -821,7 +943,9 @@ def validate_number_of_options(project_file): :rtype: bool """ cfg = get_project_configuration(project_file) - measure_xml_filename = os.path.join(cfg["buildstock_directory"], "measures", "ApplyUpgrade", "measure.xml") + measure_xml_filename = os.path.join( + cfg["buildstock_directory"], "measures", "ApplyUpgrade", "measure.xml" + ) if os.path.exists(measure_xml_filename): measure_xml_tree = objectify.parse(measure_xml_filename) measure_xml = measure_xml_tree.getroot() @@ -832,10 +956,14 @@ def validate_number_of_options(project_file): if m_option: option_number = int(m_option.group(1)) n_options_in_measure = max(option_number, n_options_in_measure) - m_costs = re.match(r"^option_(\d+)_cost_(\d+)_value", str(argument.name)) + m_costs = re.match( + r"^option_(\d+)_cost_(\d+)_value", str(argument.name) + ) if m_costs: cost_number = int(m_costs.group(2)) - n_costs_per_option_in_measure = max(cost_number, n_costs_per_option_in_measure) + n_costs_per_option_in_measure = max( + cost_number, n_costs_per_option_in_measure + ) n_options_in_cfg = 0 n_costs_in_cfg = 0 for upgrade in cfg.get("upgrades", []): @@ -914,14 +1042,22 @@ def process_results(self, skip_combine=False, use_dask_cluster=True): wfg_args = self.cfg["workflow_generator"].get("args", {}) if self.cfg["workflow_generator"]["type"] == "residential_hpxml": if "simulation_output_report" in wfg_args.keys(): - if "timeseries_frequency" in wfg_args["simulation_output_report"].keys(): - do_timeseries = wfg_args["simulation_output_report"]["timeseries_frequency"] != "none" + if ( + "timeseries_frequency" + in wfg_args["simulation_output_report"].keys() + ): + do_timeseries = ( + wfg_args["simulation_output_report"]["timeseries_frequency"] + != "none" + ) else: do_timeseries = "timeseries_csv_export" in wfg_args.keys() fs = self.get_fs() if not skip_combine: - postprocessing.combine_results(fs, self.results_dir, self.cfg, do_timeseries=do_timeseries) + postprocessing.combine_results( + fs, self.results_dir, self.cfg, do_timeseries=do_timeseries + ) aws_conf = self.cfg.get("postprocessing", {}).get("aws", {}) if "s3" in aws_conf or "aws" in self.cfg: diff --git a/buildstockbatch/cloud/docker_base.py b/buildstockbatch/cloud/docker_base.py index 3176a23a..613978a4 100644 --- a/buildstockbatch/cloud/docker_base.py +++ b/buildstockbatch/cloud/docker_base.py @@ -30,7 +30,13 @@ from buildstockbatch import postprocessing from buildstockbatch.base import BuildStockBatchBase -from buildstockbatch.utils import ContainerRuntime, calc_hash_for_file, compress_file, read_csv, get_bool_env_var +from buildstockbatch.utils import ( + ContainerRuntime, + calc_hash_for_file, + compress_file, + read_csv, + get_bool_env_var, +) logger = logging.getLogger(__name__) @@ -64,8 +70,12 @@ def __init__(self, project_filename): try: self.docker_client.ping() except: # noqa: E722 (allow bare except in this case because error can be a weird non-class Windows API error) - logger.error("The 
docker server did not respond, make sure Docker Desktop is started then retry.") - raise RuntimeError("The docker server did not respond, make sure Docker Desktop is started then retry.") + logger.error( + "The docker server did not respond, make sure Docker Desktop is started then retry." + ) + raise RuntimeError( + "The docker server did not respond, make sure Docker Desktop is started then retry." + ) @staticmethod def validate_project(project_file): @@ -192,10 +202,14 @@ def _prep_weather_files_for_batch(self, tmppath): self._get_weather_files() # Determine the unique weather files - epw_filenames = list(filter(lambda x: x.endswith(".epw"), os.listdir(self.weather_dir))) + epw_filenames = list( + filter(lambda x: x.endswith(".epw"), os.listdir(self.weather_dir)) + ) logger.info("Calculating hashes for weather files") epw_hashes = Parallel(n_jobs=-1, verbose=9)( - delayed(calc_hash_for_file)(pathlib.Path(self.weather_dir) / epw_filename) + delayed(calc_hash_for_file)( + pathlib.Path(self.weather_dir) / epw_filename + ) for epw_filename in epw_filenames ) # keep track of unique EPWs that may have dupes, and to compress and upload to cloud @@ -205,7 +219,9 @@ def _prep_weather_files_for_batch(self, tmppath): for epw_filename, epw_hash in zip(epw_filenames, epw_hashes): if bool(unique_epws[epw_hash]): # not the first file with this hash (it's a duplicate). add to ``epws_to_copy`` - epws_to_copy.append((unique_epws[epw_hash][0] + ".gz", epw_filename + ".gz")) + epws_to_copy.append( + (unique_epws[epw_hash][0] + ".gz", epw_filename + ".gz") + ) unique_epws[epw_hash].append(epw_filename) # Compress unique weather files and save to ``tmp_weather_out_path``, which will get @@ -230,7 +246,10 @@ def _prep_weather_files_for_batch(self, tmppath): total_count += count if count > 1: dupe_count += count - 1 - bytes = os.path.getsize(str(tmp_weather_out_path / epws[0]) + ".gz") * dupe_count + bytes = ( + os.path.getsize(str(tmp_weather_out_path / epws[0]) + ".gz") + * dupe_count + ) dupe_bytes = bytes * (count - 1) logger.info( f"Identified {dupe_count:,} duplicate weather files " @@ -257,7 +276,9 @@ def _prep_jobs_for_batch(self, tmppath): # Create list of (building ID, upgrade to apply) pairs for all simulations to run. baseline_sims = zip(building_ids, itertools.repeat(None)) - upgrade_sims = itertools.product(building_ids, range(len(self.cfg.get("upgrades", [])))) + upgrade_sims = itertools.product( + building_ids, range(len(self.cfg.get("upgrades", []))) + ) all_sims = list(itertools.chain(baseline_sims, upgrade_sims)) random.shuffle(all_sims) all_sims_iter = iter(all_sims) @@ -313,7 +334,9 @@ def _prep_jobs_for_batch(self, tmppath): "lib/housing_characteristics", ) - return DockerBatchBase.BatchInfo(n_sims=n_sims, n_sims_per_job=n_sims_per_job, job_count=job_count) + return DockerBatchBase.BatchInfo( + n_sims=n_sims, n_sims_per_job=n_sims_per_job, job_count=job_count + ) @classmethod def get_epws_to_download(cls, sim_dir, jobs_d): @@ -326,7 +349,9 @@ def get_epws_to_download(cls, sim_dir, jobs_d): :returns: Set of epw filenames needed for this batch of simulations. 
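
 Example (illustrative filename only): if every building in this batch maps
 to an option whose options_lookup.tsv row points at
 USA_CO_Denver.Intl.AP.725650_TMY3.epw, the returned set contains just that
 one filename.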
""" # Make a lookup of which parameter points to the weather file from options_lookup.tsv - with open(sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8") as f: + with open( + sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8" + ) as f: tsv_reader = csv.reader(f, delimiter="\t") next(tsv_reader) # skip headers param_name = None @@ -386,7 +411,9 @@ def run_simulations(cls, cfg, job_id, jobs_d, sim_dir, fs, output_path): sim_id = f"bldg{building_id:07d}up{upgrade_id:02d}" # Create OSW - osw = cls.create_osw(cfg, jobs_d["n_datapoints"], sim_id, building_id, upgrade_idx) + osw = cls.create_osw( + cfg, jobs_d["n_datapoints"], sim_id, building_id, upgrade_idx + ) with open(os.path.join(sim_dir, "in.osw"), "w") as f: json.dump(osw, f, indent=4) diff --git a/buildstockbatch/hpc.py b/buildstockbatch/hpc.py index ade93702..16efff9b 100644 --- a/buildstockbatch/hpc.py +++ b/buildstockbatch/hpc.py @@ -72,7 +72,9 @@ def __init__(self, project_filename): logger.debug("Output directory = {}".format(output_dir)) weather_dir = self.weather_dir # noqa E841 - self.apptainer_image = self.get_apptainer_image(self.cfg, self.os_version, self.os_sha) + self.apptainer_image = self.get_apptainer_image( + self.cfg, self.os_version, self.os_sha + ) @classmethod def validate_project(cls, project_file): @@ -95,7 +97,9 @@ def validate_apptainer_image_hpc(cls, project_file): @property def output_dir(self): - output_dir = path_rel_to_file(self.project_filename, self.cfg["output_directory"]) + output_dir = path_rel_to_file( + self.project_filename, self.cfg["output_directory"] + ) return output_dir @property @@ -114,11 +118,16 @@ def clear_and_copy_dir(src, dst): def get_apptainer_image(cls, cfg, os_version, os_sha): exts_to_try = ["Apptainer.sif", "Singularity.simg"] sys_img_dir = cfg.get("sys_image_dir", cls.DEFAULT_SYS_IMAGE_DIR) - image_paths = [pathlib.Path(sys_img_dir, f"OpenStudio-{os_version}.{os_sha}-{ext}") for ext in exts_to_try] + image_paths = [ + pathlib.Path(sys_img_dir, f"OpenStudio-{os_version}.{os_sha}-{ext}") + for ext in exts_to_try + ] for image_path in image_paths: if image_path.exists(): return str(image_path) - raise RuntimeError(f"Could not find apptainer image: {' or '.join(map(str, image_paths))}") + raise RuntimeError( + f"Could not find apptainer image: {' or '.join(map(str, image_paths))}" + ) @property def weather_dir(self): @@ -130,7 +139,12 @@ def weather_dir(self): def run_batch(self, sampling_only=False): # Create simulation_output dir - sim_out_ts_dir = pathlib.Path(self.output_dir) / "results" / "simulation_output" / "timeseries" + sim_out_ts_dir = ( + pathlib.Path(self.output_dir) + / "results" + / "simulation_output" + / "timeseries" + ) os.makedirs(sim_out_ts_dir, exist_ok=True) for i in range(0, len(self.cfg.get("upgrades", [])) + 1): os.makedirs(sim_out_ts_dir / f"up{i:02d}") @@ -140,7 +154,9 @@ def run_batch(self, sampling_only=False): destination_dir = os.path.dirname(self.sampler.csv_path) if os.path.exists(destination_dir): shutil.rmtree(destination_dir) - shutil.copytree(os.path.join(self.project_dir, "housing_characteristics"), destination_dir) + shutil.copytree( + os.path.join(self.project_dir, "housing_characteristics"), destination_dir + ) logger.debug("Housing characteristics copied.") # run sampling @@ -170,7 +186,9 @@ def run_batch(self, sampling_only=False): # larger than we need, now that we know n_sims n_sims_per_job = max(n_sims_per_job, self.MIN_SIMS_PER_JOB) - upgrade_sims = itertools.product(building_ids, 
range(len(self.cfg.get("upgrades", [])))) + upgrade_sims = itertools.product( + building_ids, range(len(self.cfg.get("upgrades", []))) + ) if not self.skip_baseline_sims: # create batches of simulations baseline_sims = zip(building_ids, itertools.repeat(None)) @@ -185,7 +203,9 @@ def run_batch(self, sampling_only=False): if not batch: break logger.info("Queueing job {} ({} simulations)".format(i, len(batch))) - job_json_filename = os.path.join(self.output_dir, "job{:03d}.json".format(i)) + job_json_filename = os.path.join( + self.output_dir, "job{:03d}.json".format(i) + ) with open(job_json_filename, "w") as f: json.dump( { @@ -213,7 +233,9 @@ def run_job_batch(self, job_array_number): pathlib.Path(self.buildstock_dir) / "measures", self.local_buildstock_dir / "measures", ) - if os.path.exists(pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures"): + if os.path.exists( + pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures" + ): self.clear_and_copy_dir( pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures", self.local_buildstock_dir / "resources/hpxml-measures", @@ -228,7 +250,9 @@ def run_job_batch(self, job_array_number): shutil.copy2(self.apptainer_image, self.local_apptainer_img) # Run the job batch as normal - job_json_filename = os.path.join(self.output_dir, "job{:03d}.json".format(job_array_number)) + job_json_filename = os.path.join( + self.output_dir, "job{:03d}.json".format(job_array_number) + ) with open(job_json_filename, "r") as f: args = json.load(f) @@ -246,12 +270,18 @@ def run_job_batch(self, job_array_number): df.to_csv(buildstock_csv_path, index=False) logger.debug(f"Buildstock.csv trimmed to {len(df)} rows.") - traceback_file_path = self.local_output_dir / "simulation_output" / f"traceback{job_array_number}.out" + traceback_file_path = ( + self.local_output_dir + / "simulation_output" + / f"traceback{job_array_number}.out" + ) @delayed def run_building_d(i, upgrade_idx): try: - return self.run_building(self.output_dir, self.cfg, args["n_datapoints"], i, upgrade_idx) + return self.run_building( + self.output_dir, self.cfg, args["n_datapoints"], i, upgrade_idx + ) except Exception: with open(traceback_file_path, "a") as f: txt = get_error_details() @@ -278,7 +308,9 @@ def run_building_d(i, upgrade_idx): # Compress simulation results if self.cfg.get("max_minutes_per_sim") is not None: time.sleep(60) # Allow results JSON to finish writing - simout_filename = lustre_sim_out_dir / f"simulations_job{job_array_number}.tar.gz" + simout_filename = ( + lustre_sim_out_dir / f"simulations_job{job_array_number}.tar.gz" + ) logger.info(f"Compressing simulation outputs to {simout_filename}") local_sim_out_dir = self.local_output_dir / "simulation_output" subprocess.run( @@ -325,17 +357,23 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): upgrade_id = 0 if upgrade_idx is None else upgrade_idx + 1 try: - sim_id, sim_dir = cls.make_sim_dir(i, upgrade_idx, os.path.join(cls.local_output_dir, "simulation_output")) + sim_id, sim_dir = cls.make_sim_dir( + i, upgrade_idx, os.path.join(cls.local_output_dir, "simulation_output") + ) except SimulationExists as ex: sim_dir = ex.sim_dir else: # Generate the osw for this simulation - osw = cls.create_osw(cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx) + osw = cls.create_osw( + cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx + ) with open(os.path.join(sim_dir, "in.osw"), "w") as f: json.dump(osw, f, indent=4) # Create a temporary directory for the simulation to 
use - with tempfile.TemporaryDirectory(dir=cls.local_scratch, prefix=f"{sim_id}_") as tmpdir: + with tempfile.TemporaryDirectory( + dir=cls.local_scratch, prefix=f"{sim_id}_" + ) as tmpdir: # Build the command to instantiate and configure the apptainer container the simulation is run inside local_resources_dir = cls.local_buildstock_dir / "resources" args = [ @@ -365,11 +403,19 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): for src in dirs_to_mount: container_mount = "/" + src.name args.extend(["-B", "{}:{}:ro".format(src, container_mount)]) - container_symlink = pathlib.Path("/var/simdata/openstudio", src.name) - runscript.append("ln -s {} {}".format(*map(shlex.quote, (container_mount, str(container_symlink))))) + container_symlink = pathlib.Path( + "/var/simdata/openstudio", src.name + ) + runscript.append( + "ln -s {} {}".format( + *map(shlex.quote, (container_mount, str(container_symlink))) + ) + ) if (cls.local_buildstock_dir / "resources" / "hpxml-measures").exists(): - runscript.append("ln -s /resources /var/simdata/openstudio/resources") + runscript.append( + "ln -s /resources /var/simdata/openstudio/resources" + ) src = cls.local_buildstock_dir / "resources" / "hpxml-measures" container_mount = "/resources/hpxml-measures" args.extend(["-B", f"{src}:{container_mount}:ro"]) @@ -422,10 +468,18 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): "timeout": msg, } out_osw.write(json.dumps(out_msg, indent=3)) - with open(pathlib.Path(sim_dir, "run", "out.osw"), "a") as run_log: - run_log.write(f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}") - with open(pathlib.Path(sim_dir, "run", "failed.job"), "w") as failed_job: - failed_job.write(f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}") + with open( + pathlib.Path(sim_dir, "run", "out.osw"), "a" + ) as run_log: + run_log.write( + f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}" + ) + with open( + pathlib.Path(sim_dir, "run", "failed.job"), "w" + ) as failed_job: + failed_job.write( + f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}" + ) # Wait for EnergyPlus to release file locks and data_point.zip to finish time.sleep(60) except subprocess.CalledProcessError: @@ -434,7 +488,9 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): # Clean up the symbolic links we created in the container for mount_dir in dirs_to_mount + [pathlib.Path(sim_dir, "lib")]: try: - pathlib.Path(sim_dir, os.path.basename(mount_dir)).unlink() + pathlib.Path( + sim_dir, os.path.basename(mount_dir) + ).unlink() except FileNotFoundError: pass @@ -448,7 +504,9 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): ) reporting_measures = cls.get_reporting_measures(cfg) - dpout = postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, i) + dpout = postprocessing.read_simulation_outputs( + fs, reporting_measures, sim_dir, upgrade_id, i + ) return dpout @staticmethod @@ -464,12 +522,16 @@ def queue_sampling( hipri: bool, ): cfg = get_project_configuration(project_filename) - hpc_sh = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"{cls.HPC_NAME}.sh") + hpc_sh = os.path.join( + os.path.dirname(os.path.abspath(__file__)), f"{cls.HPC_NAME}.sh" + ) assert os.path.exists(hpc_sh) out_dir = cfg["output_directory"] if os.path.exists(out_dir): raise FileExistsError( - "The output directory {} already exists. Please delete it or choose another.".format(out_dir) + "The output directory {} already exists. 
Please delete it or choose another.".format( + out_dir + ) ) logger.info("Creating output directory {}".format(out_dir)) os.makedirs(out_dir) @@ -520,7 +582,9 @@ def queue_jobs(self, array_ids=None, hipri=False): # Estimate the wall time in minutes minutes_per_sim = hpc_cfg["minutes_per_sim"] - walltime = math.ceil(math.ceil(n_sims_per_job / self.CORES_PER_NODE) * minutes_per_sim) + walltime = math.ceil( + math.ceil(n_sims_per_job / self.CORES_PER_NODE) * minutes_per_sim + ) # Queue up simulations here = os.path.dirname(os.path.abspath(__file__)) @@ -575,10 +639,18 @@ def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False) hpc_cfg = self.cfg[self.HPC_NAME] account = hpc_cfg["account"] walltime = hpc_cfg.get("postprocessing", {}).get("time", "1:30:00") - memory = hpc_cfg.get("postprocessing", {}).get("node_memory_mb", self.DEFAULT_POSTPROCESSING_NODE_MEMORY_MB) - n_procs = hpc_cfg.get("postprocessing", {}).get("n_procs", self.DEFAULT_POSTPROCESSING_N_PROCS) - n_workers = hpc_cfg.get("postprocessing", {}).get("n_workers", self.DEFAULT_POSTPROCESSING_N_WORKERS) - print(f"Submitting job to {n_workers} {memory}MB memory nodes using {n_procs} cores in each.") + memory = hpc_cfg.get("postprocessing", {}).get( + "node_memory_mb", self.DEFAULT_POSTPROCESSING_NODE_MEMORY_MB + ) + n_procs = hpc_cfg.get("postprocessing", {}).get( + "n_procs", self.DEFAULT_POSTPROCESSING_N_PROCS + ) + n_workers = hpc_cfg.get("postprocessing", {}).get( + "n_workers", self.DEFAULT_POSTPROCESSING_N_WORKERS + ) + print( + f"Submitting job to {n_workers} {memory}MB memory nodes using {n_procs} cores in each." + ) # Throw an error if the files already exist. if not upload_only: @@ -601,7 +673,8 @@ def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False) last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath)) shutil.move( filepath, - filepath.parent / f"{filepath.stem}_{last_mod_date:%Y%m%d%H%M}{filepath.suffix}", + filepath.parent + / f"{filepath.stem}_{last_mod_date:%Y%m%d%H%M}{filepath.suffix}", ) env_export = { @@ -657,14 +730,22 @@ def get_dask_client(self): # from dask.distributed import LocalCluster # cluster = LocalCluster(local_directory="/tmp/scratch/dask", n_workers=90, memory_limit="16GiB") # return Client(cluster) - return Client(scheduler_file=os.path.join(self.output_dir, "dask_scheduler.json")) + return Client( + scheduler_file=os.path.join(self.output_dir, "dask_scheduler.json") + ) def process_results(self, *args, **kwargs): # Check that all the jobs succeeded before proceeding failed_job_array_ids = self.get_failed_job_array_ids() if failed_job_array_ids: - logger.error("The following simulation jobs failed: {}".format(", ".join(map(str, failed_job_array_ids)))) - logger.error("Please inspect those jobs and fix any problems before resubmitting.") + logger.error( + "The following simulation jobs failed: {}".format( + ", ".join(map(str, failed_job_array_ids)) + ) + ) + logger.error( + "Please inspect those jobs and fix any problems before resubmitting." 
+ ) logger.critical("Postprocessing cancelled.") return False @@ -716,7 +797,8 @@ def rerun_failed_jobs(self, hipri=False): last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath)) shutil.move( filepath, - prev_failed_job_out_dir / f"{filepath.name}_{last_mod_date:%Y%m%d%H%M}", + prev_failed_job_out_dir + / f"{filepath.name}_{last_mod_date:%Y%m%d%H%M}", ) # Delete simulation results for jobs we're about to rerun @@ -752,7 +834,8 @@ def validate_output_directory_eagle(cls, project_file): output_dir = path_rel_to_file(project_file, cfg["output_directory"]) if not re.match(r"/(lustre/eaglefs/)?(scratch|projects)", output_dir): raise ValidationError( - f"`output_directory` must be in /scratch or /projects," f" `output_directory` = {output_dir}" + f"`output_directory` must be in /scratch or /projects," + f" `output_directory` = {output_dir}" ) @classmethod @@ -783,7 +866,8 @@ def validate_output_directory_kestrel(cls, project_file): output_dir = path_rel_to_file(project_file, cfg["output_directory"]) if not re.match(r"/(kfs\d/)?(scratch|projects)", output_dir): raise ValidationError( - f"`output_directory` must be in /scratch or /projects," f" `output_directory` = {output_dir}" + f"`output_directory` must be in /scratch or /projects," + f" `output_directory` = {output_dir}" ) @classmethod @@ -875,15 +959,21 @@ def user_cli(Batch: SlurmBatch, argv: list): help="Only validate the project YAML file and references. Nothing is executed", action="store_true", ) - group.add_argument("--samplingonly", help="Run the sampling only.", action="store_true") - group.add_argument("--rerun_failed", help="Rerun the failed jobs", action="store_true") + group.add_argument( + "--samplingonly", help="Run the sampling only.", action="store_true" + ) + group.add_argument( + "--rerun_failed", help="Rerun the failed jobs", action="store_true" + ) # parse CLI arguments args = parser.parse_args(argv) # load the yaml project file if not os.path.isfile(args.project_filename): - raise FileNotFoundError("The project file {} doesn't exist".format(args.project_filename)) + raise FileNotFoundError( + "The project file {} doesn't exist".format(args.project_filename) + ) project_filename = os.path.abspath(args.project_filename) # validate the project, and in case of the --validateonly flag return True if validation passes @@ -904,7 +994,9 @@ def user_cli(Batch: SlurmBatch, argv: list): # otherwise, queue up the whole buildstockbatch process # the main work of the first job is to run the sampling script ... 
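# --- A minimal sketch (not part of the patch) of the dispatch order in
# user_cli that this hunk sits in, assuming a SlurmBatch subclass passed
# in as `Batch`. Only names visible in the surrounding code are real; the
# control flow is paraphrased:
#
#   Batch.validate_project(project_filename)        # always runs first
#   if args.validateonly:
#       return True                                 # stop after validation
#   if args.rerun_failed:
#       Batch(project_filename).rerun_failed_jobs(hipri=args.hipri)
#   else:                                           # normal path: job 0 runs sampling
#       Batch.queue_sampling(project_filename, args.measuresonly,
#                            args.samplingonly, args.hipri)
# ---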
- Batch.queue_sampling(project_filename, args.measuresonly, args.samplingonly, args.hipri) + Batch.queue_sampling( + project_filename, args.measuresonly, args.samplingonly, args.hipri + ) @log_error_details() diff --git a/buildstockbatch/local.py b/buildstockbatch/local.py index 1a47f36b..70a72025 100644 --- a/buildstockbatch/local.py +++ b/buildstockbatch/local.py @@ -46,7 +46,9 @@ def __init__(self, project_filename): self._weather_dir = None # Create simulation_output dir - sim_out_ts_dir = os.path.join(self.results_dir, "simulation_output", "timeseries") + sim_out_ts_dir = os.path.join( + self.results_dir, "simulation_output", "timeseries" + ) os.makedirs(sim_out_ts_dir, exist_ok=True) for i in range(0, len(self.cfg.get("upgrades", [])) + 1): os.makedirs(os.path.join(sim_out_ts_dir, f"up{i:02d}"), exist_ok=True) @@ -55,18 +57,26 @@ def __init__(self, project_filename): # FIXME: Get working without docker if self.cfg.get("baseline", dict()).get("custom_gems", False): # TODO: Fix this stuff to work without docker - logger.info("Installing custom gems to docker volume: buildstockbatch_custom_gems") + logger.info( + "Installing custom gems to docker volume: buildstockbatch_custom_gems" + ) docker_client = docker.client.from_env() # Create a volume to store the custom gems - docker_client.volumes.create(name="buildstockbatch_custom_gems", driver="local") - simdata_vol = docker_client.volumes.create(name="buildstockbatch_simdata_temp", driver="local") + docker_client.volumes.create( + name="buildstockbatch_custom_gems", driver="local" + ) + simdata_vol = docker_client.volumes.create( + name="buildstockbatch_simdata_temp", driver="local" + ) # Define directories to be mounted in the container mnt_gem_dir = "/var/oscli/gems" # Install custom gems to be used in the docker container - local_gemfile_path = os.path.join(self.buildstock_dir, "resources", "Gemfile") + local_gemfile_path = os.path.join( + self.buildstock_dir, "resources", "Gemfile" + ) mnt_gemfile_path_orig = "/var/oscli/gemfile/Gemfile" docker_volume_mounts = { "buildstockbatch_custom_gems": {"bind": mnt_gem_dir, "mode": "rw"}, @@ -77,10 +87,14 @@ def __init__(self, project_filename): # Check that the Gemfile exists if not os.path.exists(local_gemfile_path): print(f"local_gemfile_path = {local_gemfile_path}") - raise AttributeError("baseline:custom_gems = True, but did not find Gemfile in /resources directory") + raise AttributeError( + "baseline:custom_gems = True, but did not find Gemfile in /resources directory" + ) # Make the buildstock/resources/.custom_gems dir to store logs - local_log_dir = os.path.join(self.buildstock_dir, "resources", ".custom_gems") + local_log_dir = os.path.join( + self.buildstock_dir, "resources", ".custom_gems" + ) if not os.path.exists(local_log_dir): os.makedirs(local_log_dir) @@ -95,7 +109,9 @@ def __init__(self, project_filename): volumes=docker_volume_mounts, name="install_custom_gems", ) - with open(os.path.join(local_log_dir, "bundle_install_output.log"), "wb") as f_out: + with open( + os.path.join(local_log_dir, "bundle_install_output.log"), "wb" + ) as f_out: f_out.write(container_output) # Report out custom gems loaded by OpenStudio CLI @@ -144,25 +160,33 @@ def run_building( upgrade_id = 0 if upgrade_idx is None else upgrade_idx + 1 try: - sim_id, sim_dir = cls.make_sim_dir(i, upgrade_idx, os.path.join(results_dir, "simulation_output")) + sim_id, sim_dir = cls.make_sim_dir( + i, upgrade_idx, os.path.join(results_dir, "simulation_output") + ) except SimulationExists: return sim_path = 
pathlib.Path(sim_dir) buildstock_path = pathlib.Path(buildstock_dir) # Make symlinks to project and buildstock stuff - (sim_path / "measures").symlink_to(buildstock_path / "measures", target_is_directory=True) + (sim_path / "measures").symlink_to( + buildstock_path / "measures", target_is_directory=True + ) (sim_path / "lib").symlink_to(buildstock_path / "lib", target_is_directory=True) (sim_path / "weather").symlink_to(weather_dir, target_is_directory=True) hpxml_measures_path = buildstock_path / "resources" / "hpxml-measures" if hpxml_measures_path.exists(): resources_path = sim_path / "resources" resources_path.mkdir() - (resources_path / "hpxml-measures").symlink_to(hpxml_measures_path, target_is_directory=True) + (resources_path / "hpxml-measures").symlink_to( + hpxml_measures_path, target_is_directory=True + ) else: resources_path = None - osw = cls.create_osw(cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx) + osw = cls.create_osw( + cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx + ) with open(sim_path / "in.osw", "w") as f: json.dump(osw, f, indent=4) @@ -252,7 +276,9 @@ def run_building( # Read data_point_out.json reporting_measures = cls.get_reporting_measures(cfg) - dpout = postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, i) + dpout = postprocessing.read_simulation_outputs( + fs, reporting_measures, sim_dir, upgrade_id, i + ) return dpout def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): @@ -291,7 +317,9 @@ def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): ) upgrade_sims = [] for i in range(len(self.cfg.get("upgrades", []))): - upgrade_sims.append(map(functools.partial(run_building_d, upgrade_idx=i), building_ids)) + upgrade_sims.append( + map(functools.partial(run_building_d, upgrade_idx=i), building_ids) + ) if not self.skip_baseline_sims: baseline_sims = map(run_building_d, building_ids) all_sims = itertools.chain(baseline_sims, *upgrade_sims) @@ -325,14 +353,18 @@ def output_dir(self): @property def results_dir(self): - results_dir = self.cfg.get("output_directory", os.path.join(self.project_dir, "localResults")) + results_dir = self.cfg.get( + "output_directory", os.path.join(self.project_dir, "localResults") + ) results_dir = self.path_rel_to_projectfile(results_dir) if not os.path.isdir(results_dir): os.makedirs(results_dir) return results_dir def get_dask_client(self): - cluster = LocalCluster(local_directory=os.path.join(self.results_dir, "dask-tmp")) + cluster = LocalCluster( + local_directory=os.path.join(self.results_dir, "dask-tmp") + ) return Client(cluster) @@ -393,7 +425,8 @@ def main(): ) group.add_argument( "--uploadonly", - help="Only upload to S3, useful when postprocessing is already done. Ignores the " "upload flag in yaml", + help="Only upload to S3, useful when postprocessing is already done. Ignores the " + "upload flag in yaml", action="store_true", ) group.add_argument( @@ -401,10 +434,14 @@ def main(): help="Only validate the project YAML file and references. 
Nothing is executed", action="store_true", ) - group.add_argument("--samplingonly", help="Run the sampling only.", action="store_true") + group.add_argument( + "--samplingonly", help="Run the sampling only.", action="store_true" + ) args = parser.parse_args() if not os.path.isfile(args.project_filename): - raise FileNotFoundError(f"The project file {args.project_filename} doesn't exist") + raise FileNotFoundError( + f"The project file {args.project_filename} doesn't exist" + ) # Validate the project, and in case of the --validateonly flag return True if validation passes LocalBatch.validate_project(args.project_filename) diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py index 0937185c..8138cee9 100644 --- a/buildstockbatch/postprocessing.py +++ b/buildstockbatch/postprocessing.py @@ -134,7 +134,9 @@ def read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, buildin :return: dpout [dict] """ - dpout = read_data_point_out_json(fs, reporting_measures, f"{sim_dir}/run/data_point_out.json") + dpout = read_data_point_out_json( + fs, reporting_measures, f"{sim_dir}/run/data_point_out.json" + ) if dpout is None: dpout = {} else: @@ -166,10 +168,15 @@ def clean_up_results_df(df, cfg, keep_upgrade_id=False): del results_df[col] for col in ("started_at", "completed_at"): if col in results_df.columns: - results_df[col] = pd.to_datetime(results_df[col], format="%Y%m%dT%H%M%SZ").astype( - pd.ArrowDtype(pa.timestamp("s")) - ) - reference_scenarios = dict([(i, x.get("reference_scenario")) for i, x in enumerate(cfg.get("upgrades", []), 1)]) + results_df[col] = pd.to_datetime( + results_df[col], format="%Y%m%dT%H%M%SZ" + ).astype(pd.ArrowDtype(pa.timestamp("s"))) + reference_scenarios = dict( + [ + (i, x.get("reference_scenario")) + for i, x in enumerate(cfg.get("upgrades", []), 1) + ] + ) results_df["apply_upgrade.reference_scenario"] = ( results_df["upgrade"].map(reference_scenarios).fillna("").astype(str) ) @@ -189,10 +196,26 @@ def clean_up_results_df(df, cfg, keep_upgrade_id=False): if "job_id" in results_df.columns: first_few_cols.insert(2, "job_id") - build_existing_model_cols = sorted([col for col in results_df.columns if col.startswith("build_existing_model")]) - sim_output_report_cols = sorted([col for col in results_df.columns if col.startswith("simulation_output_report")]) - report_sim_output_cols = sorted([col for col in results_df.columns if col.startswith("report_simulation_output")]) - upgrade_costs_cols = sorted([col for col in results_df.columns if col.startswith("upgrade_costs")]) + build_existing_model_cols = sorted( + [col for col in results_df.columns if col.startswith("build_existing_model")] + ) + sim_output_report_cols = sorted( + [ + col + for col in results_df.columns + if col.startswith("simulation_output_report") + ] + ) + report_sim_output_cols = sorted( + [ + col + for col in results_df.columns + if col.startswith("report_simulation_output") + ] + ) + upgrade_costs_cols = sorted( + [col for col in results_df.columns if col.startswith("upgrade_costs")] + ) sorted_cols = ( first_few_cols + build_existing_model_cols @@ -259,7 +282,9 @@ def read_enduse_timeseries_parquet(fs, all_cols, src_path, bldg_id): return df -def concat_and_normalize(fs, all_cols, src_path, dst_path, partition_columns, indx, bldg_ids, partition_vals): +def concat_and_normalize( + fs, all_cols, src_path, dst_path, partition_columns, indx, bldg_ids, partition_vals +): dfs = [] for bldg_id in sorted(bldg_ids): df = read_enduse_timeseries_parquet(fs, all_cols, 
src_path, bldg_id) @@ -333,12 +358,22 @@ def get_partitioned_bldg_groups(partition_df, partition_columns, files_per_parti """ total_building = len(partition_df) if partition_columns: - bldg_id_list_df = partition_df.reset_index().groupby(partition_columns)["building_id"].apply(list) + bldg_id_list_df = ( + partition_df.reset_index() + .groupby(partition_columns)["building_id"] + .apply(list) + ) ngroups = len(bldg_id_list_df) bldg_id_list = bldg_id_list_df.sum() - nfiles_in_each_group = [nfiles for nfiles in bldg_id_list_df.map(lambda x: len(x))] - files_groups = [split_into_groups(n, files_per_partition) for n in nfiles_in_each_group] - flat_groups = [n for group in files_groups for n in group] # flatten list of list into a list (maintain order) + nfiles_in_each_group = [ + nfiles for nfiles in bldg_id_list_df.map(lambda x: len(x)) + ] + files_groups = [ + split_into_groups(n, files_per_partition) for n in nfiles_in_each_group + ] + flat_groups = [ + n for group in files_groups for n in group + ] # flatten list of list into a list (maintain order) else: # no partitioning by a column. Just put buildings into groups of files_per_partition ngroups = 1 @@ -378,7 +413,9 @@ def write_metadata_files(fs, parquet_root_dir, partition_columns): concat_files = fs.glob(glob_str) logger.info(f"Gathered {len(concat_files)} files. Now writing _metadata") parquet_root_dir = Path(parquet_root_dir).as_posix() - create_metadata_file(concat_files, root_dir=parquet_root_dir, engine="pyarrow", fs=fs) + create_metadata_file( + concat_files, root_dir=parquet_root_dir, engine="pyarrow", fs=fs + ) logger.info(f"_metadata file written to {parquet_root_dir}") @@ -412,7 +449,9 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): if not results_json_files: raise ValueError("No simulation results found to post-process.") - logger.info("Collecting all the columns and datatypes in results_job*.json.gz parquet files.") + logger.info( + "Collecting all the columns and datatypes in results_job*.json.gz parquet files." + ) all_schema_dict = ( db.from_sequence(results_json_files) .map(partial(get_schema_dict, fs)) @@ -421,10 +460,13 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ) logger.info(f"Got {len(all_schema_dict)} columns") all_results_cols = list(all_schema_dict.keys()) - all_schema_dict = {to_camelcase(key): value for key, value in all_schema_dict.items()} + all_schema_dict = { + to_camelcase(key): value for key, value in all_schema_dict.items() + } logger.info(f"Got this schema: {all_schema_dict}\n") delayed_results_dfs = [ - dask.delayed(partial(read_results_json, fs, all_cols=all_results_cols))(x) for x in results_json_files + dask.delayed(partial(read_results_json, fs, all_cols=all_results_cols))(x) + for x in results_json_files ] results_df = dd.from_delayed(delayed_results_dfs, verify_meta=False) @@ -437,15 +479,25 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ts_filenames = fs.ls(upgrade_folder) if ts_filenames: do_timeseries = True - logger.info(f"Found {len(ts_filenames)} files for upgrade {Path(upgrade_folder).name}.") + logger.info( + f"Found {len(ts_filenames)} files for upgrade {Path(upgrade_folder).name}." 
+ ) files_bag = db.from_sequence(ts_filenames, partition_size=100) - all_ts_cols |= files_bag.map(partial(get_cols, fs)).fold(lambda x, y: x.union(y)).compute() + all_ts_cols |= ( + files_bag.map(partial(get_cols, fs)) + .fold(lambda x, y: x.union(y)) + .compute() + ) logger.info("Collected all the columns") else: - logger.info(f"There are no timeseries files for upgrade {Path(upgrade_folder).name}.") + logger.info( + f"There are no timeseries files for upgrade {Path(upgrade_folder).name}." + ) # Sort the columns - all_ts_cols_sorted = ["building_id"] + sorted(x for x in all_ts_cols if x.startswith("time")) + all_ts_cols_sorted = ["building_id"] + sorted( + x for x in all_ts_cols if x.startswith("time") + ) all_ts_cols.difference_update(all_ts_cols_sorted) all_ts_cols_sorted.extend(sorted(x for x in all_ts_cols if not x.endswith("]"))) all_ts_cols.difference_update(all_ts_cols_sorted) @@ -462,7 +514,9 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): df_partition_columns = [f"build_existing_model.{c}" for c in partition_columns] missing_cols = set(df_partition_columns) - set(all_schema_dict.keys()) if missing_cols: - raise ValueError(f"The following partitioning columns are not found in results.json: {missing_cols}") + raise ValueError( + f"The following partitioning columns are not found in results.json: {missing_cols}" + ) if partition_columns: logger.info(f"The timeseries files will be partitioned by {partition_columns}.") @@ -479,12 +533,16 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): schema = None partition_df = df[df_partition_columns].copy() partition_df.rename( - columns={df_c: c for df_c, c in zip(df_partition_columns, partition_columns)}, + columns={ + df_c: c for df_c, c in zip(df_partition_columns, partition_columns) + }, inplace=True, ) if upgrade_id > 0: # Remove building characteristics for upgrade scenarios. - cols_to_keep = list(filter(lambda x: not x.startswith("build_existing_model."), df.columns)) + cols_to_keep = list( + filter(lambda x: not x.startswith("build_existing_model."), df.columns) + ) df = df[cols_to_keep] null_cols = get_null_cols(df) # If certain column datatype is null (happens when it doesn't have any data), the datatype @@ -493,9 +551,13 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): logger.info(f"Upgrade {upgrade_id} has null cols: {null_cols}") schema, unresolved = correct_schema(all_schema_dict, df) if unresolved: - logger.info(f"The types for {unresolved} columns couldn't be determined.") + logger.info( + f"The types for {unresolved} columns couldn't be determined." + ) else: - logger.info("All columns were successfully assigned a datatype based on other upgrades.") + logger.info( + "All columns were successfully assigned a datatype based on other upgrades." 
+ ) # Write CSV csv_filename = f"{results_csvs_dir}/results_up{upgrade_id:02d}.csv.gz" logger.info(f"Writing {csv_filename}") @@ -512,37 +574,63 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): fs.makedirs(results_parquet_dir) parquet_filename = f"{results_parquet_dir}/results_up{upgrade_id:02d}.parquet" logger.info(f"Writing {parquet_filename}") - write_dataframe_as_parquet(df.reset_index(), fs, parquet_filename, schema=schema) + write_dataframe_as_parquet( + df.reset_index(), fs, parquet_filename, schema=schema + ) if do_timeseries: # Get the names of the timeseries file for each simulation in this upgrade ts_upgrade_path = f"{ts_in_dir}/up{upgrade_id:02d}" try: - ts_filenames = [ts_upgrade_path + ts_filename for ts_filename in fs.ls(ts_upgrade_path)] + ts_filenames = [ + ts_upgrade_path + ts_filename + for ts_filename in fs.ls(ts_upgrade_path) + ] except FileNotFoundError: # Upgrade directories may be empty if the upgrade is invalid. In some cloud # filesystems, there aren't actual directories, and trying to list a directory with # no files in it can fail. Just continue post-processing (other upgrades). - logger.warning(f"Listing '{ts_upgrade_path}' failed. Skipping this upgrade.") + logger.warning( + f"Listing '{ts_upgrade_path}' failed. Skipping this upgrade." + ) continue - ts_bldg_ids = [int(re.search(r"bldg(\d+).parquet", flname).group(1)) for flname in ts_filenames] + ts_bldg_ids = [ + int(re.search(r"bldg(\d+).parquet", flname).group(1)) + for flname in ts_filenames + ] if not ts_filenames: - logger.warning(f"There are no timeseries files for upgrade{upgrade_id}.") + logger.warning( + f"There are no timeseries files for upgrade{upgrade_id}." + ) continue - logger.info(f"There are {len(ts_filenames)} timeseries files for upgrade{upgrade_id}.") + logger.info( + f"There are {len(ts_filenames)} timeseries files for upgrade{upgrade_id}." + ) # Calculate the mean and estimate the total memory usage - read_ts_parquet = partial(read_enduse_timeseries_parquet, fs, all_ts_cols_sorted, ts_upgrade_path) - get_ts_mem_usage_d = dask.delayed(lambda x: read_ts_parquet(x).memory_usage(deep=True).sum()) + read_ts_parquet = partial( + read_enduse_timeseries_parquet, fs, all_ts_cols_sorted, ts_upgrade_path + ) + get_ts_mem_usage_d = dask.delayed( + lambda x: read_ts_parquet(x).memory_usage(deep=True).sum() + ) sample_size = min(len(ts_bldg_ids), 36 * 3) - mean_mem = np.mean(dask.compute(map(get_ts_mem_usage_d, random.sample(ts_bldg_ids, sample_size)))[0]) + mean_mem = np.mean( + dask.compute( + map(get_ts_mem_usage_d, random.sample(ts_bldg_ids, sample_size)) + )[0] + ) # Determine how many files should be in each partition and group the files parquet_memory = int( - cfg.get("eagle", {}).get("postprocessing", {}).get("parquet_memory_mb", MAX_PARQUET_MEMORY) + cfg.get("eagle", {}) + .get("postprocessing", {}) + .get("parquet_memory_mb", MAX_PARQUET_MEMORY) ) logger.info(f"Max parquet memory: {parquet_memory} MB") - max_files_per_partition = max(1, math.floor(parquet_memory / (mean_mem / 1e6))) + max_files_per_partition = max( + 1, math.floor(parquet_memory / (mean_mem / 1e6)) + ) partition_df = partition_df.loc[ts_bldg_ids].copy() logger.info(f"partition_df for the upgrade has {len(partition_df)} rows.") bldg_id_groups, bldg_id_list, ngroup = get_partitioned_bldg_groups( @@ -561,7 +649,9 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ts_out_loc = f"s3://{ts_dir}/upgrade={upgrade_id}" fs.makedirs(ts_out_loc) - logger.info(f"Created directory {ts_out_loc} for writing. 
Now concatenating ...") + logger.info( + f"Created directory {ts_out_loc} for writing. Now concatenating ..." + ) src_path = f"{ts_in_dir}/up{upgrade_id:02d}" concat_partial = dask.delayed( @@ -575,7 +665,11 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ) ) partition_vals_list = [ - (list(partition_df.loc[bldg_id_list[0]].values) if partition_columns else []) + ( + list(partition_df.loc[bldg_id_list[0]].values) + if partition_columns + else [] + ) for bldg_id_list in bldg_id_groups ] @@ -595,7 +689,9 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): f"{results_dir}/dask_combine_report{upgrade_id}.html", ) - logger.info(f"Finished combining and saving timeseries for upgrade{upgrade_id}.") + logger.info( + f"Finished combining and saving timeseries for upgrade{upgrade_id}." + ) logger.info("All aggregation completed. ") if do_timeseries: logger.info("Writing timeseries metadata files") @@ -621,7 +717,9 @@ def upload_results(aws_conf, output_dir, results_dir, buildstock_csv_filename): parquet_dir = Path(results_dir).joinpath("parquet") ts_dir = parquet_dir / "timeseries" if not parquet_dir.is_dir(): - logger.error(f"{parquet_dir} does not exist. Please make sure postprocessing has been done.") + logger.error( + f"{parquet_dir} does not exist. Please make sure postprocessing has been done." + ) raise FileNotFoundError(parquet_dir) all_files = [] @@ -633,7 +731,9 @@ def upload_results(aws_conf, output_dir, results_dir, buildstock_csv_filename): s3_prefix = aws_conf.get("s3", {}).get("prefix", "").rstrip("/") s3_bucket = aws_conf.get("s3", {}).get("bucket", None) if not (s3_prefix and s3_bucket): - logger.error("YAML file missing postprocessing:aws:s3:prefix and/or bucket entry.") + logger.error( + "YAML file missing postprocessing:aws:s3:prefix and/or bucket entry." + ) return s3_prefix_output = s3_prefix + "/" + output_folder_name + "/" @@ -641,11 +741,15 @@ def upload_results(aws_conf, output_dir, results_dir, buildstock_csv_filename): bucket = s3.Bucket(s3_bucket) n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix_output))) if n_existing_files > 0: - logger.error(f"There are already {n_existing_files} files in the s3 folder {s3_bucket}/{s3_prefix_output}.") + logger.error( + f"There are already {n_existing_files} files in the s3 folder {s3_bucket}/{s3_prefix_output}." + ) raise FileExistsError(f"s3://{s3_bucket}/{s3_prefix_output}") def upload_file(filepath, s3key=None): - full_path = filepath if filepath.is_absolute() else parquet_dir.joinpath(filepath) + full_path = ( + filepath if filepath.is_absolute() else parquet_dir.joinpath(filepath) + ) s3 = boto3.resource("s3") bucket = s3.Bucket(s3_bucket) if s3key is None: @@ -665,7 +769,9 @@ def upload_file(filepath, s3key=None): else: logger.warning(f"{buildstock_csv_filename} doesn't exist, can't upload.") dask.compute(tasks) - logger.info(f"Upload to S3 completed. The files are uploaded to: {s3_bucket}/{s3_prefix_output}") + logger.info( + f"Upload to S3 completed. 
The files are uploaded to: {s3_bucket}/{s3_prefix_output}" + ) return s3_bucket, s3_prefix_output @@ -674,7 +780,9 @@ def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix): region_name = aws_conf.get("region_name", "us-west-2") db_name = aws_conf.get("athena", {}).get("database_name", None) - role = aws_conf.get("athena", {}).get("glue_service_role", "service-role/AWSGlueServiceRole-default") + role = aws_conf.get("athena", {}).get( + "glue_service_role", "service-role/AWSGlueServiceRole-default" + ) max_crawling_time = aws_conf.get("athena", {}).get("max_crawling_time", 600) assert db_name, "athena:database_name not supplied" @@ -684,12 +792,20 @@ def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix): s3_path = f"s3://{s3_bucket}/{s3_prefix}" n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix))) if n_existing_files == 0: - logger.warning(f"There are no files in {s3_path}, Athena tables will not be created as intended") + logger.warning( + f"There are no files in {s3_path}, Athena tables will not be created as intended" + ) return glueClient = boto3.client("glue", region_name=region_name) crawlTarget = { - "S3Targets": [{"Path": s3_path, "Exclusions": ["**_metadata", "**_common_metadata"], "SampleSize": 2}] + "S3Targets": [ + { + "Path": s3_path, + "Exclusions": ["**_metadata", "**_common_metadata"], + "SampleSize": 2, + } + ] } crawler_name = db_name + "_" + tbl_prefix tbl_prefix = tbl_prefix + "_" @@ -708,18 +824,26 @@ def create_crawler(): except glueClient.exceptions.AlreadyExistsException: logger.info(f"Deleting existing crawler: {crawler_name}. And creating new one.") glueClient.delete_crawler(Name=crawler_name) - time.sleep(1) # A small delay after deleting is required to prevent AlreadyExistsException again + time.sleep( + 1 + ) # A small delay after deleting is required to prevent AlreadyExistsException again create_crawler() try: - existing_tables = [x["Name"] for x in glueClient.get_tables(DatabaseName=db_name)["TableList"]] + existing_tables = [ + x["Name"] for x in glueClient.get_tables(DatabaseName=db_name)["TableList"] + ] except glueClient.exceptions.EntityNotFoundException: existing_tables = [] to_be_deleted_tables = [x for x in existing_tables if x.startswith(tbl_prefix)] if to_be_deleted_tables: - logger.info(f"Deleting existing tables in db {db_name}: {to_be_deleted_tables}. And creating new ones.") - glueClient.batch_delete_table(DatabaseName=db_name, TablesToDelete=to_be_deleted_tables) + logger.info( + f"Deleting existing tables in db {db_name}: {to_be_deleted_tables}. And creating new ones." + ) + glueClient.batch_delete_table( + DatabaseName=db_name, TablesToDelete=to_be_deleted_tables + ) glueClient.start_crawler(Name=crawler_name) logger.info("Crawler started") @@ -739,7 +863,9 @@ def create_crawler(): logger.debug("Waiting for crawler to stop") else: assert crawler_state == "READY" - metrics = glueClient.get_crawler_metrics(CrawlerNameList=[crawler_name])["CrawlerMetricsList"][0] + metrics = glueClient.get_crawler_metrics(CrawlerNameList=[crawler_name])[ + "CrawlerMetricsList" + ][0] logger.info(f"Crawler has completed running. It is {crawler_state}.") logger.info( f"TablesCreated: {metrics['TablesCreated']} " @@ -753,5 +879,7 @@ def create_crawler(): try: glueClient.delete_crawler(Name=crawler_name) except botocore.exceptions.ClientError as error: - logger.error(f"Could not delete crawler {crawler_name}. Please delete it manually from the AWS console.") + logger.error( + f"Could not delete crawler {crawler_name}. 
Please delete it manually from the AWS console." + ) raise error diff --git a/buildstockbatch/sampler/base.py b/buildstockbatch/sampler/base.py index 4757a837..41585a0a 100644 --- a/buildstockbatch/sampler/base.py +++ b/buildstockbatch/sampler/base.py @@ -46,14 +46,20 @@ def __init__(self, parent): :param parent: The BuildStockBatchBase object that owns this sampler. """ - self.parent = weakref.ref(parent) # This removes circular references and allows garbage collection to work. + self.parent = weakref.ref( + parent + ) # This removes circular references and allows garbage collection to work. if self.container_runtime in ( ContainerRuntime.DOCKER, ContainerRuntime.LOCAL_OPENSTUDIO, ): - self.csv_path = os.path.join(self.project_dir, "housing_characteristics", "buildstock.csv") + self.csv_path = os.path.join( + self.project_dir, "housing_characteristics", "buildstock.csv" + ) elif self.container_runtime == ContainerRuntime.APPTAINER: - self.csv_path = os.path.join(self.parent().output_dir, "housing_characteristics", "buildstock.csv") + self.csv_path = os.path.join( + self.parent().output_dir, "housing_characteristics", "buildstock.csv" + ) else: self.csv_path = None diff --git a/buildstockbatch/sampler/commercial_sobol.py b/buildstockbatch/sampler/commercial_sobol.py index e7ac35cd..aff5563e 100644 --- a/buildstockbatch/sampler/commercial_sobol.py +++ b/buildstockbatch/sampler/commercial_sobol.py @@ -62,7 +62,10 @@ def validate_args(cls, project_filename, **kw): else: raise ValidationError(f"Unknown argument for sampler: {k}") if len(expected_args) > 0: - raise ValidationError("The following sampler arguments are required: " + ", ".join(expected_args)) + raise ValidationError( + "The following sampler arguments are required: " + + ", ".join(expected_args) + ) return True def run_sampling(self): @@ -84,11 +87,15 @@ def run_sampling(self): for tsv_file in os.listdir(self.buildstock_dir): if ".tsv" in tsv_file: tsv_df = read_csv(os.path.join(self.buildstock_dir, tsv_file), sep="\t") - dependency_columns = [item for item in list(tsv_df) if "Dependency=" in item] + dependency_columns = [ + item for item in list(tsv_df) if "Dependency=" in item + ] tsv_df[dependency_columns] = tsv_df[dependency_columns].astype("str") tsv_hash[tsv_file.replace(".tsv", "")] = tsv_df dependency_hash, attr_order = self._com_order_tsvs(tsv_hash) - sample_matrix = self._com_execute_sobol_sampling(attr_order.__len__(), sample_number) + sample_matrix = self._com_execute_sobol_sampling( + attr_order.__len__(), sample_number + ) csv_path = self.csv_path header = "Building," for item in attr_order: @@ -124,7 +131,9 @@ def _com_execute_sobol_sampling(n_dims, n_samples): :param n_samples: Number of samples to calculate :return: Pandas DataFrame object which contains the low discrepancy result of the sobol algorithm """ - return pd.DataFrame(i4_sobol_generate(n_dims, n_samples, 0)).replace(1.0, 0.999999) + return pd.DataFrame(i4_sobol_generate(n_dims, n_samples, 0)).replace( + 1.0, 0.999999 + ) @staticmethod def _com_order_tsvs(tsv_hash): @@ -137,7 +146,9 @@ def _com_order_tsvs(tsv_hash): dependency_hash = {} for attr in tsv_hash.keys(): dependency_hash[attr] = [ - item.replace("Dependency=", "") for item in list(tsv_hash[attr]) if "Dependency=" in item + item.replace("Dependency=", "") + for item in list(tsv_hash[attr]) + if "Dependency=" in item ] attr_order = [] for attr in dependency_hash.keys(): @@ -159,7 +170,9 @@ def _com_order_tsvs(tsv_hash): elif max_iterations > 0: max_iterations -= 1 else: - raise 
RuntimeError("Unable to resolve the dependency tree within the set iteration limit") + raise RuntimeError( + "Unable to resolve the dependency tree within the set iteration limit" + ) return dependency_hash, attr_order @staticmethod @@ -193,7 +206,8 @@ def _com_execute_sample( tsv_dist_val = sample_vector[attr_index] for dependency in sample_dependency_hash[attr]: tsv_lkup = tsv_lkup.loc[ - tsv_lkup.loc[:, "Dependency=" + dependency] == sample_dependency_hash[dependency] + tsv_lkup.loc[:, "Dependency=" + dependency] + == sample_dependency_hash[dependency] ] tsv_lkup = tsv_lkup.drop("Dependency=" + dependency, axis=1) if tsv_lkup.shape[0] == 0: @@ -204,9 +218,17 @@ def _com_execute_sample( ) return if tsv_lkup.shape[0] != 1: - raise RuntimeError("Unable to reduce tsv for {} to 1 row, index {}".format(attr, sample_index)) + raise RuntimeError( + "Unable to reduce tsv for {} to 1 row, index {}".format( + attr, sample_index + ) + ) tsv_lkup_cdf = tsv_lkup.values.cumsum() > tsv_dist_val - option_values = [item.replace("Option=", "") for item in list(tsv_lkup) if "Option=" in item] + option_values = [ + item.replace("Option=", "") + for item in list(tsv_lkup) + if "Option=" in item + ] attr_result = list(compress(option_values, tsv_lkup_cdf))[0] sample_dependency_hash[attr] = attr_result result_vector.append(attr_result) diff --git a/buildstockbatch/sampler/downselect.py b/buildstockbatch/sampler/downselect.py index 64375ab7..a634ea4e 100644 --- a/buildstockbatch/sampler/downselect.py +++ b/buildstockbatch/sampler/downselect.py @@ -43,7 +43,11 @@ def __init__(self, parent, n_datapoints, logic, resample=True, **kw): """ super().__init__(parent) self.validate_args( - self.parent().project_filename, n_datapoints=n_datapoints, logic=logic, resample=resample, **kw + self.parent().project_filename, + n_datapoints=n_datapoints, + logic=logic, + resample=resample, + **kw ) self.logic = logic self.resample = resample @@ -66,7 +70,10 @@ def validate_args(cls, project_filename, **kw): else: extra_kw[k] = v if len(expected_args) > 0: - raise ValidationError("The following sampler arguments are required: " + ", ".join(expected_args)) + raise ValidationError( + "The following sampler arguments are required: " + + ", ".join(expected_args) + ) cls.SUB_SAMPLER_CLASS.validate_args(project_filename, **extra_kw) return True @@ -99,21 +106,31 @@ def downselect_logic(cls, df, logic): def run_sampling(self): if self.resample: - logger.debug("Performing initial sampling to figure out number of samples for downselect") + logger.debug( + "Performing initial sampling to figure out number of samples for downselect" + ) n_samples_init = 350000 - init_sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples_init, **self.sub_kw) + init_sampler = self.SUB_SAMPLER_CLASS( + self.parent(), n_datapoints=n_samples_init, **self.sub_kw + ) buildstock_csv_filename = init_sampler.run_sampling() df = read_csv(buildstock_csv_filename, index_col=0, dtype=str) df_new = df[self.downselect_logic(df, self.logic)] downselected_n_samples_init = df_new.shape[0] - n_samples = math.ceil(self.n_datapoints * n_samples_init / downselected_n_samples_init) + n_samples = math.ceil( + self.n_datapoints * n_samples_init / downselected_n_samples_init + ) os.remove(buildstock_csv_filename) del init_sampler else: n_samples = self.n_datapoints - sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples, **self.sub_kw) + sampler = self.SUB_SAMPLER_CLASS( + self.parent(), n_datapoints=n_samples, **self.sub_kw + ) 
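# --- Worked example (illustrative only, not part of the patch): the
# resample branch above scales the sample count by the observed downselect
# survival rate. With the hard-coded n_samples_init = 350000, if 70000 of
# those initial rows satisfy the downselect logic and the user asked for
# n_datapoints = 10000:
#
#   import math
#   n_samples = math.ceil(10000 * 350000 / 70000)  # == 50000
#
# i.e. a 20% survival rate means sampling 5x the requested number of
# datapoints before downselecting again.
# ---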
buildstock_csv_filename = sampler.run_sampling() - with gzip.open(os.path.splitext(buildstock_csv_filename)[0] + "_orig.csv.gz", "wb") as f_out: + with gzip.open( + os.path.splitext(buildstock_csv_filename)[0] + "_orig.csv.gz", "wb" + ) as f_out: with open(buildstock_csv_filename, "rb") as f_in: shutil.copyfileobj(f_in, f_out) df = read_csv(buildstock_csv_filename, index_col=0, dtype="str") diff --git a/buildstockbatch/sampler/residential_quota.py b/buildstockbatch/sampler/residential_quota.py index 73d9b185..7eb270e8 100644 --- a/buildstockbatch/sampler/residential_quota.py +++ b/buildstockbatch/sampler/residential_quota.py @@ -50,7 +50,10 @@ def validate_args(cls, project_filename, **kw): else: raise ValidationError(f"Unknown argument for sampler: {k}") if len(expected_args) > 0: - raise ValidationError("The following sampler arguments are required: " + ", ".join(expected_args)) + raise ValidationError( + "The following sampler arguments are required: " + + ", ".join(expected_args) + ) return True def _run_sampling_docker(self): @@ -72,7 +75,9 @@ def _run_sampling_docker(self): "buildstock.csv", ], remove=True, - volumes={self.buildstock_dir: {"bind": "/var/simdata/openstudio", "mode": "rw"}}, + volumes={ + self.buildstock_dir: {"bind": "/var/simdata/openstudio", "mode": "rw"} + }, name="buildstock_sampling", **extra_kws, ) diff --git a/buildstockbatch/test/conftest.py b/buildstockbatch/test/conftest.py index 6a33e507..53fc02aa 100644 --- a/buildstockbatch/test/conftest.py +++ b/buildstockbatch/test/conftest.py @@ -11,7 +11,9 @@ def basic_residential_project_file(): with tempfile.TemporaryDirectory() as test_directory: - def _basic_residential_project_file(update_args={}, raw=False, hpc_name="eagle"): + def _basic_residential_project_file( + update_args={}, raw=False, hpc_name="eagle" + ): output_dir = "simulations_job0" if raw else "simulation_output" buildstock_directory = os.path.join(test_directory, "openstudio_buildstock") shutil.copytree( @@ -35,14 +37,22 @@ def _basic_residential_project_file(update_args={}, raw=False, hpc_name="eagle") ) # move the job*.json file to appropriate location - if os.path.exists(os.path.join(output_directory, "simulation_output", "job0.json")): + if os.path.exists( + os.path.join(output_directory, "simulation_output", "job0.json") + ): shutil.move( os.path.join(output_directory, "simulation_output", "job0.json"), - os.path.join(output_directory, "simulation_output", "..", "..", "job0.json"), + os.path.join( + output_directory, "simulation_output", "..", "..", "job0.json" + ), ) os.mkdir(os.path.join(output_directory, "housing_characteristics")) - os.mkdir(os.path.join(buildstock_directory, project_directory, "housing_characteristics")) + os.mkdir( + os.path.join( + buildstock_directory, project_directory, "housing_characteristics" + ) + ) weather_file_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), "test_inputs", diff --git a/buildstockbatch/test/shared_testing_stuff.py b/buildstockbatch/test/shared_testing_stuff.py index 93470776..479f439e 100644 --- a/buildstockbatch/test/shared_testing_stuff.py +++ b/buildstockbatch/test/shared_testing_stuff.py @@ -11,7 +11,9 @@ pathlib.Path(__file__).resolve().parent.parent.parent.parent / "resstock", ) ) -resstock_required = pytest.mark.skipif(not resstock_directory.exists(), reason="ResStock checkout is not found") +resstock_required = pytest.mark.skipif( + not resstock_directory.exists(), reason="ResStock checkout is not found" +) def check_docker_available(): @@ -23,4 +25,6 @@ def 
check_docker_available(): return True -docker_available = pytest.mark.skipif(not check_docker_available(), reason="Docker isn't running on this machine") +docker_available = pytest.mark.skipif( + not check_docker_available(), reason="Docker isn't running on this machine" +) diff --git a/buildstockbatch/test/test_base.py b/buildstockbatch/test/test_base.py index 5dbefcbd..52019ce9 100644 --- a/buildstockbatch/test/test_base.py +++ b/buildstockbatch/test/test_base.py @@ -45,16 +45,25 @@ def test_reference_scenario(basic_residential_project_file): with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( BuildStockBatchBase, "get_dask_client" - ) as get_dask_client_mock, patch.object(BuildStockBatchBase, "results_dir", results_dir): + ) as get_dask_client_mock, patch.object( + BuildStockBatchBase, "results_dir", results_dir + ): bsb = BuildStockBatchBase(project_filename) bsb.process_results() get_dask_client_mock.assert_called_once() # test results.csv files test_path = os.path.join(results_dir, "results_csvs") - test_csv = read_csv(os.path.join(test_path, "results_up01.csv.gz")).set_index("building_id").sort_index() + test_csv = ( + read_csv(os.path.join(test_path, "results_up01.csv.gz")) + .set_index("building_id") + .sort_index() + ) assert len(test_csv["apply_upgrade.reference_scenario"].unique()) == 1 - assert test_csv["apply_upgrade.reference_scenario"].iloc[0] == "example_reference_scenario" + assert ( + test_csv["apply_upgrade.reference_scenario"].iloc[0] + == "example_reference_scenario" + ) def test_downselect_integer_options(basic_residential_project_file, mocker): @@ -71,7 +80,9 @@ def test_downselect_integer_options(basic_residential_project_file, mocker): col_idx = row.index("Days Shifted") else: # Convert values from "Day1" to "1.10" so we hit the bug - row[col_idx] = "{0}.{0}0".format(re.search(r"Day(\d+)", row[col_idx]).group(1)) + row[col_idx] = "{0}.{0}0".format( + re.search(r"Day(\d+)", row[col_idx]).group(1) + ) valid_option_values.add(row[col_idx]) cf_out.writerow(row) @@ -89,7 +100,9 @@ def test_downselect_integer_options(basic_residential_project_file, mocker): ) mocker.patch.object(BuildStockBatchBase, "weather_dir", None) mocker.patch.object(BuildStockBatchBase, "results_dir", results_dir) - sampler_property_mock = mocker.patch.object(BuildStockBatchBase, "sampler", new_callable=PropertyMock) + sampler_property_mock = mocker.patch.object( + BuildStockBatchBase, "sampler", new_callable=PropertyMock + ) sampler_mock = mocker.MagicMock() sampler_property_mock.return_value = sampler_mock sampler_mock.run_sampling = MagicMock(return_value=buildstock_csv) @@ -130,7 +143,9 @@ def test_upload_files(mocker, basic_residential_project_file): } mocked_glueclient = MagicMock() mocked_glueclient.get_crawler = MagicMock( - return_value={"Crawler": {"State": "READY", "LastCrawl": {"Status": "SUCCEEDED"}}} + return_value={ + "Crawler": {"State": "READY", "LastCrawl": {"Status": "SUCCEEDED"}} + } ) mocked_boto3.client = MagicMock(return_value=mocked_glueclient) mocked_boto3.resource().Bucket().objects.filter.side_effect = [[], ["a", "b", "c"]] @@ -143,14 +158,19 @@ def test_upload_files(mocker, basic_residential_project_file): / "buildstock.csv" ) # noqa: E501 shutil.copy2( - Path(__file__).parent / "test_results" / "housing_characteristics" / "buildstock.csv", + Path(__file__).parent + / "test_results" + / "housing_characteristics" + / "buildstock.csv", buildstock_csv_path, ) mocker.patch.object(BuildStockBatchBase, "weather_dir", None) 
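# --- Note (illustrative, not part of the patch): weather_dir is exposed as
# a read-only property on the batch classes, so these mocker.patch.object
# calls swap the class attribute itself for the duration of the test;
# bsb.weather_dir then resolves to None and no real weather files are
# fetched. The equivalent context-manager form used elsewhere in this test
# module:
#
#   from unittest.mock import patch
#   with patch.object(BuildStockBatchBase, "weather_dir", None):
#       ...  # code here sees BuildStockBatchBase(...).weather_dir is None
# ---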
mocker.patch.object(BuildStockBatchBase, "output_dir", results_dir) get_dask_client_mock = mocker.patch.object(BuildStockBatchBase, "get_dask_client") mocker.patch.object(BuildStockBatchBase, "results_dir", results_dir) - mocker.patch.object(BuildStockBatchBase, "CONTAINER_RUNTIME", ContainerRuntime.LOCAL_OPENSTUDIO) + mocker.patch.object( + BuildStockBatchBase, "CONTAINER_RUNTIME", ContainerRuntime.LOCAL_OPENSTUDIO + ) bsb = BuildStockBatchBase(project_filename) bsb.process_results() @@ -173,13 +193,25 @@ def test_upload_files(mocker, basic_residential_project_file): if call_function == "create_crawler": crawler_para = call[2] # 2 is for the keyword arguments crawler_created = True - assert crawler_para["DatabaseName"] == upload_config["postprocessing"]["aws"]["athena"]["database_name"] - assert crawler_para["Role"] == upload_config["postprocessing"]["aws"]["athena"]["glue_service_role"] + assert ( + crawler_para["DatabaseName"] + == upload_config["postprocessing"]["aws"]["athena"]["database_name"] + ) + assert ( + crawler_para["Role"] + == upload_config["postprocessing"]["aws"]["athena"]["glue_service_role"] + ) assert crawler_para["TablePrefix"] == OUTPUT_FOLDER_NAME + "_" assert crawler_para["Name"] == db_name + "_" + OUTPUT_FOLDER_NAME assert ( crawler_para["Targets"]["S3Targets"][0]["Path"] - == "s3://" + s3_bucket + "/" + s3_prefix + "/" + OUTPUT_FOLDER_NAME + "/" + == "s3://" + + s3_bucket + + "/" + + s3_prefix + + "/" + + OUTPUT_FOLDER_NAME + + "/" ) if call_function == "start_crawler": assert crawler_created, "crawler attempted to start before creating" @@ -199,17 +231,23 @@ def test_upload_files(mocker, basic_residential_project_file): files_uploaded.remove((source_file_path, s3_file_path)) s3_file_path = s3_path + "upgrades/upgrade=1/results_up01.parquet" - source_file_path = os.path.join(source_path, "upgrades", "upgrade=1", "results_up01.parquet") + source_file_path = os.path.join( + source_path, "upgrades", "upgrade=1", "results_up01.parquet" + ) assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) s3_file_path = s3_path + "timeseries/upgrade=0/group0.parquet" - source_file_path = os.path.join(source_path, "timeseries", "upgrade=0", "group0.parquet") + source_file_path = os.path.join( + source_path, "timeseries", "upgrade=0", "group0.parquet" + ) assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) s3_file_path = s3_path + "timeseries/upgrade=1/group0.parquet" - source_file_path = os.path.join(source_path, "timeseries", "upgrade=1", "group0.parquet") + source_file_path = os.path.join( + source_path, "timeseries", "upgrade=1", "group0.parquet" + ) assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) @@ -228,7 +266,9 @@ def test_upload_files(mocker, basic_residential_project_file): assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) - assert len(files_uploaded) == 0, f"These files shouldn't have been uploaded: {files_uploaded}" + assert ( + len(files_uploaded) == 0 + ), f"These files shouldn't have been uploaded: {files_uploaded}" def test_write_parquet_no_index(): @@ -249,7 +289,9 @@ def test_skipping_baseline(basic_residential_project_file): ) sim_output_path = os.path.join(results_dir, "simulation_output") - shutil.rmtree(os.path.join(sim_output_path, "timeseries", "up00")) # remove timeseries results for baseline + shutil.rmtree( + 
os.path.join(sim_output_path, "timeseries", "up00") + ) # remove timeseries results for baseline # remove results.csv data for baseline from results_jobx.json.gz results_json_filename = os.path.join(sim_output_path, "results_job0.json.gz") @@ -269,15 +311,21 @@ def test_skipping_baseline(basic_residential_project_file): # run postprocessing with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( BuildStockBatchBase, "get_dask_client" - ) as get_dask_client_mock, patch.object(BuildStockBatchBase, "results_dir", results_dir): + ) as get_dask_client_mock, patch.object( + BuildStockBatchBase, "results_dir", results_dir + ): bsb = BuildStockBatchBase(project_filename) bsb.process_results() get_dask_client_mock.assert_called_once() - up00_parquet = os.path.join(results_dir, "parquet", "baseline", "results_up00.parquet") + up00_parquet = os.path.join( + results_dir, "parquet", "baseline", "results_up00.parquet" + ) assert not os.path.exists(up00_parquet) - up01_parquet = os.path.join(results_dir, "parquet", "upgrades", "upgrade=1", "results_up01.parquet") + up01_parquet = os.path.join( + results_dir, "parquet", "upgrades", "upgrade=1", "results_up01.parquet" + ) assert os.path.exists(up01_parquet) up00_csv_gz = os.path.join(results_dir, "results_csvs", "results_up00.csv.gz") @@ -300,7 +348,9 @@ def test_provide_buildstock_csv(basic_residential_project_file, mocker): sampling_output_csv = bsb.sampler.run_sampling() df2 = read_csv(sampling_output_csv, dtype=str) pd.testing.assert_frame_equal(df, df2) - assert (df["Geometry Shared Walls"] == "None").all() # Verify None is being read properly + assert ( + df["Geometry Shared Walls"] == "None" + ).all() # Verify None is being read properly # Test file missing with open(project_filename, "r") as f: cfg = yaml.safe_load(f) diff --git a/buildstockbatch/test/test_docker_base.py b/buildstockbatch/test/test_docker_base.py index 88e70f1a..4ac1a4fb 100644 --- a/buildstockbatch/test/test_docker_base.py +++ b/buildstockbatch/test/test_docker_base.py @@ -15,7 +15,9 @@ from buildstockbatch.utils import get_project_configuration here = os.path.dirname(os.path.abspath(__file__)) -resources_dir = os.path.join(here, "test_inputs", "test_openstudio_buildstock", "resources") +resources_dir = os.path.join( + here, "test_inputs", "test_openstudio_buildstock", "resources" +) @docker_available @@ -24,11 +26,15 @@ def test_run_batch_prep(basic_residential_project_file, mocker): project_filename, results_dir = basic_residential_project_file() mocker.patch.object(DockerBatchBase, "results_dir", results_dir) - sampler_property_mock = mocker.patch.object(DockerBatchBase, "sampler", new_callable=PropertyMock) + sampler_property_mock = mocker.patch.object( + DockerBatchBase, "sampler", new_callable=PropertyMock + ) sampler_mock = mocker.MagicMock() sampler_property_mock.return_value = sampler_mock # Hard-coded sampling output includes 5 buildings. - sampler_mock.run_sampling = MagicMock(return_value=os.path.join(resources_dir, "buildstock_good.csv")) + sampler_mock.run_sampling = MagicMock( + return_value=os.path.join(resources_dir, "buildstock_good.csv") + ) dbb = DockerBatchBase(project_filename) dbb.batch_array_size = 3 @@ -44,9 +50,9 @@ def test_run_batch_prep(basic_residential_project_file, mocker): # * "G2601210.epw" and "G2601390.epw" are dupes. 
One should be in # tmppath; one should be copied to the other according to ``epws_to_copy`` assert os.path.isfile(tmppath / "weather" / "G2500210.epw.gz") - assert os.path.isfile(tmppath / "weather" / "G2601210.epw.gz") or os.path.isfile( - tmppath / "weather" / "G2601390.epw.gz" - ) + assert os.path.isfile( + tmppath / "weather" / "G2601210.epw.gz" + ) or os.path.isfile(tmppath / "weather" / "G2601390.epw.gz") src, dest = epws_to_copy[0] assert src in ("G2601210.epw.gz", "G2601390.epw.gz") assert dest in ("G2601210.epw.gz", "G2601390.epw.gz") @@ -59,14 +65,22 @@ def test_run_batch_prep(basic_residential_project_file, mocker): assert batch_info.job_count == 3 jobs_file_path = tmppath / "jobs.tar.gz" with tarfile.open(jobs_file_path, "r") as tar_f: - all_job_files = ["jobs", "jobs/job00000.json", "jobs/job00001.json", "jobs/job00002.json"] + all_job_files = [ + "jobs", + "jobs/job00000.json", + "jobs/job00001.json", + "jobs/job00002.json", + ] assert tar_f.getnames() == all_job_files simulations = [] for filename in all_job_files[1:]: job = json.load(tar_f.extractfile(filename)) assert filename == f"jobs/job{job['job_num']:05d}.json" assert job["n_datapoints"] == 5 # Total number of buildings - assert len(job["batch"]) in (2, 4) # Number of simulations in this batch + assert len(job["batch"]) in ( + 2, + 4, + ) # Number of simulations in this batch simulations.extend(job["batch"]) # Check that all 10 expected simulations are present @@ -88,7 +102,10 @@ def test_get_epws_to_download(): os.makedirs(sim_dir / "lib" / "resources") os.makedirs(sim_dir / "lib" / "housing_characteristics") shutil.copy(options_file, sim_dir / "lib" / "resources") - shutil.copy(buildstock_file, sim_dir / "lib" / "housing_characteristics" / "buildstock.csv") + shutil.copy( + buildstock_file, + sim_dir / "lib" / "housing_characteristics" / "buildstock.csv", + ) jobs_d = { "job_num": 0, @@ -133,10 +150,15 @@ def test_run_simulations(basic_residential_project_file): bucket = temp_path / "bucket" os.makedirs(bucket / "test_prefix" / "results" / "simulation_output") - DockerBatchBase.run_simulations(cfg, 0, jobs_d, sim_dir, fs, f"{bucket}/test_prefix") + DockerBatchBase.run_simulations( + cfg, 0, jobs_d, sim_dir, fs, f"{bucket}/test_prefix" + ) output_dir = bucket / "test_prefix" / "results" / "simulation_output" - assert sorted(os.listdir(output_dir)) == ["results_job0.json.gz", "simulations_job0.tar.gz"] + assert sorted(os.listdir(output_dir)) == [ + "results_job0.json.gz", + "simulations_job0.tar.gz", + ] # Check that buildings 1 and 5 (specified in jobs_d) are in the results with gzip.open(output_dir / "results_job0.json.gz", "r") as f: diff --git a/buildstockbatch/test/test_hpc.py b/buildstockbatch/test/test_hpc.py index c57a2d88..e84ca70c 100644 --- a/buildstockbatch/test/test_hpc.py +++ b/buildstockbatch/test/test_hpc.py @@ -9,7 +9,13 @@ from unittest.mock import patch import gzip -from buildstockbatch.hpc import eagle_cli, kestrel_cli, EagleBatch, KestrelBatch, SlurmBatch # noqa: F401 +from buildstockbatch.hpc import ( + eagle_cli, + kestrel_cli, + EagleBatch, + KestrelBatch, + SlurmBatch, +) # noqa: F401 from buildstockbatch.base import BuildStockBatchBase from buildstockbatch.utils import get_project_configuration, read_csv @@ -19,10 +25,15 @@ @patch("buildstockbatch.hpc.subprocess") def test_hpc_run_building(mock_subprocess, monkeypatch, basic_residential_project_file): tar_filename = ( - pathlib.Path(__file__).resolve().parent / "test_results" / "simulation_output" / "simulations_job0.tar.gz" + 
pathlib.Path(__file__).resolve().parent + / "test_results" + / "simulation_output" + / "simulations_job0.tar.gz" ) # noqa E501 with tarfile.open(tar_filename, "r") as tarf: - osw_dict = json.loads(tarf.extractfile("up00/bldg0000001/in.osw").read().decode("utf-8")) + osw_dict = json.loads( + tarf.extractfile("up00/bldg0000001/in.osw").read().decode("utf-8") + ) project_filename, results_dir = basic_residential_project_file() tmp_path = pathlib.Path(results_dir).parent @@ -33,7 +44,9 @@ def test_hpc_run_building(mock_subprocess, monkeypatch, basic_residential_projec with patch.object(KestrelBatch, "weather_dir", None), patch.object( KestrelBatch, "create_osw", return_value=osw_dict - ), patch.object(KestrelBatch, "make_sim_dir", return_value=("bldg0000001up00", sim_path)), patch.object( + ), patch.object( + KestrelBatch, "make_sim_dir", return_value=("bldg0000001up00", sim_path) + ), patch.object( KestrelBatch, "local_scratch", tmp_path ): # Normal run @@ -113,11 +126,15 @@ def _test_env_vars_passed(mock_subprocess, hpc_name): @pytest.mark.parametrize("hpc_name", ["eagle", "kestrel"]) def test_user_cli(basic_residential_project_file, monkeypatch, mocker, hpc_name): mock_subprocess = mocker.patch("buildstockbatch.hpc.subprocess") - mock_validate_apptainer_image = mocker.patch("buildstockbatch.hpc.SlurmBatch.validate_apptainer_image_hpc") + mock_validate_apptainer_image = mocker.patch( + "buildstockbatch.hpc.SlurmBatch.validate_apptainer_image_hpc" + ) mock_validate_output_directory = mocker.patch( f"buildstockbatch.hpc.{hpc_name.capitalize()}Batch.validate_output_directory_{hpc_name}" ) - mock_validate_options = mocker.patch("buildstockbatch.base.BuildStockBatchBase.validate_options_lookup") + mock_validate_options = mocker.patch( + "buildstockbatch.base.BuildStockBatchBase.validate_options_lookup" + ) mock_validate_options.return_value = True mock_validate_output_directory.return_value = True @@ -189,11 +206,15 @@ def test_user_cli(basic_residential_project_file, monkeypatch, mocker, hpc_name) @pytest.mark.parametrize("hpc_name", ["eagle", "kestrel"]) -def test_qos_high_job_submit(basic_residential_project_file, monkeypatch, mocker, hpc_name): +def test_qos_high_job_submit( + basic_residential_project_file, monkeypatch, mocker, hpc_name +): mock_subprocess = mocker.patch("buildstockbatch.hpc.subprocess") mock_subprocess.run.return_value.stdout = "Submitted batch job 1\n" mock_subprocess.PIPE = None - mocker.patch.object(SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif") + mocker.patch.object( + SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif" + ) Batch = eval(f"{hpc_name.capitalize()}Batch") mocker.patch.object(SlurmBatch, "weather_dir", None) project_filename, results_dir = basic_residential_project_file(hpc_name=hpc_name) @@ -225,11 +246,15 @@ def test_qos_high_job_submit(basic_residential_project_file, monkeypatch, mocker @pytest.mark.parametrize("hpc_name", ["eagle", "kestrel"]) -def test_queue_jobs_minutes_per_sim(mocker, basic_residential_project_file, monkeypatch, hpc_name): +def test_queue_jobs_minutes_per_sim( + mocker, basic_residential_project_file, monkeypatch, hpc_name +): mock_subprocess = mocker.patch("buildstockbatch.hpc.subprocess") Batch = eval(f"{hpc_name.capitalize()}Batch") mocker.patch.object(Batch, "weather_dir", None) - mocker.patch.object(SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif") + mocker.patch.object( + SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif" + ) 
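# --- Worked example (illustrative only): the minutes_per_sim value this
# test exercises feeds the walltime estimate in queue_jobs, shown earlier
# in this series as
#
#   walltime = math.ceil(math.ceil(n_sims_per_job / CORES_PER_NODE) * minutes_per_sim)
#
# so, assuming a 36-core node and 500 simulations per job at 3 minutes
# per simulation:
#
#   import math
#   math.ceil(math.ceil(500 / 36) * 3)  # ceil(14 * 3) == 42 minute walltime
# ---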
mock_subprocess.run.return_value.stdout = "Submitted batch job 1\n" mock_subprocess.PIPE = None project_filename, results_dir = basic_residential_project_file( @@ -280,10 +305,14 @@ def test_run_building_process(mocker, basic_residential_project_file): with open(results_dir / "job001.json", "w") as f: json.dump(job_json, f) - sample_buildstock_csv = pd.DataFrame.from_records([{"Building": i, "Dummy Column": i * i} for i in range(10)]) + sample_buildstock_csv = pd.DataFrame.from_records( + [{"Building": i, "Dummy Column": i * i} for i in range(10)] + ) os.makedirs(results_dir / "housing_characteristics", exist_ok=True) os.makedirs(results_dir / "weather", exist_ok=True) - sample_buildstock_csv.to_csv(results_dir / "housing_characteristics" / "buildstock.csv", index=False) + sample_buildstock_csv.to_csv( + results_dir / "housing_characteristics" / "buildstock.csv", index=False + ) def sequential_parallel(**kwargs): kw2 = kwargs.copy() @@ -294,9 +323,15 @@ def sequential_parallel(**kwargs): rmtree_mock = mocker.patch("buildstockbatch.hpc.shutil.rmtree") mocker.patch("buildstockbatch.hpc.Parallel", sequential_parallel) mocker.patch("buildstockbatch.hpc.subprocess") - mocker.patch.object(SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif") - mocker.patch.object(KestrelBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir") - mocker.patch.object(KestrelBatch, "local_weather_dir", results_dir / "local_weather_dir") + mocker.patch.object( + SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif" + ) + mocker.patch.object( + KestrelBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir" + ) + mocker.patch.object( + KestrelBatch, "local_weather_dir", results_dir / "local_weather_dir" + ) mocker.patch.object(KestrelBatch, "local_output_dir", results_dir) mocker.patch.object( KestrelBatch, @@ -309,14 +344,20 @@ def sequential_parallel(**kwargs): def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=False): real_upgrade_idx = 0 if upgrade_idx is None else upgrade_idx + 1 sim_id = f"bldg{building_id:07d}up{real_upgrade_idx:02d}" - sim_dir = os.path.join(base_dir, f"up{real_upgrade_idx:02d}", f"bldg{building_id:07d}") + sim_dir = os.path.join( + base_dir, f"up{real_upgrade_idx:02d}", f"bldg{building_id:07d}" + ) return sim_id, sim_dir mocker.patch.object(KestrelBatch, "make_sim_dir", make_sim_dir_mock) - sampler_prop_mock = mocker.patch.object(KestrelBatch, "sampler", new_callable=mocker.PropertyMock) + sampler_prop_mock = mocker.patch.object( + KestrelBatch, "sampler", new_callable=mocker.PropertyMock + ) sampler_mock = mocker.MagicMock() sampler_prop_mock.return_value = sampler_mock - sampler_mock.csv_path = results_dir.parent / "housing_characteristic2" / "buildstock.csv" + sampler_mock.csv_path = ( + results_dir.parent / "housing_characteristic2" / "buildstock.csv" + ) sampler_mock.run_sampling = mocker.MagicMock(return_value="buildstock.csv") b = KestrelBatch(project_filename) @@ -329,11 +370,19 @@ def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=Fal rmtree_mock.assert_any_call(b.local_housing_characteristics_dir) # check results job-json - refrence_path = pathlib.Path(__file__).resolve().parent / "test_results" / "reference_files" + refrence_path = ( + pathlib.Path(__file__).resolve().parent / "test_results" / "reference_files" + ) - refrence_list = json.loads(gzip.open(refrence_path / "results_job1.json.gz", "r").read()) + refrence_list = json.loads( + gzip.open(refrence_path / 
"results_job1.json.gz", "r").read() + ) - output_list = json.loads(gzip.open(results_dir / "simulation_output" / "results_job1.json.gz", "r").read()) + output_list = json.loads( + gzip.open( + results_dir / "simulation_output" / "results_job1.json.gz", "r" + ).read() + ) refrence_list = [json.dumps(d) for d in refrence_list] output_list = [json.dumps(d) for d in output_list] @@ -343,16 +392,35 @@ def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=Fal ts_files = list(refrence_path.glob("**/*.parquet")) def compare_ts_parquets(source, dst): - test_pq = pd.read_parquet(source).reset_index().drop(columns=["index"]).rename(columns=str.lower) - reference_pq = pd.read_parquet(dst).reset_index().drop(columns=["index"]).rename(columns=str.lower) + test_pq = ( + pd.read_parquet(source) + .reset_index() + .drop(columns=["index"]) + .rename(columns=str.lower) + ) + reference_pq = ( + pd.read_parquet(dst) + .reset_index() + .drop(columns=["index"]) + .rename(columns=str.lower) + ) pd.testing.assert_frame_equal(test_pq, reference_pq) for file in ts_files: - results_file = results_dir / "results" / "simulation_output" / "timeseries" / file.parent.name / file.name + results_file = ( + results_dir + / "results" + / "simulation_output" + / "timeseries" + / file.parent.name + / file.name + ) compare_ts_parquets(file, results_file) # Check that buildstock.csv was trimmed properly - local_buildstock_df = read_csv(results_dir / "local_housing_characteristics_dir" / "buildstock.csv", dtype=str) + local_buildstock_df = read_csv( + results_dir / "local_housing_characteristics_dir" / "buildstock.csv", dtype=str + ) unique_buildings = {str(x[0]) for x in job_json["batch"]} assert len(unique_buildings) == len(local_buildstock_df) assert unique_buildings == set(local_buildstock_df["Building"]) @@ -366,11 +434,15 @@ def test_run_building_error_caught(mocker, basic_residential_project_file): with open(results_dir / "job001.json", "w") as f: json.dump(job_json, f) - sample_buildstock_csv = pd.DataFrame.from_records([{"Building": i, "Dummy Column": i * i} for i in range(10)]) + sample_buildstock_csv = pd.DataFrame.from_records( + [{"Building": i, "Dummy Column": i * i} for i in range(10)] + ) os.makedirs(results_dir / "housing_characteristics", exist_ok=True) os.makedirs(results_dir / "local_housing_characteristics", exist_ok=True) os.makedirs(results_dir / "weather", exist_ok=True) - sample_buildstock_csv.to_csv(results_dir / "housing_characteristics" / "buildstock.csv", index=False) + sample_buildstock_csv.to_csv( + results_dir / "housing_characteristics" / "buildstock.csv", index=False + ) def raise_error(*args, **kwargs): raise RuntimeError("A problem happened") @@ -384,12 +456,18 @@ def sequential_parallel(**kwargs): mocker.patch("buildstockbatch.hpc.shutil.rmtree") mocker.patch("buildstockbatch.hpc.Parallel", sequential_parallel) mocker.patch("buildstockbatch.hpc.subprocess") - mocker.patch.object(SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif") + mocker.patch.object( + SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif" + ) mocker.patch.object(KestrelBatch, "run_building", raise_error) mocker.patch.object(KestrelBatch, "local_output_dir", results_dir) mocker.patch.object(KestrelBatch, "results_dir", results_dir) - mocker.patch.object(KestrelBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir") - mocker.patch.object(KestrelBatch, "local_weather_dir", results_dir / "local_weather_dir") + mocker.patch.object( + KestrelBatch, 
"local_buildstock_dir", results_dir / "local_buildstock_dir" + ) + mocker.patch.object( + KestrelBatch, "local_weather_dir", results_dir / "local_weather_dir" + ) mocker.patch.object( KestrelBatch, "local_housing_characteristics_dir", @@ -413,9 +491,15 @@ def test_rerun_failed_jobs(mocker, basic_residential_project_file): mocker.patch.object(KestrelBatch, "weather_dir", None) mocker.patch.object(KestrelBatch, "results_dir", results_dir) process_results_mocker = mocker.patch.object(BuildStockBatchBase, "process_results") - queue_jobs_mocker = mocker.patch.object(KestrelBatch, "queue_jobs", return_value=[42]) - queue_post_processing_mocker = mocker.patch.object(KestrelBatch, "queue_post_processing") - mocker.patch.object(KestrelBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif") + queue_jobs_mocker = mocker.patch.object( + KestrelBatch, "queue_jobs", return_value=[42] + ) + queue_post_processing_mocker = mocker.patch.object( + KestrelBatch, "queue_post_processing" + ) + mocker.patch.object( + KestrelBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif" + ) b = KestrelBatch(project_filename) diff --git a/buildstockbatch/test/test_local.py b/buildstockbatch/test/test_local.py index 6c409598..bf01d0c3 100644 --- a/buildstockbatch/test/test_local.py +++ b/buildstockbatch/test/test_local.py @@ -44,7 +44,11 @@ def test_resstock_local_batch(project_filename): n_datapoints = 2 batch.cfg["sampler"]["args"]["n_datapoints"] = n_datapoints - local_weather_file = resstock_directory.parent / "weather" / batch.cfg["weather_files_url"].split("/")[-1] + local_weather_file = ( + resstock_directory.parent + / "weather" + / batch.cfg["weather_files_url"].split("/")[-1] + ) if local_weather_file.exists(): del batch.cfg["weather_files_url"] batch.cfg["weather_files_path"] = str(local_weather_file) @@ -59,7 +63,12 @@ def test_resstock_local_batch(project_filename): for upgrade_id in range(0, n_upgrades + 1): for bldg_id in range(1, n_datapoints + 1): - assert (simout_path / "timeseries" / f"up{upgrade_id:02d}" / f"bldg{bldg_id:07d}.parquet").exists() + assert ( + simout_path + / "timeseries" + / f"up{upgrade_id:02d}" + / f"bldg{bldg_id:07d}.parquet" + ).exists() batch.process_results() @@ -68,7 +77,9 @@ def test_resstock_local_batch(project_filename): assert (simout_path / "simulations_job0.tar.gz").exists() base_pq = out_path / "parquet" / "baseline" / "results_up00.parquet" assert base_pq.exists() - base = pd.read_parquet(base_pq, columns=["completed_status", "started_at", "completed_at"]) + base = pd.read_parquet( + base_pq, columns=["completed_status", "started_at", "completed_at"] + ) assert (base["completed_status"] == "Success").all() assert base.dtypes["started_at"] == "timestamp[s][pyarrow]" assert base.dtypes["completed_at"] == "timestamp[s][pyarrow]" @@ -81,9 +92,17 @@ def test_resstock_local_batch(project_filename): tsdf = pd.read_parquet(ts_pq_filename, columns=ts_time_cols) for col in tsdf.columns: assert tsdf[col].dtype == "timestamp[s][pyarrow]" - assert (out_path / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz").exists() + assert ( + out_path / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz" + ).exists() if upgrade_id >= 1: - upg_pq = out_path / "parquet" / "upgrades" / f"upgrade={upgrade_id}" / f"results_up{upgrade_id:02d}.parquet" + upg_pq = ( + out_path + / "parquet" + / "upgrades" + / f"upgrade={upgrade_id}" + / f"results_up{upgrade_id:02d}.parquet" + ) assert upg_pq.exists() upg = pd.read_parquet(upg_pq, columns=["completed_status"]) assert 
(upg["completed_status"] == "Success").all() @@ -103,7 +122,9 @@ def mocked_subprocess_run(run_cmd, **kwargs): mocker.patch("buildstockbatch.local.subprocess.run", mocked_subprocess_run) sleep_mock = mocker.patch("buildstockbatch.local.time.sleep") - cfg = get_project_configuration(resstock_directory / "project_national" / "national_baseline.yml") + cfg = get_project_configuration( + resstock_directory / "project_national" / "national_baseline.yml" + ) cfg["max_minutes_per_sim"] = 5 with tempfile.TemporaryDirectory() as tmpdir: @@ -132,7 +153,9 @@ def mocked_subprocess_run(run_cmd, **kwargs): assert out_osw["completed_status"] == "Fail" assert msg_re.search(out_osw["timeout"]) - err_log_re = re.compile(r"\[\d\d:\d\d:\d\d ERROR\] Terminated \w+ after reaching max time") + err_log_re = re.compile( + r"\[\d\d:\d\d:\d\d ERROR\] Terminated \w+ after reaching max time" + ) with open(sim_path / "run" / "run.log", "r") as run_log: err_log_re.search(run_log.read()) with open(sim_path / "run" / "failed.job", "r") as failed_job: diff --git a/buildstockbatch/test/test_postprocessing.py b/buildstockbatch/test/test_postprocessing.py index 667faa7f..d11fdb86 100644 --- a/buildstockbatch/test/test_postprocessing.py +++ b/buildstockbatch/test/test_postprocessing.py @@ -19,7 +19,9 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): reporting_measures = ["ReportingMeasure1", "ReportingMeasure2"] - project_filename, results_dir = basic_residential_project_file({"reporting_measures": reporting_measures}) + project_filename, results_dir = basic_residential_project_file( + {"reporting_measures": reporting_measures} + ) fs = LocalFileSystem() @@ -40,7 +42,11 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): sim_dir = str(filename.parent.parent) upgrade_id = int(re.search(r"up(\d+)", sim_dir).group(1)) building_id = int(re.search(r"bldg(\d+)", sim_dir).group(1)) - dpouts2.append(postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, building_id)) + dpouts2.append( + postprocessing.read_simulation_outputs( + fs, reporting_measures, sim_dir, upgrade_id, building_id + ) + ) with gzip.open(sim_out_dir / "results_job0.json.gz", "wt", encoding="utf-8") as f: json.dump(dpouts2, f) @@ -50,7 +56,9 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False) for upgrade_id in (0, 1): - df = read_csv(str(results_dir / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz")) + df = read_csv( + str(results_dir / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz") + ) assert (df["reporting_measure1.column_1"] == 1).all() assert (df["reporting_measure1.column_2"] == 2).all() assert (df["reporting_measure2.column_3"] == 3).all() @@ -66,7 +74,9 @@ def test_empty_results_assertion(basic_residential_project_file, capsys): shutil.rmtree(sim_out_dir) # no results cfg = get_project_configuration(project_filename) - with pytest.raises(ValueError, match=r"No simulation results found to post-process"): + with pytest.raises( + ValueError, match=r"No simulation results found to post-process" + ): assert postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False) @@ -86,7 +96,9 @@ def test_large_parquet_combine(basic_residential_project_file): @pytest.mark.parametrize("keep_individual_timeseries", [True, False]) -def test_keep_individual_timeseries(keep_individual_timeseries, basic_residential_project_file, mocker): +def 
test_keep_individual_timeseries( + keep_individual_timeseries, basic_residential_project_file, mocker +): project_filename, results_dir = basic_residential_project_file( {"postprocessing": {"keep_individual_timeseries": keep_individual_timeseries}} ) @@ -110,7 +122,9 @@ def test_upgrade_missing_ts(basic_residential_project_file, mocker, caplog): project_filename, results_dir = basic_residential_project_file() results_path = pathlib.Path(results_dir) - for filename in (results_path / "simulation_output" / "timeseries" / "up01").glob("*.parquet"): + for filename in (results_path / "simulation_output" / "timeseries" / "up01").glob( + "*.parquet" + ): os.remove(filename) mocker.patch.object(BuildStockBatchBase, "weather_dir", None) diff --git a/buildstockbatch/test/test_validation.py b/buildstockbatch/test/test_validation.py index ee915940..92434751 100644 --- a/buildstockbatch/test/test_validation.py +++ b/buildstockbatch/test/test_validation.py @@ -34,7 +34,9 @@ here = os.path.dirname(os.path.abspath(__file__)) example_yml_dir = os.path.join(here, "test_inputs") -resources_dir = os.path.join(here, "test_inputs", "test_openstudio_buildstock", "resources") +resources_dir = os.path.join( + here, "test_inputs", "test_openstudio_buildstock", "resources" +) def filter_logs(logs, level): @@ -67,11 +69,15 @@ def test_aws_batch_validation_is_static(): def test_complete_schema_passes_validation(): - assert BuildStockBatchBase.validate_project_schema(os.path.join(example_yml_dir, "complete-schema.yml")) + assert BuildStockBatchBase.validate_project_schema( + os.path.join(example_yml_dir, "complete-schema.yml") + ) def test_minimal_schema_passes_validation(): - assert BuildStockBatchBase.validate_project_schema(os.path.join(example_yml_dir, "minimal-schema.yml")) + assert BuildStockBatchBase.validate_project_schema( + os.path.join(example_yml_dir, "minimal-schema.yml") + ) @pytest.mark.parametrize( @@ -129,9 +135,13 @@ def test_xor_violations_fail(project_file, expected): ) def test_validation_integration(project_file, base_expected, eagle_expected): # patch the validate_options_lookup function to always return true for this case - with patch.object(BuildStockBatchBase, "validate_options_lookup", lambda _: True), patch.object( + with patch.object( + BuildStockBatchBase, "validate_options_lookup", lambda _: True + ), patch.object( BuildStockBatchBase, "validate_measure_references", lambda _: True - ), patch.object(BuildStockBatchBase, "validate_workflow_generator", lambda _: True), patch.object( + ), patch.object( + BuildStockBatchBase, "validate_workflow_generator", lambda _: True + ), patch.object( BuildStockBatchBase, "validate_postprocessing_spec", lambda _: True ), patch.object( SlurmBatch, "validate_apptainer_image_hpc", lambda _: True @@ -182,10 +192,14 @@ def test_bad_measures(project_file): except (ValidationError, YamaleError) as er: er = str(er) assert "'1.5' is not a int" in er - assert "'huorly' not in ('none', 'timestep', 'hourly', 'daily', 'monthly')" in er + assert ( + "'huorly' not in ('none', 'timestep', 'hourly', 'daily', 'monthly')" + in er + ) else: raise Exception( - "measures_and_arguments was supposed to raise ValidationError for" " enforce-validate-measures-bad.yml" + "measures_and_arguments was supposed to raise ValidationError for" + " enforce-validate-measures-bad.yml" ) @@ -193,7 +207,9 @@ def test_bad_measures(project_file): "project_file", [ os.path.join(example_yml_dir, "enforce-validate-measures-good-2.yml"), - os.path.join(example_yml_dir, 
"enforce-validate-measures-good-2-with-anchors.yml"), + os.path.join( + example_yml_dir, "enforce-validate-measures-good-2-with-anchors.yml" + ), ], ) def test_good_measures(project_file): @@ -258,7 +274,9 @@ def test_bad_options_validation(project_file): assert "Floor Insulation: '*' cannot be mixed with other options" in er else: - raise Exception("validate_options was supposed to raise ValueError for enforce-validate-options-bad.yml") + raise Exception( + "validate_options was supposed to raise ValueError for enforce-validate-options-bad.yml" + ) @pytest.mark.parametrize( @@ -289,7 +307,8 @@ def test_bad_measures_validation(project_file): else: raise Exception( - "validate_measure_references was supposed to raise ValueError for " "enforce-validate-measures-bad.yml" + "validate_measure_references was supposed to raise ValueError for " + "enforce-validate-measures-bad.yml" ) @@ -306,10 +325,14 @@ def test_bad_postprocessing_spec_validation(project_file): er = str(er) assert "bad_partition_column" in er else: - raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-bad-2.yml") + raise Exception( + "validate_options was supposed to raise ValidationError for enforce-validate-options-bad-2.yml" + ) -@pytest.mark.parametrize("project_file", [os.path.join(example_yml_dir, "enforce-validate-options-good.yml")]) +@pytest.mark.parametrize( + "project_file", [os.path.join(example_yml_dir, "enforce-validate-options-good.yml")] +) def test_logic_validation_fail(project_file): try: BuildStockBatchBase.validate_logic(project_file) @@ -319,7 +342,9 @@ def test_logic_validation_fail(project_file): assert "'Vintage' occurs 2 times in a 'and' block" in er assert "'Vintage' occurs 2 times in a '&&' block" in er else: - raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml") + raise Exception( + "validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml" + ) @pytest.mark.parametrize( @@ -335,7 +360,9 @@ def test_number_of_options_apply_upgrade(): proj_filename = resstock_directory / "project_national" / "national_upgrades.yml" cfg = get_project_configuration(str(proj_filename)) cfg["upgrades"][-1]["options"] = cfg["upgrades"][-1]["options"] * 10 - cfg["upgrades"][0]["options"][0]["costs"] = cfg["upgrades"][0]["options"][0]["costs"] * 5 + cfg["upgrades"][0]["options"][0]["costs"] = ( + cfg["upgrades"][0]["options"][0]["costs"] * 5 + ) with tempfile.TemporaryDirectory() as tmpdir: tmppath = pathlib.Path(tmpdir) new_proj_filename = tmppath / "project.yml" @@ -437,11 +464,15 @@ def test_validate_apptainer_image(): temp_yml = pathlib.Path(tmpdir, "temp.yml") with open(temp_yml, "w") as f: yaml.dump(cfg, f, Dumper=yaml.SafeDumper) - with pytest.raises(ValidationError, match=r"Could not find apptainer image: .+\.sif or .+\.simg"): + with pytest.raises( + ValidationError, + match=r"Could not find apptainer image: .+\.sif or .+\.simg", + ): SlurmBatch.validate_apptainer_image_hpc(str(temp_yml)) for ext in ["Apptainer.sif", "Singularity.simg"]: filename = pathlib.Path( - tmpdir, f"OpenStudio-{SlurmBatch.DEFAULT_OS_VERSION}.{SlurmBatch.DEFAULT_OS_SHA}-{ext}" + tmpdir, + f"OpenStudio-{SlurmBatch.DEFAULT_OS_VERSION}.{SlurmBatch.DEFAULT_OS_SHA}-{ext}", ) filename.touch() SlurmBatch.validate_apptainer_image_hpc(str(temp_yml)) @@ -453,7 +484,11 @@ def test_validate_sampler_good_buildstock(basic_residential_project_file): { "sampler": { "type": "precomputed", - "args": {"sample_file": 
str(os.path.join(resources_dir, "buildstock_good.csv"))}, + "args": { + "sample_file": str( + os.path.join(resources_dir, "buildstock_good.csv") + ) + }, } } ) @@ -465,7 +500,11 @@ def test_validate_sampler_bad_buildstock(basic_residential_project_file): { "sampler": { "type": "precomputed", - "args": {"sample_file": str(os.path.join(resources_dir, "buildstock_bad.csv"))}, + "args": { + "sample_file": str( + os.path.join(resources_dir, "buildstock_bad.csv") + ) + }, } } ) @@ -473,10 +512,27 @@ def test_validate_sampler_bad_buildstock(basic_residential_project_file): BuildStockBatchBase.validate_sampler(project_filename) except ValidationError as er: er = str(er) - assert "Option 1940-1950 in column Vintage of buildstock_csv is not available in options_lookup.tsv" in er - assert "Option TX in column State of buildstock_csv is not available in options_lookup.tsv" in er - assert "Option nan in column Insulation Wall of buildstock_csv is not available in options_lookup.tsv" in er - assert "Column Insulation in buildstock_csv is not available in options_lookup.tsv" in er - assert "Column ZipPlusCode in buildstock_csv is not available in options_lookup.tsv" in er + assert ( + "Option 1940-1950 in column Vintage of buildstock_csv is not available in options_lookup.tsv" + in er + ) + assert ( + "Option TX in column State of buildstock_csv is not available in options_lookup.tsv" + in er + ) + assert ( + "Option nan in column Insulation Wall of buildstock_csv is not available in options_lookup.tsv" + in er + ) + assert ( + "Column Insulation in buildstock_csv is not available in options_lookup.tsv" + in er + ) + assert ( + "Column ZipPlusCode in buildstock_csv is not available in options_lookup.tsv" + in er + ) else: - raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml") + raise Exception( + "validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml" + ) diff --git a/buildstockbatch/utils.py b/buildstockbatch/utils.py index e9453b38..b975959b 100644 --- a/buildstockbatch/utils.py +++ b/buildstockbatch/utils.py @@ -46,12 +46,16 @@ def get_project_configuration(project_file): raise err # Set absolute paths - cfg["buildstock_directory"] = path_rel_to_file(project_file, cfg["buildstock_directory"]) + cfg["buildstock_directory"] = path_rel_to_file( + project_file, cfg["buildstock_directory"] + ) # if 'precomputed_sample' in cfg.get('baseline', {}): # cfg['baseline']['precomputed_sample'] = \ # path_rel_to_file(project_file, cfg['baseline']['precomputed_sample']) if "weather_files_path" in cfg: - cfg["weather_files_path"] = path_rel_to_file(project_file, cfg["weather_files_path"]) + cfg["weather_files_path"] = path_rel_to_file( + project_file, cfg["weather_files_path"] + ) return cfg @@ -65,20 +69,35 @@ def _str_repr(obj, list_max=20, dict_max=20, string_max=100): elif type(obj) in [int, float]: return _str_repr(str(obj), list_max, dict_max, string_max) elif type(obj) is list: - txt = "[" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:list_max]]) + txt = "[" + ",".join( + [ + _str_repr(item, list_max, dict_max, string_max) + for item in obj[0:list_max] + ] + ) if len(obj) > list_max: txt += f" ...{len(obj)}" txt += "]" return txt elif type(obj) is tuple: - txt = "(" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:list_max]]) + txt = "(" + ",".join( + [ + _str_repr(item, list_max, dict_max, string_max) + for item in obj[0:list_max] + ] + ) if len(obj) > 
list_max: txt += f" ...{len(obj)}" txt += ")" return txt elif type(obj) is set: obj = list(obj) - txt = "{" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:dict_max]]) + txt = "{" + ",".join( + [ + _str_repr(item, list_max, dict_max, string_max) + for item in obj[0:dict_max] + ] + ) if len(obj) > dict_max: txt += f" ...{len(obj)}" txt += "}" diff --git a/buildstockbatch/workflow_generator/commercial.py b/buildstockbatch/workflow_generator/commercial.py index 6495acfe..2fff78a2 100644 --- a/buildstockbatch/workflow_generator/commercial.py +++ b/buildstockbatch/workflow_generator/commercial.py @@ -49,7 +49,9 @@ def validate(cls, cfg): workflow_generator_args = cfg["workflow_generator"]["args"] schema_yml = re.sub(r"^ {8}", "", schema_yml, flags=re.MULTILINE) schema = yamale.make_schema(content=schema_yml, parser="ruamel") - data = yamale.make_data(content=json.dumps(workflow_generator_args), parser="ruamel") + data = yamale.make_data( + content=json.dumps(workflow_generator_args), parser="ruamel" + ) return yamale.validate(schema, data, strict=True) def reporting_measures(self): @@ -114,11 +116,17 @@ def create_osw(self, sim_id, building_id, upgrade_idx): "arguments": {"run_measure": 1}, } if "upgrade_name" in measure_d: - apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d["upgrade_name"] + apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d[ + "upgrade_name" + ] for opt_num, option in enumerate(measure_d["options"], 1): - apply_upgrade_measure["arguments"]["option_{}".format(opt_num)] = option["option"] + apply_upgrade_measure["arguments"][ + "option_{}".format(opt_num) + ] = option["option"] if "lifetime" in option: - apply_upgrade_measure["arguments"]["option_{}_lifetime".format(opt_num)] = option["lifetime"] + apply_upgrade_measure["arguments"][ + "option_{}_lifetime".format(opt_num) + ] = option["lifetime"] if "apply_logic" in option: apply_upgrade_measure["arguments"][ "option_{}_apply_logic".format(opt_num) @@ -131,9 +139,9 @@ def create_osw(self, sim_id, building_id, upgrade_idx): "option_{}_cost_{}_{}".format(opt_num, cost_num, arg) ] = cost[arg] if "package_apply_logic" in measure_d: - apply_upgrade_measure["arguments"]["package_apply_logic"] = self.make_apply_logic_arg( - measure_d["package_apply_logic"] - ) + apply_upgrade_measure["arguments"][ + "package_apply_logic" + ] = self.make_apply_logic_arg(measure_d["package_apply_logic"]) build_existing_model_idx = list( map( diff --git a/buildstockbatch/workflow_generator/residential_hpxml.py b/buildstockbatch/workflow_generator/residential_hpxml.py index 71cab179..ee71b6a1 100644 --- a/buildstockbatch/workflow_generator/residential_hpxml.py +++ b/buildstockbatch/workflow_generator/residential_hpxml.py @@ -145,14 +145,18 @@ def validate(cls, cfg): workflow_generator_args = cfg["workflow_generator"]["args"] schema_yml = re.sub(r"^ {8}", "", schema_yml, flags=re.MULTILINE) schema = yamale.make_schema(content=schema_yml, parser="ruamel") - data = yamale.make_data(content=json.dumps(workflow_generator_args), parser="ruamel") + data = yamale.make_data( + content=json.dumps(workflow_generator_args), parser="ruamel" + ) yamale.validate(schema, data, strict=True) return cls.validate_measures_and_arguments(cfg) def reporting_measures(self): """Return a list of reporting measures to include in outputs""" workflow_args = self.cfg["workflow_generator"].get("args", {}) - return [x["measure_dir_name"] for x in workflow_args.get("reporting_measures", [])] + return [ + x["measure_dir_name"] for x 
in workflow_args.get("reporting_measures", []) + ] @staticmethod def validate_measures_and_arguments(cfg): @@ -191,7 +195,9 @@ def get_cfg_path(cfg_path): workflow_args = cfg["workflow_generator"].get("args", {}) if "reporting_measures" in workflow_args.keys(): for reporting_measure in workflow_args["reporting_measures"]: - measure_names[reporting_measure["measure_dir_name"]] = "workflow_generator.args.reporting_measures" + measure_names[ + reporting_measure["measure_dir_name"] + ] = "workflow_generator.args.reporting_measures" error_msgs = "" warning_msgs = "" @@ -224,7 +230,9 @@ def get_cfg_path(cfg_path): error_msgs += "* The following multipliers values are invalid: \n" for multiplier, count in invalid_multipliers.items(): error_msgs += f" '{multiplier}' - Used {count} times \n" - error_msgs += f" The list of valid multipliers are {valid_multipliers}.\n" + error_msgs += ( + f" The list of valid multipliers are {valid_multipliers}.\n" + ) if warning_msgs: logger.warning(warning_msgs) @@ -266,7 +274,8 @@ def create_osw(self, sim_id, building_id, upgrade_idx): bld_exist_model_args = { "building_id": building_id, - "sample_weight": self.cfg["baseline"]["n_buildings_represented"] / self.n_datapoints, + "sample_weight": self.cfg["baseline"]["n_buildings_represented"] + / self.n_datapoints, } bld_exist_model_args.update(sim_ctl_args) @@ -289,12 +298,16 @@ def create_osw(self, sim_id, building_id, upgrade_idx): ["emissions_wood_values", "wood_value"], ] for arg, item in emissions_map: - bld_exist_model_args[arg] = ",".join([str(s.get(item, "")) for s in emissions]) + bld_exist_model_args[arg] = ",".join( + [str(s.get(item, "")) for s in emissions] + ) buildstock_dir = self.cfg["buildstock_directory"] measures_dir = os.path.join(buildstock_dir, "measures") measure_path = os.path.join(measures_dir, "BuildExistingModel") - bld_exist_model_args_avail = get_measure_arguments(os.path.join(measure_path, "measure.xml")) + bld_exist_model_args_avail = get_measure_arguments( + os.path.join(measure_path, "measure.xml") + ) if "utility_bills" in workflow_args: utility_bills = workflow_args["utility_bills"] @@ -333,7 +346,9 @@ def create_osw(self, sim_id, building_id, upgrade_idx): ] for arg, item in utility_bills_map: if arg in bld_exist_model_args_avail: - bld_exist_model_args[arg] = ",".join([str(s.get(item, "")) for s in utility_bills]) + bld_exist_model_args[arg] = ",".join( + [str(s.get(item, "")) for s in utility_bills] + ) sim_out_rep_args = { "timeseries_frequency": "none", @@ -356,7 +371,9 @@ def create_osw(self, sim_id, building_id, upgrade_idx): measures_dir = os.path.join(buildstock_dir, "resources/hpxml-measures") measure_path = os.path.join(measures_dir, "ReportSimulationOutput") - sim_out_rep_args_avail = get_measure_arguments(os.path.join(measure_path, "measure.xml")) + sim_out_rep_args_avail = get_measure_arguments( + os.path.join(measure_path, "measure.xml") + ) if "include_annual_total_consumptions" in sim_out_rep_args_avail: sim_out_rep_args["include_annual_total_consumptions"] = True @@ -419,14 +436,18 @@ def create_osw(self, sim_id, building_id, upgrade_idx): if "output_variables" in sim_out_rep_args: output_variables = sim_out_rep_args["output_variables"] - sim_out_rep_args["user_output_variables"] = ",".join([str(s.get("name")) for s in output_variables]) + sim_out_rep_args["user_output_variables"] = ",".join( + [str(s.get("name")) for s in output_variables] + ) sim_out_rep_args.pop("output_variables") util_bills_rep_args = {} measures_dir = os.path.join(buildstock_dir, 
"resources/hpxml-measures") measure_path = os.path.join(measures_dir, "ReportUtilityBills") - util_bills_rep_args_avail = get_measure_arguments(os.path.join(measure_path, "measure.xml")) + util_bills_rep_args_avail = get_measure_arguments( + os.path.join(measure_path, "measure.xml") + ) if "include_annual_bills" in util_bills_rep_args_avail: util_bills_rep_args["include_annual_bills"] = True @@ -517,11 +538,17 @@ def create_osw(self, sim_id, building_id, upgrade_idx): "arguments": {"run_measure": 1}, } if "upgrade_name" in measure_d: - apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d["upgrade_name"] + apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d[ + "upgrade_name" + ] for opt_num, option in enumerate(measure_d["options"], 1): - apply_upgrade_measure["arguments"]["option_{}".format(opt_num)] = option["option"] + apply_upgrade_measure["arguments"][ + "option_{}".format(opt_num) + ] = option["option"] if "lifetime" in option: - apply_upgrade_measure["arguments"]["option_{}_lifetime".format(opt_num)] = option["lifetime"] + apply_upgrade_measure["arguments"][ + "option_{}_lifetime".format(opt_num) + ] = option["lifetime"] if "apply_logic" in option: apply_upgrade_measure["arguments"][ "option_{}_apply_logic".format(opt_num) @@ -534,11 +561,13 @@ def create_osw(self, sim_id, building_id, upgrade_idx): "option_{}_cost_{}_{}".format(opt_num, cost_num, arg) ] = cost[arg] if "package_apply_logic" in measure_d: - apply_upgrade_measure["arguments"]["package_apply_logic"] = self.make_apply_logic_arg( - measure_d["package_apply_logic"] - ) + apply_upgrade_measure["arguments"][ + "package_apply_logic" + ] = self.make_apply_logic_arg(measure_d["package_apply_logic"]) - build_existing_model_idx = [x["measure_dir_name"] == "BuildExistingModel" for x in osw["steps"]].index(True) + build_existing_model_idx = [ + x["measure_dir_name"] == "BuildExistingModel" for x in osw["steps"] + ].index(True) osw["steps"].insert(build_existing_model_idx + 1, apply_upgrade_measure) if "reporting_measures" in workflow_args: @@ -546,6 +575,8 @@ def create_osw(self, sim_id, building_id, upgrade_idx): if "arguments" not in reporting_measure: reporting_measure["arguments"] = {} reporting_measure["measure_type"] = "ReportingMeasure" - osw["steps"].insert(-1, reporting_measure) # right before ServerDirectoryCleanup + osw["steps"].insert( + -1, reporting_measure + ) # right before ServerDirectoryCleanup return osw diff --git a/buildstockbatch/workflow_generator/test_workflow_generator.py b/buildstockbatch/workflow_generator/test_workflow_generator.py index bd61c46a..9a49eaea 100644 --- a/buildstockbatch/workflow_generator/test_workflow_generator.py +++ b/buildstockbatch/workflow_generator/test_workflow_generator.py @@ -12,10 +12,14 @@ def test_apply_logic_recursion(): apply_logic = WorkflowGeneratorBase.make_apply_logic_arg(["one", "two", "three"]) assert apply_logic == "(one&&two&&three)" - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({"and": ["one", "two", "three"]}) + apply_logic = WorkflowGeneratorBase.make_apply_logic_arg( + {"and": ["one", "two", "three"]} + ) assert apply_logic == "(one&&two&&three)" - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({"or": ["four", "five", "six"]}) + apply_logic = WorkflowGeneratorBase.make_apply_logic_arg( + {"or": ["four", "five", "six"]} + ) assert apply_logic == "(four||five||six)" apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({"not": "seven"}) @@ -72,11 +76,36 @@ def test_residential_hpxml(mocker): build_existing_model_step = 
steps[0] assert build_existing_model_step["measure_dir_name"] == "BuildExistingModel" - assert build_existing_model_step["arguments"]["simulation_control_run_period_begin_month"] == 2 - assert build_existing_model_step["arguments"]["simulation_control_run_period_begin_day_of_month"] == 1 - assert build_existing_model_step["arguments"]["simulation_control_run_period_end_month"] == 2 - assert build_existing_model_step["arguments"]["simulation_control_run_period_end_day_of_month"] == 28 - assert build_existing_model_step["arguments"]["simulation_control_run_period_calendar_year"] == 2010 + assert ( + build_existing_model_step["arguments"][ + "simulation_control_run_period_begin_month" + ] + == 2 + ) + assert ( + build_existing_model_step["arguments"][ + "simulation_control_run_period_begin_day_of_month" + ] + == 1 + ) + assert ( + build_existing_model_step["arguments"][ + "simulation_control_run_period_end_month" + ] + == 2 + ) + assert ( + build_existing_model_step["arguments"][ + "simulation_control_run_period_end_day_of_month" + ] + == 28 + ) + assert ( + build_existing_model_step["arguments"][ + "simulation_control_run_period_calendar_year" + ] + == 2010 + ) apply_upgrade_step = steps[1] assert apply_upgrade_step["measure_dir_name"] == "ApplyUpgrade" @@ -87,13 +116,25 @@ def test_residential_hpxml(mocker): simulation_output_step = steps[3] assert simulation_output_step["measure_dir_name"] == "ReportSimulationOutput" assert simulation_output_step["arguments"]["timeseries_frequency"] == "hourly" - assert simulation_output_step["arguments"]["include_annual_total_consumptions"] is True - assert simulation_output_step["arguments"]["include_annual_fuel_consumptions"] is True - assert simulation_output_step["arguments"]["include_annual_end_use_consumptions"] is True - assert simulation_output_step["arguments"]["include_annual_system_use_consumptions"] is False + assert ( + simulation_output_step["arguments"]["include_annual_total_consumptions"] is True + ) + assert ( + simulation_output_step["arguments"]["include_annual_fuel_consumptions"] is True + ) + assert ( + simulation_output_step["arguments"]["include_annual_end_use_consumptions"] + is True + ) + assert ( + simulation_output_step["arguments"]["include_annual_system_use_consumptions"] + is False + ) assert simulation_output_step["arguments"]["include_annual_emissions"] is True assert simulation_output_step["arguments"]["include_annual_emission_fuels"] is True - assert simulation_output_step["arguments"]["include_annual_emission_end_uses"] is True + assert ( + simulation_output_step["arguments"]["include_annual_emission_end_uses"] is True + ) assert simulation_output_step["arguments"]["include_annual_total_loads"] is True assert simulation_output_step["arguments"]["include_annual_unmet_hours"] is True assert simulation_output_step["arguments"]["include_annual_peak_fuels"] is True @@ -102,22 +143,55 @@ def test_residential_hpxml(mocker): assert simulation_output_step["arguments"]["include_annual_hot_water_uses"] is True assert simulation_output_step["arguments"]["include_annual_hvac_summary"] is True assert simulation_output_step["arguments"]["include_annual_resilience"] is True - assert simulation_output_step["arguments"]["include_timeseries_total_consumptions"] is True - assert simulation_output_step["arguments"]["include_timeseries_fuel_consumptions"] is False - assert simulation_output_step["arguments"]["include_timeseries_end_use_consumptions"] is True - assert 
simulation_output_step["arguments"]["include_timeseries_system_use_consumptions"] is False + assert ( + simulation_output_step["arguments"]["include_timeseries_total_consumptions"] + is True + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_fuel_consumptions"] + is False + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_end_use_consumptions"] + is True + ) + assert ( + simulation_output_step["arguments"][ + "include_timeseries_system_use_consumptions" + ] + is False + ) assert simulation_output_step["arguments"]["include_timeseries_emissions"] is False - assert simulation_output_step["arguments"]["include_timeseries_emission_fuels"] is False - assert simulation_output_step["arguments"]["include_timeseries_emission_end_uses"] is False - assert simulation_output_step["arguments"]["include_timeseries_hot_water_uses"] is False + assert ( + simulation_output_step["arguments"]["include_timeseries_emission_fuels"] + is False + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_emission_end_uses"] + is False + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_hot_water_uses"] + is False + ) assert simulation_output_step["arguments"]["include_timeseries_total_loads"] is True - assert simulation_output_step["arguments"]["include_timeseries_component_loads"] is False - assert simulation_output_step["arguments"]["include_timeseries_unmet_hours"] is False - assert simulation_output_step["arguments"]["include_timeseries_zone_temperatures"] is False + assert ( + simulation_output_step["arguments"]["include_timeseries_component_loads"] + is False + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_unmet_hours"] is False + ) + assert ( + simulation_output_step["arguments"]["include_timeseries_zone_temperatures"] + is False + ) assert simulation_output_step["arguments"]["include_timeseries_airflows"] is False assert simulation_output_step["arguments"]["include_timeseries_weather"] is False assert simulation_output_step["arguments"]["include_timeseries_resilience"] is False - assert simulation_output_step["arguments"]["timeseries_timestamp_convention"] == "end" + assert ( + simulation_output_step["arguments"]["timeseries_timestamp_convention"] == "end" + ) assert simulation_output_step["arguments"]["timeseries_num_decimal_places"] == 3 assert simulation_output_step["arguments"]["add_timeseries_dst_column"] is True assert simulation_output_step["arguments"]["add_timeseries_utc_column"] is True @@ -259,7 +333,9 @@ def test_com_default_workflow_generator_extended(mocker): assert reporting_measure_step["measure_type"] == "ReportingMeasure" assert reporting_measure_step["arguments"] == {} # Should only be one instance of SimulationOutputReport - assert [d["measure_dir_name"] == "SimulationOutputReport" for d in osw["steps"]].count(True) == 1 + assert [ + d["measure_dir_name"] == "SimulationOutputReport" for d in osw["steps"] + ].count(True) == 1 # Should get TimeseriesCSVExport if included in args reporting_measure_step = osw["steps"][1] assert reporting_measure_step["measure_dir_name"] == "TimeseriesCSVExport" @@ -268,7 +344,10 @@ def test_com_default_workflow_generator_extended(mocker): assert reporting_measure_step["arguments"]["inc_output_variables"] == "true" # Should have the openstudio report reporting_measure_step = osw["steps"][2] - assert reporting_measure_step["measure_dir_name"] == "f8e23017-894d-4bdf-977f-37e3961e6f42" + assert ( + reporting_measure_step["measure_dir_name"] + == 
"f8e23017-894d-4bdf-977f-37e3961e6f42" + ) assert reporting_measure_step["measure_type"] == "ReportingMeasure" assert reporting_measure_step["arguments"]["building_summary_section"] == "true" assert reporting_measure_step["arguments"]["schedules_overview_section"] == "true" diff --git a/docs/conf.py b/docs/conf.py index 94ca7931..45c44c52 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,7 +20,9 @@ here = os.path.abspath(os.path.dirname(__file__)) metadata = {} -with open(os.path.join(here, "..", "buildstockbatch", "__version__.py"), "r", encoding="utf-8") as f: +with open( + os.path.join(here, "..", "buildstockbatch", "__version__.py"), "r", encoding="utf-8" +) as f: exec(f.read(), metadata) # -- Project information ----------------------------------------------------- @@ -73,7 +75,9 @@ # how to render changelog links changelog_render_ticket = "http://www.github.com/nrel/buildstockbatch/issues/%s" -changelog_render_pullreq = {"default": "https://www.github.com/nrel/buildstockbatch/pull/%s"} +changelog_render_pullreq = { + "default": "https://www.github.com/nrel/buildstockbatch/pull/%s" +} # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -176,7 +180,9 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "buildstockbatch", "BuildStock Batch Documentation", [author], 1)] +man_pages = [ + (master_doc, "buildstockbatch", "BuildStock Batch Documentation", [author], 1) +] # -- Options for Texinfo output ---------------------------------------------- diff --git a/setup.py b/setup.py index fd06bdc0..669dd707 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,9 @@ here = os.path.abspath(os.path.dirname(__file__)) metadata = {} -with open(os.path.join(here, "buildstockbatch", "__version__.py"), "r", encoding="utf-8") as f: +with open( + os.path.join(here, "buildstockbatch", "__version__.py"), "r", encoding="utf-8" +) as f: exec(f.read(), metadata) with open("README.md", "r", "utf-8") as f: From 963b821e7546ebaaec42e9106ca6778eae644c49 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Tue, 6 Feb 2024 16:52:51 -0700 Subject: [PATCH 41/53] Revert "removing black config in pyproject.toml" This reverts commit f695f116d77935b86e7759c8e49feb42f273bb75. --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..55ec8d78 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.black] +line-length = 120 From 38a3b6ce716ce3cf8e92c68927de58eeddb377a8 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Tue, 6 Feb 2024 16:53:09 -0700 Subject: [PATCH 42/53] Revert "switching back to the default line length" This reverts commit 511d733adcbfb4d2453f0f52c346ce5365a227b6. 
--- buildstockbatch/__version__.py | 4 +- buildstockbatch/aws/aws.py | 284 +++++------------- buildstockbatch/aws/awsbase.py | 38 +-- buildstockbatch/base.py | 260 ++++------------ buildstockbatch/cloud/docker_base.py | 49 +-- buildstockbatch/hpc.py | 178 +++-------- buildstockbatch/local.py | 75 ++--- buildstockbatch/postprocessing.py | 238 ++++----------- buildstockbatch/sampler/base.py | 12 +- buildstockbatch/sampler/commercial_sobol.py | 40 +-- buildstockbatch/sampler/downselect.py | 31 +- buildstockbatch/sampler/residential_quota.py | 9 +- buildstockbatch/test/conftest.py | 18 +- buildstockbatch/test/shared_testing_stuff.py | 8 +- buildstockbatch/test/test_base.py | 90 ++---- buildstockbatch/test/test_docker_base.py | 44 +-- buildstockbatch/test/test_hpc.py | 150 ++------- buildstockbatch/test/test_local.py | 37 +-- buildstockbatch/test/test_postprocessing.py | 26 +- buildstockbatch/test/test_validation.py | 104 ++----- buildstockbatch/utils.py | 29 +- .../workflow_generator/commercial.py | 22 +- .../workflow_generator/residential_hpxml.py | 69 ++--- .../test_workflow_generator.py | 129 ++------ docs/conf.py | 12 +- setup.py | 4 +- 26 files changed, 458 insertions(+), 1502 deletions(-) diff --git a/buildstockbatch/__version__.py b/buildstockbatch/__version__.py index d32e165c..552e75f2 100644 --- a/buildstockbatch/__version__.py +++ b/buildstockbatch/__version__.py @@ -9,6 +9,4 @@ __author__ = "Noel Merket" __author_email__ = "noel.merket@nrel.gov" __license__ = "BSD-3" -__copyright__ = "Copyright {} The Alliance for Sustainable Energy".format( - dt.date.today().year -) +__copyright__ = "Copyright {} The Alliance for Sustainable Energy".format(dt.date.today().year) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index a812d29f..ea0a5e2a 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -62,9 +62,7 @@ def backoff(thefunc, *args, **kwargs): caught_error = False for pat in error_patterns: if re.search(pat, error_code): - logger.debug( - f"{error_code}: Waiting and retrying in {delay} seconds" - ) + logger.debug(f"{error_code}: Waiting and retrying in {delay} seconds") caught_error = True time.sleep(delay) delay *= backoff_mult @@ -90,9 +88,7 @@ def filename_generator(): if filename.startswith("."): continue local_filepath = pathlib.Path(dirpath, filename) - s3_key = pathlib.PurePosixPath( - prefix, local_filepath.relative_to(local_dir_abs) - ) + s3_key = pathlib.PurePosixPath(prefix, local_filepath.relative_to(local_dir_abs)) yield local_filepath, s3_key logger.debug("Uploading {} => {}/{}".format(local_dir_abs, bucket, prefix)) @@ -135,9 +131,7 @@ def __init__(self, job_name, aws_config, boto3_session): self.batch = self.session.client("batch", config=boto_client_config) self.ec2 = self.session.client("ec2", config=boto_client_config) self.ec2r = self.session.resource("ec2", config=boto_client_config) - self.step_functions = self.session.client( - "stepfunctions", config=boto_client_config - ) + self.step_functions = self.session.client("stepfunctions", config=boto_client_config) self.aws_lambda = self.session.client("lambda", config=boto_client_config) self.s3 = self.session.client("s3", config=boto_client_config) self.s3_res = self.session.resource("s3", config=boto_client_config) @@ -272,9 +266,7 @@ def create_vpc(self): # Create the public subnet - pub_response = self.ec2.create_subnet( - CidrBlock=self.pub_subnet_cidr, VpcId=self.vpc_id - ) + pub_response = self.ec2.create_subnet(CidrBlock=self.pub_subnet_cidr, VpcId=self.vpc_id) 
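# Editorial note, hedged, not part of the original patch: create_vpc() builds
# a conventional public/private NAT topology. Two private subnets are created
# for the Batch compute environment, the public subnet created above hosts a
# NAT gateway backed by the Elastic IP allocated below, and the private route
# table sends outbound traffic from both private subnets through that NAT
# gateway.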
logger.info("EIP allocated.") @@ -306,9 +298,7 @@ def create_vpc(self): # Create an internet gateway - self.ec2.attach_internet_gateway( - InternetGatewayId=self.internet_gateway_id, VpcId=self.vpc_id - ) + self.ec2.attach_internet_gateway(InternetGatewayId=self.internet_gateway_id, VpcId=self.vpc_id) logger.info("Internet Gateway attached.") @@ -343,9 +333,7 @@ def create_vpc(self): # Create a NAT Gateway - nat_response = self.ec2.create_nat_gateway( - AllocationId=self.nat_ip_allocation, SubnetId=self.pub_vpc_subnet_id - ) + nat_response = self.ec2.create_nat_gateway(AllocationId=self.nat_ip_allocation, SubnetId=self.pub_vpc_subnet_id) self.nat_gateway_id = nat_response["NatGateway"]["NatGatewayId"] @@ -373,14 +361,10 @@ def create_vpc(self): # Associate the private route to the private subnet - self.ec2.associate_route_table( - RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_1 - ) + self.ec2.associate_route_table(RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_1) logger.info("Route table associated with subnet.") - self.ec2.associate_route_table( - RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_2 - ) + self.ec2.associate_route_table(RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_2) logger.info("Route table associated with subnet.") # Associate the NAT gateway with the private route @@ -439,9 +423,7 @@ def create_batch_service_roles(self): self.batch_service_role_name, "batch", f"Service role for Batch environment {self.job_identifier}", - managed_policie_arns=[ - "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole" - ], + managed_policie_arns=["arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole"], ) # Instance Role for Batch compute environment @@ -450,17 +432,13 @@ def create_batch_service_roles(self): self.batch_instance_role_name, "ec2", f"Instance role for Batch compute environment {self.job_identifier}", - managed_policie_arns=[ - "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role" - ], + managed_policie_arns=["arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"], ) # Instance Profile try: - response = self.iam.create_instance_profile( - InstanceProfileName=self.batch_instance_profile_name - ) + response = self.iam.create_instance_profile(InstanceProfileName=self.batch_instance_profile_name) self.instance_profile_arn = response["InstanceProfile"]["Arn"] @@ -474,9 +452,7 @@ def create_batch_service_roles(self): except Exception as e: if "EntityAlreadyExists" in str(e): logger.info("ECS Instance Profile not created - already exists") - response = self.iam.get_instance_profile( - InstanceProfileName=self.batch_instance_profile_name - ) + response = self.iam.get_instance_profile(InstanceProfileName=self.batch_instance_profile_name) self.instance_profile_arn = response["InstanceProfile"]["Arn"] # ECS Task Policy @@ -581,9 +557,7 @@ def create_batch_service_roles(self): self.batch_spot_service_role_name, "spotfleet", f"Spot Fleet role for Batch compute environment {self.job_identifier}", - managed_policie_arns=[ - "arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole" - ], + managed_policie_arns=["arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole"], ) def create_compute_environment(self, maxCPUs=10000): @@ -608,18 +582,13 @@ def create_compute_environment(self, maxCPUs=10000): }, ) except ClientError as error: - if ( - error.response["Error"]["Code"] - == "InvalidLaunchTemplateName.AlreadyExistsException" 
- ): + if error.response["Error"]["Code"] == "InvalidLaunchTemplateName.AlreadyExistsException": logger.debug("Launch template exists, skipping creation") else: raise error while True: - lt_resp = self.ec2.describe_launch_templates( - LaunchTemplateNames=[self.launch_template_name] - ) + lt_resp = self.ec2.describe_launch_templates(LaunchTemplateNames=[self.launch_template_name]) launch_templates = lt_resp["LaunchTemplates"] next_token = lt_resp.get("NextToken") while next_token: @@ -630,13 +599,9 @@ def create_compute_environment(self, maxCPUs=10000): launch_templates.extend(lt_resp["LaunchTemplates"]) next_token = lt_resp.get("NextToken") n_launch_templates = len(launch_templates) - assert ( - n_launch_templates <= 1 - ), f"There are {n_launch_templates} launch templates, this shouldn't happen." + assert n_launch_templates <= 1, f"There are {n_launch_templates} launch templates, this shouldn't happen." if n_launch_templates == 0: - logger.debug( - f"Waiting for the launch template {self.launch_template_name} to be created" - ) + logger.debug(f"Waiting for the launch template {self.launch_template_name} to be created") time.sleep(5) if n_launch_templates == 1: break @@ -668,9 +633,7 @@ def create_compute_environment(self, maxCPUs=10000): else: compute_resources["type"] = "EC2" - compute_resources["tags"] = self.get_tags( - Name=f"{self.job_identifier} batch instance" - ) + compute_resources["tags"] = self.get_tags(Name=f"{self.job_identifier} batch instance") self.batch.create_compute_environment( computeEnvironmentName=self.batch_compute_environment_name, @@ -681,15 +644,11 @@ def create_compute_environment(self, maxCPUs=10000): tags=self.get_tags(), ) - logger.info( - f"Compute environment {self.batch_compute_environment_name} created." - ) + logger.info(f"Compute environment {self.batch_compute_environment_name} created.") except Exception as e: if "Object already exists" in str(e): - logger.info( - f"Compute environment {self.batch_compute_environment_name} not created - already exists" - ) + logger.info(f"Compute environment {self.batch_compute_environment_name} not created - already exists") else: raise @@ -721,9 +680,7 @@ def create_job_queue(self): except Exception as e: if "Object already exists" in str(e): - logger.info( - f"Job queue {self.batch_job_queue_name} not created - already exists" - ) + logger.info(f"Job queue {self.batch_job_queue_name} not created - already exists") response = self.batch.describe_job_queues( jobQueues=[ self.batch_job_queue_name, @@ -734,10 +691,7 @@ def create_job_queue(self): elif "is not valid" in str(e): # Need to wait a second for the compute environment to complete registration - logger.warning( - "wating a few seconds for compute environment creation: " - + str(e) - ) + logger.warning("wating a few seconds for compute environment creation: " + str(e)) time.sleep(5) else: @@ -793,10 +747,7 @@ def submit_job(self, array_size=4): except Exception as e: if "not in VALID state" in str(e): # Need to wait a second for the compute environment to complete registration - logger.warning( - "5 second sleep initiated to wait for job queue creation due to error: " - + str(e) - ) + logger.warning("5 second sleep initiated to wait for job queue creation due to error: " + str(e)) time.sleep(5) else: raise @@ -836,35 +787,25 @@ def clean(self): default_group_id = group["GroupId"] dsg = self.ec2r.SecurityGroup(default_group_id) if len(dsg.ip_permissions_egress): - response = dsg.revoke_egress( - IpPermissions=dsg.ip_permissions_egress - ) + response = 
dsg.revoke_egress(IpPermissions=dsg.ip_permissions_egress) try: - self.batch.update_job_queue( - jobQueue=self.batch_job_queue_name, state="DISABLED" - ) + self.batch.update_job_queue(jobQueue=self.batch_job_queue_name, state="DISABLED") while True: try: - response = self.batch.delete_job_queue( - jobQueue=self.batch_job_queue_name - ) + response = self.batch.delete_job_queue(jobQueue=self.batch_job_queue_name) logger.info(f"Job queue {self.batch_job_queue_name} deleted.") break except Exception as e: if "Cannot delete, resource is being modified" in str(e): - logger.info( - "Job queue being modified - sleeping until ready..." - ) + logger.info("Job queue being modified - sleeping until ready...") time.sleep(5) else: raise except Exception as e: if "does not exist" in str(e): - logger.info( - f"Job queue {self.batch_job_queue_name} missing, skipping..." - ) + logger.info(f"Job queue {self.batch_job_queue_name} missing, skipping...") # Delete compute enviornment @@ -878,38 +819,26 @@ def clean(self): response = self.batch.delete_compute_environment( computeEnvironment=self.batch_compute_environment_name ) - logger.info( - f"Compute environment {self.batch_compute_environment_name} deleted." - ) + logger.info(f"Compute environment {self.batch_compute_environment_name} deleted.") break except Exception as e: - if "Cannot delete, resource is being modified" in str( - e - ) or "found existing JobQueue" in str(e): - logger.info( - "Compute environment being modified - sleeping until ready..." - ) + if "Cannot delete, resource is being modified" in str(e) or "found existing JobQueue" in str(e): + logger.info("Compute environment being modified - sleeping until ready...") time.sleep(5) else: raise except Exception as e: if "does not exist" in str(e): - logger.info( - f"Compute environment {self.batch_compute_environment_name} missing, skipping..." - ) + logger.info(f"Compute environment {self.batch_compute_environment_name} missing, skipping...") else: raise # Delete Launch Template try: - self.ec2.delete_launch_template( - LaunchTemplateName=self.launch_template_name - ) + self.ec2.delete_launch_template(LaunchTemplateName=self.launch_template_name) except Exception as e: if "does not exist" in str(e): - logger.info( - f"Launch template {self.launch_template_name} does not exist, skipping..." 
- ) + logger.info(f"Launch template {self.launch_template_name} does not exist, skipping...") else: raise @@ -917,9 +846,7 @@ def clean(self): self.iam_helper.delete_role(self.batch_spot_service_role_name) self.iam_helper.delete_role(self.batch_ecs_task_role_name) # Instance profile order of removal - self.iam_helper.remove_role_from_instance_profile( - self.batch_instance_profile_name - ) + self.iam_helper.remove_role_from_instance_profile(self.batch_instance_profile_name) self.iam_helper.delete_role(self.batch_instance_role_name) self.iam_helper.delete_instance_profile(self.batch_instance_profile_name) @@ -939,9 +866,7 @@ def clean(self): for vpc in response["Vpcs"]: this_vpc = vpc["VpcId"] - s3gw_response = self.ec2.describe_vpc_endpoints( - Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] - ) + s3gw_response = self.ec2.describe_vpc_endpoints(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) for s3gw in s3gw_response["VpcEndpoints"]: this_s3gw = s3gw["VpcEndpointId"] @@ -949,9 +874,7 @@ def clean(self): if s3gw["State"] != "deleted": self.ec2.delete_vpc_endpoints(VpcEndpointIds=[this_s3gw]) - ng_response = self.ec2.describe_nat_gateways( - Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] - ) + ng_response = self.ec2.describe_nat_gateways(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) for natgw in ng_response["NatGateways"]: this_natgw = natgw["NatGatewayId"] @@ -959,9 +882,7 @@ def clean(self): if natgw["State"] != "deleted": self.ec2.delete_nat_gateway(NatGatewayId=this_natgw) - rtas_response = self.ec2.describe_route_tables( - Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] - ) + rtas_response = self.ec2.describe_route_tables(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) for route_table in rtas_response["RouteTables"]: route_table_id = route_table["RouteTableId"] @@ -973,9 +894,7 @@ def clean(self): rt_counter = 10 while rt_counter: try: - response = self.ec2.delete_route_table( - RouteTableId=route_table_id - ) + response = self.ec2.delete_route_table(RouteTableId=route_table_id) logger.info("Route table removed.") break except Exception as e: @@ -999,20 +918,14 @@ def clean(self): try: try: self.ec2.detach_internet_gateway( - InternetGatewayId=internet_gateway[ - "InternetGatewayId" - ], + InternetGatewayId=internet_gateway["InternetGatewayId"], VpcId=attachment["VpcId"], ) except Exception as e: - logger.info( - f"Error on Internet Gateway disassociation - ignoring... {str(e)}" - ) + logger.info(f"Error on Internet Gateway disassociation - ignoring... {str(e)}") self.ec2.delete_internet_gateway( - InternetGatewayId=internet_gateway[ - "InternetGatewayId" - ] + InternetGatewayId=internet_gateway["InternetGatewayId"] ) logger.info("Internet Gateway deleted.") break @@ -1026,9 +939,7 @@ def clean(self): else: raise - subn_response = self.ec2.describe_subnets( - Filters=[{"Name": "vpc-id", "Values": [this_vpc]}] - ) + subn_response = self.ec2.describe_subnets(Filters=[{"Name": "vpc-id", "Values": [this_vpc]}]) for subnet in subn_response["Subnets"]: while True: @@ -1037,9 +948,7 @@ def clean(self): break except Exception as e: if "DependencyViolation" in str(e): - logger.info( - "Subnet cannot be deleted as dependencies are still being deleted. Sleeping..." - ) + logger.info("Subnet cannot be deleted as dependencies are still being deleted. 
Sleeping...") time.sleep(10) else: raise @@ -1075,15 +984,11 @@ class AwsBatch(DockerBatchBase): def __init__(self, project_filename): super().__init__(project_filename) - self.job_identifier = re.sub( - "[^0-9a-zA-Z]+", "_", self.cfg["aws"]["job_identifier"] - )[:10] + self.job_identifier = re.sub("[^0-9a-zA-Z]+", "_", self.cfg["aws"]["job_identifier"])[:10] self.project_filename = project_filename self.region = self.cfg["aws"]["region"] - self.ecr = boto3.client( - "ecr", region_name=self.region, config=boto_client_config - ) + self.ecr = boto3.client("ecr", region_name=self.region, config=boto_client_config) self.s3 = boto3.client("s3", region_name=self.region, config=boto_client_config) self.s3_bucket = self.cfg["aws"]["s3"]["bucket"] self.s3_bucket_prefix = self.cfg["aws"]["s3"]["prefix"].rstrip("/") @@ -1095,9 +1000,7 @@ def __init__(self, project_filename): def validate_dask_settings(project_file): cfg = get_project_configuration(project_file) if "emr" in cfg["aws"]: - logger.warning( - "The `aws.emr` configuration is no longer used and is ignored. Recommend removing." - ) + logger.warning("The `aws.emr` configuration is no longer used and is ignored. Recommend removing.") dask_cfg = cfg["aws"]["dask"] errors = [] mem_rules = { @@ -1110,22 +1013,16 @@ def validate_dask_settings(project_file): for node_type in ("scheduler", "worker"): mem = dask_cfg.get(f"{node_type}_memory", 8 * 1024) if mem % 1024 != 0: - errors.append( - f"`aws.dask.{node_type}_memory` = {mem}, needs to be a multiple of 1024." - ) + errors.append(f"`aws.dask.{node_type}_memory` = {mem}, needs to be a multiple of 1024.") mem_gb = mem // 1024 - min_gb, max_gb, incr_gb = mem_rules[ - dask_cfg.get(f"{node_type}_cpu", 2 * 1024) - ] + min_gb, max_gb, incr_gb = mem_rules[dask_cfg.get(f"{node_type}_cpu", 2 * 1024)] if not (min_gb <= mem_gb <= max_gb and (mem_gb - min_gb) % incr_gb == 0): errors.append( f"`aws.dask.{node_type}_memory` = {mem}, " f"should be between {min_gb * 1024} and {max_gb * 1024} in a multiple of {incr_gb * 1024}." 
) if errors: - errors.append( - "See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html" - ) + errors.append("See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html") raise ValidationError("\n".join(errors)) return True @@ -1174,32 +1071,22 @@ def build_image(self): """ root_path = pathlib.Path(os.path.abspath(__file__)).parent.parent.parent if not (root_path / "Dockerfile").exists(): - raise RuntimeError( - f"The needs to be run from the root of the repo, found {root_path}" - ) + raise RuntimeError(f"The needs to be run from the root of the repo, found {root_path}") # Make the buildstock/resources/.aws_docker_image dir to store logs - local_log_dir = os.path.join( - self.buildstock_dir, "resources", ".aws_docker_image" - ) + local_log_dir = os.path.join(self.buildstock_dir, "resources", ".aws_docker_image") if not os.path.exists(local_log_dir): os.makedirs(local_log_dir) # Determine whether or not to build the image with custom gems bundled in if self.cfg.get("baseline", dict()).get("custom_gems", False): # Ensure the custom Gemfile exists in the buildstock dir - local_gemfile_path = os.path.join( - self.buildstock_dir, "resources", "Gemfile" - ) + local_gemfile_path = os.path.join(self.buildstock_dir, "resources", "Gemfile") if not os.path.exists(local_gemfile_path): - raise AttributeError( - f"baseline:custom_gems = True, but did not find Gemfile at {local_gemfile_path}" - ) + raise AttributeError(f"baseline:custom_gems = True, but did not find Gemfile at {local_gemfile_path}") # Copy the custom Gemfile into the buildstockbatch repo - bsb_root = os.path.join( - os.path.abspath(__file__), os.pardir, os.pardir, os.pardir - ) + bsb_root = os.path.join(os.path.abspath(__file__), os.pardir, os.pardir, os.pardir) new_gemfile_path = os.path.join(bsb_root, "Gemfile") shutil.copyfile(local_gemfile_path, new_gemfile_path) logger.info(f"Copying custom Gemfile from {local_gemfile_path}") @@ -1212,9 +1099,7 @@ def build_image(self): # which stops before bundling custom gems into the image stage = "buildstockbatch" - logger.info( - f"Building docker image stage: {stage} from OpenStudio {self.os_version}" - ) + logger.info(f"Building docker image stage: {stage} from OpenStudio {self.os_version}") img, build_logs = self.docker_client.images.build( path=str(root_path), tag=self.docker_image, @@ -1266,22 +1151,16 @@ def push_image(self): """ auth_token = self.ecr.get_authorization_token() dkr_user, dkr_pass = ( - base64.b64decode(auth_token["authorizationData"][0]["authorizationToken"]) - .decode("ascii") - .split(":") + base64.b64decode(auth_token["authorizationData"][0]["authorizationToken"]).decode("ascii").split(":") ) repo_url = self.container_repo["repositoryUri"] registry_url = "https://" + repo_url.split("/")[0] - resp = self.docker_client.login( - username=dkr_user, password=dkr_pass, registry=registry_url - ) + resp = self.docker_client.login(username=dkr_user, password=dkr_pass, registry=registry_url) logger.debug(resp) image = self.docker_client.images.get(self.docker_image) image.tag(repo_url, tag=self.job_identifier) last_status = None - for x in self.docker_client.images.push( - repo_url, tag=self.job_identifier, stream=True - ): + for x in self.docker_client.images.push(repo_url, tag=self.job_identifier, stream=True): try: y = json.loads(x) except json.JSONDecodeError: @@ -1298,9 +1177,7 @@ def clean(self): """ logger.info("Beginning cleanup of AWS resources...") - batch_env = AwsBatchEnv( - self.job_identifier, 
self.cfg["aws"], self.boto3_session - ) + batch_env = AwsBatchEnv(self.job_identifier, self.cfg["aws"], self.boto3_session) batch_env.clean() def upload_batch_files_to_cloud(self, tmppath): @@ -1337,9 +1214,7 @@ def start_batch_job(self, batch_info): ) # Define the batch environment - batch_env = AwsBatchEnv( - self.job_identifier, self.cfg["aws"], self.boto3_session - ) + batch_env = AwsBatchEnv(self.job_identifier, self.cfg["aws"], self.boto3_session) logger.info( "Launching Batch environment - (resource configs will not be updated on subsequent executions, but new job revisions will be created):" # noqa 501 ) @@ -1371,18 +1246,14 @@ def start_batch_job(self, batch_info): # Monitor job status n_succeeded_last_time = 0 - with tqdm.tqdm( - desc="Running Simulations", total=self.batch_array_size - ) as progress_bar: + with tqdm.tqdm(desc="Running Simulations", total=self.batch_array_size) as progress_bar: job_status = None while job_status not in ("SUCCEEDED", "FAILED"): time.sleep(10) job_desc_resp = batch_env.batch.describe_jobs(jobs=[job_info["jobId"]]) job_status = job_desc_resp["jobs"][0]["status"] - jobs_resp = batch_env.batch.list_jobs( - arrayJobId=job_info["jobId"], jobStatus="SUCCEEDED" - ) + jobs_resp = batch_env.batch.list_jobs(arrayJobId=job_info["jobId"], jobStatus="SUCCEEDED") n_succeeded = len(jobs_resp["jobSummaryList"]) next_token = jobs_resp.get("nextToken") while next_token is not None: @@ -1431,9 +1302,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): jobs_file_path = sim_dir.parent / "jobs.tar.gz" s3.download_file(bucket, f"{prefix}/jobs.tar.gz", str(jobs_file_path)) with tarfile.open(jobs_file_path, "r") as tar_f: - jobs_d = json.load( - tar_f.extractfile(f"jobs/job{job_id:05d}.json"), encoding="utf-8" - ) + jobs_d = json.load(tar_f.extractfile(f"jobs/job{job_id:05d}.json"), encoding="utf-8") logger.debug("Number of simulations = {}".format(len(jobs_d["batch"]))) logger.debug("Getting weather files") @@ -1441,9 +1310,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): os.makedirs(weather_dir, exist_ok=True) # Make a lookup of which parameter points to the weather file from options_lookup.tsv - with open( - sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8" - ) as f: + with open(sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8") as f: tsv_reader = csv.reader(f, delimiter="\t") next(tsv_reader) # skip headers param_name = None @@ -1455,9 +1322,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): raise RuntimeError( f"The epw files are specified in options_lookup.tsv under more than one parameter type: {param_name}, {row[0]}" ) # noqa: E501 - epw_filename = ( - row[row_has_epw.index(True) + 2].split("=")[1].split("/")[-1] - ) + epw_filename = row[row_has_epw.index(True) + 2].split("=")[1].split("/")[-1] param_name = row[0] option_name = row[1] epws_by_option[option_name] = epw_filename @@ -1484,9 +1349,7 @@ def run_job(cls, job_id, bucket, prefix, job_name, region): logger.debug("Extracting {}".format(epw_filename)) f_out.write(gzip.decompress(f_gz.getvalue())) - cls.run_simulations( - cfg, job_id, jobs_d, sim_dir, S3FileSystem(), f"{bucket}/{prefix}" - ) + cls.run_simulations(cfg, job_id, jobs_d, sim_dir, S3FileSystem(), f"{bucket}/{prefix}") def get_fs(self): return S3FileSystem() @@ -1494,9 +1357,7 @@ def get_fs(self): def get_dask_client(self): dask_cfg = self.cfg["aws"]["dask"] - batch_env = AwsBatchEnv( - self.job_identifier, self.cfg["aws"], self.boto3_session - ) + 
batch_env = AwsBatchEnv(self.job_identifier, self.cfg["aws"], self.boto3_session) m = 1024 self.dask_cluster = FargateCluster( region_name=self.region, @@ -1530,9 +1391,7 @@ def process_results(self, *args, **kwargs): cfg = deepcopy(self.cfg) container_buildstock_dir = str(container_workpath / "buildstock") cfg["buildstock_directory"] = container_buildstock_dir - cfg["project_directory"] = str( - pathlib.Path(self.project_dir).relative_to(self.buildstock_dir) - ) + cfg["project_directory"] = str(pathlib.Path(self.project_dir).relative_to(self.buildstock_dir)) with open(tmppath / "project_config.yml", "w") as f: f.write(yaml.dump(cfg, Dumper=yaml.SafeDumper)) @@ -1556,10 +1415,7 @@ def process_results(self, *args, **kwargs): ["python3", "-m", "buildstockbatch.aws.aws", container_cfg_path], volumes={ tmpdir: {"bind": str(container_workpath), "mode": "rw"}, - self.buildstock_dir: { - "bind": container_buildstock_dir, - "mode": "ro", - }, + self.buildstock_dir: {"bind": container_buildstock_dir, "mode": "ro"}, }, environment=env, name="bsb_post", diff --git a/buildstockbatch/aws/awsbase.py b/buildstockbatch/aws/awsbase.py index 787edbff..7ecbf097 100644 --- a/buildstockbatch/aws/awsbase.py +++ b/buildstockbatch/aws/awsbase.py @@ -67,9 +67,7 @@ def role_stitcher( p_counter = p_counter + 1 for managed_policy_arn in managed_policie_arns: - response = self.iam.attach_role_policy( - PolicyArn=managed_policy_arn, RoleName=role_name - ) + response = self.iam.attach_role_policy(PolicyArn=managed_policy_arn, RoleName=role_name) logger.info(f"Role {role_name} created") @@ -100,9 +98,7 @@ def delete_role(self, role_name): response = self.iam.list_attached_role_policies(RoleName=role_name) for policy in response["AttachedPolicies"]: - self.iam.detach_role_policy( - RoleName=role_name, PolicyArn=policy["PolicyArn"] - ) + self.iam.detach_role_policy(RoleName=role_name, PolicyArn=policy["PolicyArn"]) logger.info(f"Policies detached from role {role_name}.") @@ -120,17 +116,13 @@ def delete_instance_profile(self, instance_profile_name): logger.info(f"Instance profile {instance_profile_name} deleted.") except Exception as e: if "NoSuchEntity" in str(e): - logger.info( - f"Instance profile {instance_profile_name} missing, skipping..." - ) + logger.info(f"Instance profile {instance_profile_name} missing, skipping...") else: raise def remove_role_from_instance_profile(self, instance_profile_name): try: - response = self.iam.get_instance_profile( - InstanceProfileName=instance_profile_name - ) + response = self.iam.get_instance_profile(InstanceProfileName=instance_profile_name) for role in response["InstanceProfile"]["Roles"]: response = self.iam.remove_role_from_instance_profile( @@ -139,9 +131,7 @@ def remove_role_from_instance_profile(self, instance_profile_name): logger.info(f"Roles removed from instance profile {instance_profile_name}") except Exception as e: if "NoSuchEntity" in str(e): - logger.info( - f"Instance profile {instance_profile_name} does not exist. Skipping..." - ) + logger.info(f"Instance profile {instance_profile_name} does not exist. 
Skipping...") else: raise @@ -156,11 +146,7 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.iam = self.iam_helper.iam self.s3 = self.session.client("s3", config=boto_client_config) self.job_identifier = job_identifier - self.account = ( - self.session.client("sts", config=boto_client_config) - .get_caller_identity() - .get("Account") - ) + self.account = self.session.client("sts", config=boto_client_config).get_caller_identity().get("Account") self.region = aws_config["region"] self.operator_email = aws_config["notifications_email"] @@ -168,12 +154,8 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.s3_bucket = aws_config["s3"]["bucket"] self.s3_bucket_arn = f"arn:aws:s3:::{self.s3_bucket}" self.s3_bucket_prefix = aws_config["s3"]["prefix"].rstrip("/") - self.s3_lambda_code_emr_cluster_key = ( - f"{self.s3_bucket_prefix}/lambda_functions/emr_function.py.zip" - ) - self.s3_lambda_emr_config_key = ( - f"{self.s3_bucket_prefix}/lambda_functions/emr_config.json" - ) + self.s3_lambda_code_emr_cluster_key = f"{self.s3_bucket_prefix}/lambda_functions/emr_function.py.zip" + self.s3_lambda_emr_config_key = f"{self.s3_bucket_prefix}/lambda_functions/emr_config.json" self.s3_emr_folder_name = "emr" # Batch @@ -182,9 +164,7 @@ def __init__(self, job_identifier, aws_config, boto3_session): self.batch_job_queue_name = f"job_queue_{self.job_identifier}" self.batch_service_role_name = f"batch_service_role_{self.job_identifier}" self.batch_instance_role_name = f"batch_instance_role_{self.job_identifier}" - self.batch_instance_profile_name = ( - f"batch_instance_profile_{self.job_identifier}" - ) + self.batch_instance_profile_name = f"batch_instance_profile_{self.job_identifier}" self.batch_spot_service_role_name = f"spot_fleet_role_{self.job_identifier}" self.batch_ecs_task_role_name = f"ecs_task_role_{self.job_identifier}" self.batch_task_policy_name = f"ecs_task_policy_{self.job_identifier}" diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index edafa73f..b6ded0a6 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -62,37 +62,26 @@ def __init__(self, project_filename): self.buildstock_dir = self.cfg["buildstock_directory"] if not os.path.isdir(self.buildstock_dir): - raise FileNotFoundError( - f"buildstock_directory = {self.buildstock_dir} is not a directory." - ) - self.project_dir = os.path.join( - self.buildstock_dir, self.cfg["project_directory"] - ) + raise FileNotFoundError(f"buildstock_directory = {self.buildstock_dir} is not a directory.") + self.project_dir = os.path.join(self.buildstock_dir, self.cfg["project_directory"]) if not os.path.isdir(self.project_dir): - raise FileNotFoundError( - f"project_directory = {self.project_dir} is not a directory." - ) + raise FileNotFoundError(f"project_directory = {self.project_dir} is not a directory.") # Load in OS_VERSION and OS_SHA arguments if they exist in the YAML, # otherwise use defaults specified here. 
self.os_version = self.cfg.get("os_version", self.DEFAULT_OS_VERSION) self.os_sha = self.cfg.get("os_sha", self.DEFAULT_OS_SHA) - logger.debug( - f"Using OpenStudio version: {self.os_version} with SHA: {self.os_sha}" - ) + logger.debug(f"Using OpenStudio version: {self.os_version} with SHA: {self.os_sha}") @staticmethod def get_sampler_class(sampler_name): - sampler_class_name = ( - "".join(x.capitalize() for x in sampler_name.strip().split("_")) + "Sampler" - ) + sampler_class_name = "".join(x.capitalize() for x in sampler_name.strip().split("_")) + "Sampler" return getattr(sampler, sampler_class_name) @staticmethod def get_workflow_generator_class(workflow_generator_name): workflow_generator_class_name = ( - "".join(x.capitalize() for x in workflow_generator_name.strip().split("_")) - + "WorkflowGenerator" + "".join(x.capitalize() for x in workflow_generator_name.strip().split("_")) + "WorkflowGenerator" ) return getattr(workflow_generator, workflow_generator_class_name) @@ -125,9 +114,7 @@ def _get_weather_files(self): f.write(chunk) f.seek(0) with zipfile.ZipFile(f, "r") as zf: - logger.debug( - "Extracting weather files to: {}".format(self.weather_dir) - ) + logger.debug("Extracting weather files to: {}".format(self.weather_dir)) zf.extractall(self.weather_dir) @property @@ -149,12 +136,8 @@ def skip_baseline_sims(self): @classmethod def get_reporting_measures(cls, cfg): - WorkflowGenerator = cls.get_workflow_generator_class( - cfg["workflow_generator"]["type"] - ) - wg = WorkflowGenerator( - cfg, 1 - ) # Number of datapoints doesn't really matter here + WorkflowGenerator = cls.get_workflow_generator_class(cfg["workflow_generator"]["type"]) + wg = WorkflowGenerator(cfg, 1) # Number of datapoints doesn't really matter here return wg.reporting_measures() def run_batch(self): @@ -162,9 +145,7 @@ def run_batch(self): @classmethod def create_osw(cls, cfg, n_datapoints, *args, **kwargs): - WorkflowGenerator = cls.get_workflow_generator_class( - cfg["workflow_generator"]["type"] - ) + WorkflowGenerator = cls.get_workflow_generator_class(cfg["workflow_generator"]["type"]) osw_generator = WorkflowGenerator(cfg, n_datapoints) return osw_generator.create_osw(*args, **kwargs) @@ -187,9 +168,7 @@ def make_sim_dir(building_id, upgrade_idx, base_dir, overwrite_existing=False): sim_dir, ) elif os.path.exists(os.path.join(sim_dir, "run", "failed.job")): - raise SimulationExists( - "{} exists and failed".format(sim_id), sim_id, sim_dir - ) + raise SimulationExists("{} exists and failed".format(sim_id), sim_id, sim_dir) else: shutil.rmtree(sim_dir) @@ -235,21 +214,13 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id): if os.path.isfile(timeseries_filepath): # Find the time columns present in the enduse_timeseries file possible_time_cols = ["time", "Time", "TimeDST", "TimeUTC"] - cols = read_csv( - timeseries_filepath, index_col=False, nrows=0 - ).columns.tolist() + cols = read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist() actual_time_cols = [c for c in cols if c in possible_time_cols] if not actual_time_cols: - logger.error( - f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}." - ) - raise RuntimeError( - f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}." 
- ) + logger.error(f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.") + raise RuntimeError(f"Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.") - tsdf = read_csv( - timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows - ) + tsdf = read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows) for col in actual_time_cols: tsdf[col] = tsdf[col].astype(pd.ArrowDtype(pa.timestamp("s"))) if os.path.isfile(schedules_filepath): @@ -319,9 +290,7 @@ def get_buildstock_dir(project_file, cfg): if os.path.isabs(buildstock_dir): return os.path.abspath(buildstock_dir) else: - return os.path.abspath( - os.path.join(os.path.dirname(project_file), buildstock_dir) - ) + return os.path.abspath(os.path.join(os.path.dirname(project_file), buildstock_dir)) @classmethod def validate_openstudio_path(cls, project_file): @@ -337,14 +306,10 @@ def validate_openstudio_path(cls, project_file): except FileNotFoundError: raise ValidationError(f"Cannot find openstudio at `{cls.openstudio_exe()}`") if proc_out.returncode != 0: - raise ValidationError( - f"OpenStudio failed with the following error {proc_out.stderr}" - ) + raise ValidationError(f"OpenStudio failed with the following error {proc_out.stderr}") actual_os_version, actual_os_sha = proc_out.stdout.strip().split("+") if os_version != actual_os_version: - raise ValidationError( - f"OpenStudio version is {actual_os_version}, expected is {os_version}" - ) + raise ValidationError(f"OpenStudio version is {actual_os_version}, expected is {os_version}") if os_sha != actual_os_sha: raise ValidationError( f"OpenStudio version is correct at {os_version}, but the shas don't match. " @@ -369,9 +334,7 @@ def validate_sampler(project_file): else: sample_file = os.path.abspath(sample_file) buildstock_df = read_csv(sample_file, dtype=str) - return BuildStockBatchBase.validate_buildstock_csv( - project_file, buildstock_df - ) + return BuildStockBatchBase.validate_buildstock_csv(project_file, buildstock_df) return True @staticmethod @@ -384,9 +347,7 @@ def validate_buildstock_csv(project_file, buildstock_df): if column in {"Building"}: continue if column not in param_option_dict: - errors.append( - f"Column {column} in buildstock_csv is not available in options_lookup.tsv" - ) + errors.append(f"Column {column} in buildstock_csv is not available in options_lookup.tsv") continue if "*" in param_option_dict[column]: continue # skip validating options when wildcard is present @@ -404,22 +365,16 @@ def validate_buildstock_csv(project_file, buildstock_df): @classmethod def validate_workflow_generator(cls, project_file): cfg = get_project_configuration(project_file) - WorkflowGenerator = cls.get_workflow_generator_class( - cfg["workflow_generator"]["type"] - ) + WorkflowGenerator = cls.get_workflow_generator_class(cfg["workflow_generator"]["type"]) return WorkflowGenerator.validate(cfg) @staticmethod def validate_project_schema(project_file): cfg = get_project_configuration(project_file) schema_version = cfg.get("schema_version") - version_schema = os.path.join( - os.path.dirname(__file__), "schemas", f"v{schema_version}.yaml" - ) + version_schema = os.path.join(os.path.dirname(__file__), "schemas", f"v{schema_version}.yaml") if not os.path.isfile(version_schema): - logger.error( - f"Could not find validation schema for YAML version {schema_version}" - ) + logger.error(f"Could not find validation schema for YAML version {schema_version}") raise FileNotFoundError(version_schema) schema = 
yamale.make_schema(version_schema) data = yamale.make_data(project_file, parser="ruamel") @@ -439,9 +394,7 @@ def validate_postprocessing_spec(project_file): partition_cols = cfg.get("postprocessing", {}).get("partition_columns", []) invalid_cols = [c for c in partition_cols if c not in param_option_dict.keys()] if invalid_cols: - raise ValidationError( - f"The following partition columns are not valid: {invalid_cols}" - ) + raise ValidationError(f"The following partition columns are not valid: {invalid_cols}") return True @staticmethod @@ -451,12 +404,8 @@ def validate_xor_nor_schema_keys(project_file): if int(major) >= 0: if int(minor) >= 0: # xor - if ("weather_files_url" in cfg.keys()) is ( - "weather_files_path" in cfg.keys() - ): - raise ValidationError( - "Both/neither weather_files_url and weather_files_path found in yaml root" - ) + if ("weather_files_url" in cfg.keys()) is ("weather_files_path" in cfg.keys()): + raise ValidationError("Both/neither weather_files_url and weather_files_path found in yaml root") return True @@ -471,9 +420,7 @@ def get_param_option_dict(project_file): try: with open(options_lookup_path, "r") as f: options = csv.DictReader(f, delimiter="\t") - invalid_options_lookup_str = ( - "" # Holds option/parameter names with invalid characters - ) + invalid_options_lookup_str = "" # Holds option/parameter names with invalid characters for row in options: for col in ["Parameter Name", "Option Name"]: invalid_chars = set(row[col]).intersection(set("|&()")) @@ -483,16 +430,9 @@ def get_param_option_dict(project_file): param_name, opt_name = row["Parameter Name"], row["Option Name"] param_option_dict[row["Parameter Name"]].add(row["Option Name"]) if opt_name == "*" and row["Measure Dir"]: - invalid_options_lookup_str += ( - f"{param_name}: '*' cannot pass arguments to measure.\n" - ) - if ( - "*" in param_option_dict[param_name] - and len(param_option_dict[param_name]) > 1 - ): - invalid_options_lookup_str += ( - f"{param_name}: '*' cannot be mixed with other options\n" - ) + invalid_options_lookup_str += f"{param_name}: '*' cannot pass arguments to measure.\n" + if "*" in param_option_dict[param_name] and len(param_option_dict[param_name]) > 1: + invalid_options_lookup_str += f"{param_name}: '*' cannot be mixed with other options\n" except FileNotFoundError as err: logger.error(f"Options lookup file not found at: '{options_lookup_path}'") raise err @@ -522,9 +462,7 @@ def get_errors(source_str, option_str): if not returns error message, close matches, and specifies where the error occurred (source_str) """ if "||" in option_str and "&&" in option_str: - invalid_option_spec_counter[ - (option_str, "has both || and && (not supported)") - ] += 1 + invalid_option_spec_counter[(option_str, "has both || and && (not supported)")] += 1 return "" if "||" in option_str or "&&" in option_str: @@ -532,9 +470,7 @@ def get_errors(source_str, option_str): errors = "" broken_options = option_str.split(splitter) if broken_options[-1] == "": - invalid_option_spec_counter[ - (option_str, "has trailing 'splitter'") - ] += 1 + invalid_option_spec_counter[(option_str, "has trailing 'splitter'")] += 1 return "" for broken_option_str in broken_options: new_source_str = source_str + f" in composite option '{option_str}'" @@ -556,21 +492,15 @@ def get_errors(source_str, option_str): return "" if parameter_name not in param_option_dict: - close_match = difflib.get_close_matches( - parameter_name, param_option_dict.keys(), 1 - ) + close_match = difflib.get_close_matches(parameter_name, 
param_option_dict.keys(), 1) close_match = close_match[0] if close_match else "" invalid_param_counter[(parameter_name, close_match)] += 1 return "" if not option_name or option_name not in param_option_dict[parameter_name]: - close_match = difflib.get_close_matches( - option_name, list(param_option_dict[parameter_name]), 1 - ) + close_match = difflib.get_close_matches(option_name, list(param_option_dict[parameter_name]), 1) close_match = close_match[0] if close_match else "" - invalid_option_counter_dict[parameter_name][ - (option_name, close_match) - ] += 1 + invalid_option_counter_dict[parameter_name][(option_name, close_match)] += 1 return "" return "" @@ -590,62 +520,38 @@ def get_all_option_str(source_str, inp): return [(source_str, inp)] elif type(inp) == list: return sum( - [ - get_all_option_str(source_str + f", in entry {count}", entry) - for count, entry in enumerate(inp) - ], + [get_all_option_str(source_str + f", in entry {count}", entry) for count, entry in enumerate(inp)], [], ) elif type(inp) == dict: if len(inp) > 1: - raise ValidationError( - f"{source_str} the logic is malformed. Dict can't have more than one entry" - ) + raise ValidationError(f"{source_str} the logic is malformed. Dict can't have more than one entry") source_str += f", in {list(inp.keys())[0]}" - return sum( - [get_all_option_str(source_str, i) for i in inp.values()], [] - ) + return sum([get_all_option_str(source_str, i) for i in inp.values()], []) # store all of the option_str in the project file as a list of (source_str, option_str) tuple source_option_str_list = [] if "upgrades" in cfg: for upgrade_count, upgrade in enumerate(cfg["upgrades"]): - upgrade_name = ( - upgrade.get("upgrade_name", "") - + f" (Upgrade Number: {upgrade_count})" - ) + upgrade_name = upgrade.get("upgrade_name", "") + f" (Upgrade Number: {upgrade_count})" source_str_upgrade = f"In upgrade '{upgrade_name}'" for option_count, option in enumerate(upgrade["options"]): - option_name = ( - option.get("option", "") + f" (Option Number: {option_count})" - ) - source_str_option = ( - source_str_upgrade + f", in option '{option_name}'" - ) - source_option_str_list.append( - (source_str_option, option.get("option")) - ) + option_name = option.get("option", "") + f" (Option Number: {option_count})" + source_str_option = source_str_upgrade + f", in option '{option_name}'" + source_option_str_list.append((source_str_option, option.get("option"))) if "apply_logic" in option: source_str_logic = source_str_option + ", in apply_logic" - source_option_str_list += get_all_option_str( - source_str_logic, option["apply_logic"] - ) + source_option_str_list += get_all_option_str(source_str_logic, option["apply_logic"]) if "package_apply_logic" in upgrade: source_str_package = source_str_upgrade + ", in package_apply_logic" - source_option_str_list += get_all_option_str( - source_str_package, upgrade["package_apply_logic"] - ) + source_option_str_list += get_all_option_str(source_str_package, upgrade["package_apply_logic"]) # TODO: refactor this into Sampler.validate_args if "downselect" in cfg or "downselect" in cfg.get("sampler", {}).get("type"): source_str = "In downselect" - logic = ( - cfg["downselect"]["logic"] - if "downselect" in cfg - else cfg["sampler"]["args"]["logic"] - ) + logic = cfg["downselect"]["logic"] if "downselect" in cfg else cfg["sampler"]["args"]["logic"] source_option_str_list += get_all_option_str(source_str, logic) # Gather all the errors in the option_str, if any @@ -654,11 +560,7 @@ def get_all_option_str(source_str, inp): 
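# Illustrative usage sketch (hypothetical logic tree, not part of the patch):
# get_all_option_str() above flattens nested and/or/not logic into
# (source_str, "Parameter|Option") pairs, which the loop below feeds to get_errors().
_logic = {"and": ["Vintage|1950s", {"or": ["Location|AZ", "Location|CO"]}]}
_pairs = get_all_option_str("In downselect", _logic)
# _pairs == [
#     ("In downselect, in and, in entry 0", "Vintage|1950s"),
#     ("In downselect, in and, in entry 1, in or, in entry 0", "Location|AZ"),
#     ("In downselect, in and, in entry 1, in or, in entry 1", "Location|CO"),
# ]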
error_message += get_errors(source_str, option_str) if error_message: - error_message = ( - "Following option/parameter entries have problem:\n" - + error_message - + "\n" - ) + error_message = "Following option/parameter entries have problem:\n" + error_message + "\n" if invalid_option_spec_counter: error_message += "* Following option/parameter entries have problem:\n" @@ -666,9 +568,7 @@ def get_all_option_str(source_str, inp): error_message += f" '{invalid_entry}' {error} - used '{count}' times\n" if invalid_param_counter: - error_message += ( - "* Following parameters do not exist in options_lookup.tsv\n" - ) + error_message += "* Following parameters do not exist in options_lookup.tsv\n" for (param, close_match), count in invalid_param_counter.items(): error_message += f" '{param}' - used '{count}' times." if close_match: @@ -740,9 +640,7 @@ def get_logic_problems(logic, parent=None): assert len(logic) == 1 for key, val in logic.items(): if key not in ["or", "and", "not"]: - raise ValidationError( - f"Invalid key {key}. Only 'or', 'and' and 'not' is allowed." - ) + raise ValidationError(f"Invalid key {key}. Only 'or', 'and' and 'not' is allowed.") return get_logic_problems(val, parent=key) elif isinstance(logic, str): if "&&" not in logic: @@ -750,28 +648,19 @@ def get_logic_problems(logic, parent=None): entries = logic.split("&&") return get_logic_problems(entries, parent="&&") else: - raise ValidationError( - f"Invalid logic element {logic} with type {type(logic)}" - ) + raise ValidationError(f"Invalid logic element {logic} with type {type(logic)}") all_problems = [] if "upgrades" in cfg: for upgrade_count, upgrade in enumerate(cfg["upgrades"]): upgrade_name = upgrade.get("upgrade_name", "") - source_str_upgrade = ( - f"upgrade '{upgrade_name}' (Upgrade Number:{upgrade_count})" - ) + source_str_upgrade = f"upgrade '{upgrade_name}' (Upgrade Number:{upgrade_count})" for option_count, option in enumerate(upgrade["options"]): option_name = option.get("option", "") - source_str_option = ( - source_str_upgrade - + f", option '{option_name}' (Option Number:{option_count})" - ) + source_str_option = source_str_upgrade + f", option '{option_name}' (Option Number:{option_count})" if "apply_logic" in option: if problems := get_logic_problems(option["apply_logic"]): - all_problems.append( - (source_str_option, problems, option["apply_logic"]) - ) + all_problems.append((source_str_option, problems, option["apply_logic"])) if "package_apply_logic" in upgrade: source_str_package = source_str_upgrade + ", in package_apply_logic" @@ -787,11 +676,7 @@ def get_logic_problems(logic, parent=None): # TODO: refactor this into Sampler.validate_args if "downselect" in cfg or "downselect" in cfg.get("sampler", {}).get("type"): source_str = "in downselect logic" - logic = ( - cfg["downselect"]["logic"] - if "downselect" in cfg - else cfg["sampler"]["args"]["logic"] - ) + logic = cfg["downselect"]["logic"] if "downselect" in cfg else cfg["sampler"]["args"]["logic"] if problems := get_logic_problems(logic): all_problems.append((source_str, problems, logic)) @@ -839,10 +724,7 @@ def get_errors(source_str, measure_str): """ if measure_str not in measure_dirs: closest = difflib.get_close_matches(measure_str, list(measure_dirs)) - return ( - f"Measure directory {measure_str} not found. Closest matches: {closest}" - f" {source_str}\n" - ) + return f"Measure directory {measure_str} not found. 
Closest matches: {closest}" f" {source_str}\n" return "" source_measures_str_list = [] @@ -859,9 +741,7 @@ def get_errors(source_str, measure_str): if not error_message: return True else: - error_message = ( - "Measure name(s)/directory(ies) is(are) invalid. \n" + error_message - ) + error_message = "Measure name(s)/directory(ies) is(are) invalid. \n" + error_message logger.error(error_message) raise ValidationError(error_message) @@ -904,9 +784,7 @@ def validate_resstock_or_comstock_version(project_file): """ cfg = get_project_configuration(project_file) - buildstock_rb = os.path.join( - cfg["buildstock_directory"], "resources/buildstock.rb" - ) + buildstock_rb = os.path.join(cfg["buildstock_directory"], "resources/buildstock.rb") if os.path.exists(buildstock_rb): with open(buildstock_rb, "r") as f: versions = dict( @@ -943,9 +821,7 @@ def validate_number_of_options(project_file): :rtype: bool """ cfg = get_project_configuration(project_file) - measure_xml_filename = os.path.join( - cfg["buildstock_directory"], "measures", "ApplyUpgrade", "measure.xml" - ) + measure_xml_filename = os.path.join(cfg["buildstock_directory"], "measures", "ApplyUpgrade", "measure.xml") if os.path.exists(measure_xml_filename): measure_xml_tree = objectify.parse(measure_xml_filename) measure_xml = measure_xml_tree.getroot() @@ -956,14 +832,10 @@ def validate_number_of_options(project_file): if m_option: option_number = int(m_option.group(1)) n_options_in_measure = max(option_number, n_options_in_measure) - m_costs = re.match( - r"^option_(\d+)_cost_(\d+)_value", str(argument.name) - ) + m_costs = re.match(r"^option_(\d+)_cost_(\d+)_value", str(argument.name)) if m_costs: cost_number = int(m_costs.group(2)) - n_costs_per_option_in_measure = max( - cost_number, n_costs_per_option_in_measure - ) + n_costs_per_option_in_measure = max(cost_number, n_costs_per_option_in_measure) n_options_in_cfg = 0 n_costs_in_cfg = 0 for upgrade in cfg.get("upgrades", []): @@ -1042,22 +914,14 @@ def process_results(self, skip_combine=False, use_dask_cluster=True): wfg_args = self.cfg["workflow_generator"].get("args", {}) if self.cfg["workflow_generator"]["type"] == "residential_hpxml": if "simulation_output_report" in wfg_args.keys(): - if ( - "timeseries_frequency" - in wfg_args["simulation_output_report"].keys() - ): - do_timeseries = ( - wfg_args["simulation_output_report"]["timeseries_frequency"] - != "none" - ) + if "timeseries_frequency" in wfg_args["simulation_output_report"].keys(): + do_timeseries = wfg_args["simulation_output_report"]["timeseries_frequency"] != "none" else: do_timeseries = "timeseries_csv_export" in wfg_args.keys() fs = self.get_fs() if not skip_combine: - postprocessing.combine_results( - fs, self.results_dir, self.cfg, do_timeseries=do_timeseries - ) + postprocessing.combine_results(fs, self.results_dir, self.cfg, do_timeseries=do_timeseries) aws_conf = self.cfg.get("postprocessing", {}).get("aws", {}) if "s3" in aws_conf or "aws" in self.cfg: diff --git a/buildstockbatch/cloud/docker_base.py b/buildstockbatch/cloud/docker_base.py index 613978a4..3176a23a 100644 --- a/buildstockbatch/cloud/docker_base.py +++ b/buildstockbatch/cloud/docker_base.py @@ -30,13 +30,7 @@ from buildstockbatch import postprocessing from buildstockbatch.base import BuildStockBatchBase -from buildstockbatch.utils import ( - ContainerRuntime, - calc_hash_for_file, - compress_file, - read_csv, - get_bool_env_var, -) +from buildstockbatch.utils import ContainerRuntime, calc_hash_for_file, compress_file, read_csv, get_bool_env_var 
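# Illustrative sketch (hypothetical file names and digests, not part of the patch)
# of the EPW dedup pattern used later in _prep_weather_files_for_batch(): files
# whose contents hash to the same digest are compressed and uploaded once; the
# rest are recorded as (canonical.gz, duplicate.gz) pairs for cheap server-side copies.
from collections import defaultdict

unique_epws = defaultdict(list)
epws_to_copy = []
for epw_name, digest in [("a.epw", "h1"), ("b.epw", "h1"), ("c.epw", "h2")]:
    if unique_epws[digest]:  # this content was already seen -> duplicate
        epws_to_copy.append((unique_epws[digest][0] + ".gz", epw_name + ".gz"))
    unique_epws[digest].append(epw_name)
assert epws_to_copy == [("a.epw.gz", "b.epw.gz")]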
logger = logging.getLogger(__name__) @@ -70,12 +64,8 @@ def __init__(self, project_filename): try: self.docker_client.ping() except: # noqa: E722 (allow bare except in this case because error can be a weird non-class Windows API error) - logger.error( - "The docker server did not respond, make sure Docker Desktop is started then retry." - ) - raise RuntimeError( - "The docker server did not respond, make sure Docker Desktop is started then retry." - ) + logger.error("The docker server did not respond, make sure Docker Desktop is started then retry.") + raise RuntimeError("The docker server did not respond, make sure Docker Desktop is started then retry.") @staticmethod def validate_project(project_file): @@ -202,14 +192,10 @@ def _prep_weather_files_for_batch(self, tmppath): self._get_weather_files() # Determine the unique weather files - epw_filenames = list( - filter(lambda x: x.endswith(".epw"), os.listdir(self.weather_dir)) - ) + epw_filenames = list(filter(lambda x: x.endswith(".epw"), os.listdir(self.weather_dir))) logger.info("Calculating hashes for weather files") epw_hashes = Parallel(n_jobs=-1, verbose=9)( - delayed(calc_hash_for_file)( - pathlib.Path(self.weather_dir) / epw_filename - ) + delayed(calc_hash_for_file)(pathlib.Path(self.weather_dir) / epw_filename) for epw_filename in epw_filenames ) # keep track of unique EPWs that may have dupes, and to compress and upload to cloud @@ -219,9 +205,7 @@ def _prep_weather_files_for_batch(self, tmppath): for epw_filename, epw_hash in zip(epw_filenames, epw_hashes): if bool(unique_epws[epw_hash]): # not the first file with this hash (it's a duplicate). add to ``epws_to_copy`` - epws_to_copy.append( - (unique_epws[epw_hash][0] + ".gz", epw_filename + ".gz") - ) + epws_to_copy.append((unique_epws[epw_hash][0] + ".gz", epw_filename + ".gz")) unique_epws[epw_hash].append(epw_filename) # Compress unique weather files and save to ``tmp_weather_out_path``, which will get @@ -246,10 +230,7 @@ def _prep_weather_files_for_batch(self, tmppath): total_count += count if count > 1: dupe_count += count - 1 - bytes = ( - os.path.getsize(str(tmp_weather_out_path / epws[0]) + ".gz") - * dupe_count - ) + bytes = os.path.getsize(str(tmp_weather_out_path / epws[0]) + ".gz") * dupe_count dupe_bytes = bytes * (count - 1) logger.info( f"Identified {dupe_count:,} duplicate weather files " @@ -276,9 +257,7 @@ def _prep_jobs_for_batch(self, tmppath): # Create list of (building ID, upgrade to apply) pairs for all simulations to run. baseline_sims = zip(building_ids, itertools.repeat(None)) - upgrade_sims = itertools.product( - building_ids, range(len(self.cfg.get("upgrades", []))) - ) + upgrade_sims = itertools.product(building_ids, range(len(self.cfg.get("upgrades", [])))) all_sims = list(itertools.chain(baseline_sims, upgrade_sims)) random.shuffle(all_sims) all_sims_iter = iter(all_sims) @@ -334,9 +313,7 @@ def _prep_jobs_for_batch(self, tmppath): "lib/housing_characteristics", ) - return DockerBatchBase.BatchInfo( - n_sims=n_sims, n_sims_per_job=n_sims_per_job, job_count=job_count - ) + return DockerBatchBase.BatchInfo(n_sims=n_sims, n_sims_per_job=n_sims_per_job, job_count=job_count) @classmethod def get_epws_to_download(cls, sim_dir, jobs_d): @@ -349,9 +326,7 @@ def get_epws_to_download(cls, sim_dir, jobs_d): :returns: Set of epw filenames needed for this batch of simulations. 
""" # Make a lookup of which parameter points to the weather file from options_lookup.tsv - with open( - sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8" - ) as f: + with open(sim_dir / "lib" / "resources" / "options_lookup.tsv", "r", encoding="utf-8") as f: tsv_reader = csv.reader(f, delimiter="\t") next(tsv_reader) # skip headers param_name = None @@ -411,9 +386,7 @@ def run_simulations(cls, cfg, job_id, jobs_d, sim_dir, fs, output_path): sim_id = f"bldg{building_id:07d}up{upgrade_id:02d}" # Create OSW - osw = cls.create_osw( - cfg, jobs_d["n_datapoints"], sim_id, building_id, upgrade_idx - ) + osw = cls.create_osw(cfg, jobs_d["n_datapoints"], sim_id, building_id, upgrade_idx) with open(os.path.join(sim_dir, "in.osw"), "w") as f: json.dump(osw, f, indent=4) diff --git a/buildstockbatch/hpc.py b/buildstockbatch/hpc.py index 16efff9b..ade93702 100644 --- a/buildstockbatch/hpc.py +++ b/buildstockbatch/hpc.py @@ -72,9 +72,7 @@ def __init__(self, project_filename): logger.debug("Output directory = {}".format(output_dir)) weather_dir = self.weather_dir # noqa E841 - self.apptainer_image = self.get_apptainer_image( - self.cfg, self.os_version, self.os_sha - ) + self.apptainer_image = self.get_apptainer_image(self.cfg, self.os_version, self.os_sha) @classmethod def validate_project(cls, project_file): @@ -97,9 +95,7 @@ def validate_apptainer_image_hpc(cls, project_file): @property def output_dir(self): - output_dir = path_rel_to_file( - self.project_filename, self.cfg["output_directory"] - ) + output_dir = path_rel_to_file(self.project_filename, self.cfg["output_directory"]) return output_dir @property @@ -118,16 +114,11 @@ def clear_and_copy_dir(src, dst): def get_apptainer_image(cls, cfg, os_version, os_sha): exts_to_try = ["Apptainer.sif", "Singularity.simg"] sys_img_dir = cfg.get("sys_image_dir", cls.DEFAULT_SYS_IMAGE_DIR) - image_paths = [ - pathlib.Path(sys_img_dir, f"OpenStudio-{os_version}.{os_sha}-{ext}") - for ext in exts_to_try - ] + image_paths = [pathlib.Path(sys_img_dir, f"OpenStudio-{os_version}.{os_sha}-{ext}") for ext in exts_to_try] for image_path in image_paths: if image_path.exists(): return str(image_path) - raise RuntimeError( - f"Could not find apptainer image: {' or '.join(map(str, image_paths))}" - ) + raise RuntimeError(f"Could not find apptainer image: {' or '.join(map(str, image_paths))}") @property def weather_dir(self): @@ -139,12 +130,7 @@ def weather_dir(self): def run_batch(self, sampling_only=False): # Create simulation_output dir - sim_out_ts_dir = ( - pathlib.Path(self.output_dir) - / "results" - / "simulation_output" - / "timeseries" - ) + sim_out_ts_dir = pathlib.Path(self.output_dir) / "results" / "simulation_output" / "timeseries" os.makedirs(sim_out_ts_dir, exist_ok=True) for i in range(0, len(self.cfg.get("upgrades", [])) + 1): os.makedirs(sim_out_ts_dir / f"up{i:02d}") @@ -154,9 +140,7 @@ def run_batch(self, sampling_only=False): destination_dir = os.path.dirname(self.sampler.csv_path) if os.path.exists(destination_dir): shutil.rmtree(destination_dir) - shutil.copytree( - os.path.join(self.project_dir, "housing_characteristics"), destination_dir - ) + shutil.copytree(os.path.join(self.project_dir, "housing_characteristics"), destination_dir) logger.debug("Housing characteristics copied.") # run sampling @@ -186,9 +170,7 @@ def run_batch(self, sampling_only=False): # larger than we need, now that we know n_sims n_sims_per_job = max(n_sims_per_job, self.MIN_SIMS_PER_JOB) - upgrade_sims = itertools.product( - building_ids, 
range(len(self.cfg.get("upgrades", []))) - ) + upgrade_sims = itertools.product(building_ids, range(len(self.cfg.get("upgrades", [])))) if not self.skip_baseline_sims: # create batches of simulations baseline_sims = zip(building_ids, itertools.repeat(None)) @@ -203,9 +185,7 @@ def run_batch(self, sampling_only=False): if not batch: break logger.info("Queueing job {} ({} simulations)".format(i, len(batch))) - job_json_filename = os.path.join( - self.output_dir, "job{:03d}.json".format(i) - ) + job_json_filename = os.path.join(self.output_dir, "job{:03d}.json".format(i)) with open(job_json_filename, "w") as f: json.dump( { @@ -233,9 +213,7 @@ def run_job_batch(self, job_array_number): pathlib.Path(self.buildstock_dir) / "measures", self.local_buildstock_dir / "measures", ) - if os.path.exists( - pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures" - ): + if os.path.exists(pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures"): self.clear_and_copy_dir( pathlib.Path(self.buildstock_dir) / "resources/hpxml-measures", self.local_buildstock_dir / "resources/hpxml-measures", @@ -250,9 +228,7 @@ def run_job_batch(self, job_array_number): shutil.copy2(self.apptainer_image, self.local_apptainer_img) # Run the job batch as normal - job_json_filename = os.path.join( - self.output_dir, "job{:03d}.json".format(job_array_number) - ) + job_json_filename = os.path.join(self.output_dir, "job{:03d}.json".format(job_array_number)) with open(job_json_filename, "r") as f: args = json.load(f) @@ -270,18 +246,12 @@ def run_job_batch(self, job_array_number): df.to_csv(buildstock_csv_path, index=False) logger.debug(f"Buildstock.csv trimmed to {len(df)} rows.") - traceback_file_path = ( - self.local_output_dir - / "simulation_output" - / f"traceback{job_array_number}.out" - ) + traceback_file_path = self.local_output_dir / "simulation_output" / f"traceback{job_array_number}.out" @delayed def run_building_d(i, upgrade_idx): try: - return self.run_building( - self.output_dir, self.cfg, args["n_datapoints"], i, upgrade_idx - ) + return self.run_building(self.output_dir, self.cfg, args["n_datapoints"], i, upgrade_idx) except Exception: with open(traceback_file_path, "a") as f: txt = get_error_details() @@ -308,9 +278,7 @@ def run_building_d(i, upgrade_idx): # Compress simulation results if self.cfg.get("max_minutes_per_sim") is not None: time.sleep(60) # Allow results JSON to finish writing - simout_filename = ( - lustre_sim_out_dir / f"simulations_job{job_array_number}.tar.gz" - ) + simout_filename = lustre_sim_out_dir / f"simulations_job{job_array_number}.tar.gz" logger.info(f"Compressing simulation outputs to {simout_filename}") local_sim_out_dir = self.local_output_dir / "simulation_output" subprocess.run( @@ -357,23 +325,17 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): upgrade_id = 0 if upgrade_idx is None else upgrade_idx + 1 try: - sim_id, sim_dir = cls.make_sim_dir( - i, upgrade_idx, os.path.join(cls.local_output_dir, "simulation_output") - ) + sim_id, sim_dir = cls.make_sim_dir(i, upgrade_idx, os.path.join(cls.local_output_dir, "simulation_output")) except SimulationExists as ex: sim_dir = ex.sim_dir else: # Generate the osw for this simulation - osw = cls.create_osw( - cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx - ) + osw = cls.create_osw(cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx) with open(os.path.join(sim_dir, "in.osw"), "w") as f: json.dump(osw, f, indent=4) # Create a temporary directory for the simulation to 
use - with tempfile.TemporaryDirectory( - dir=cls.local_scratch, prefix=f"{sim_id}_" - ) as tmpdir: + with tempfile.TemporaryDirectory(dir=cls.local_scratch, prefix=f"{sim_id}_") as tmpdir: # Build the command to instantiate and configure the apptainer container the simulation is run inside local_resources_dir = cls.local_buildstock_dir / "resources" args = [ @@ -403,19 +365,11 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): for src in dirs_to_mount: container_mount = "/" + src.name args.extend(["-B", "{}:{}:ro".format(src, container_mount)]) - container_symlink = pathlib.Path( - "/var/simdata/openstudio", src.name - ) - runscript.append( - "ln -s {} {}".format( - *map(shlex.quote, (container_mount, str(container_symlink))) - ) - ) + container_symlink = pathlib.Path("/var/simdata/openstudio", src.name) + runscript.append("ln -s {} {}".format(*map(shlex.quote, (container_mount, str(container_symlink))))) if (cls.local_buildstock_dir / "resources" / "hpxml-measures").exists(): - runscript.append( - "ln -s /resources /var/simdata/openstudio/resources" - ) + runscript.append("ln -s /resources /var/simdata/openstudio/resources") src = cls.local_buildstock_dir / "resources" / "hpxml-measures" container_mount = "/resources/hpxml-measures" args.extend(["-B", f"{src}:{container_mount}:ro"]) @@ -468,18 +422,10 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): "timeout": msg, } out_osw.write(json.dumps(out_msg, indent=3)) - with open( - pathlib.Path(sim_dir, "run", "out.osw"), "a" - ) as run_log: - run_log.write( - f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}" - ) - with open( - pathlib.Path(sim_dir, "run", "failed.job"), "w" - ) as failed_job: - failed_job.write( - f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}" - ) + with open(pathlib.Path(sim_dir, "run", "out.osw"), "a") as run_log: + run_log.write(f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}") + with open(pathlib.Path(sim_dir, "run", "failed.job"), "w") as failed_job: + failed_job.write(f"[{end_time.strftime('%H:%M:%S')} ERROR] {msg}") # Wait for EnergyPlus to release file locks and data_point.zip to finish time.sleep(60) except subprocess.CalledProcessError: @@ -488,9 +434,7 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): # Clean up the symbolic links we created in the container for mount_dir in dirs_to_mount + [pathlib.Path(sim_dir, "lib")]: try: - pathlib.Path( - sim_dir, os.path.basename(mount_dir) - ).unlink() + pathlib.Path(sim_dir, os.path.basename(mount_dir)).unlink() except FileNotFoundError: pass @@ -504,9 +448,7 @@ def run_building(cls, output_dir, cfg, n_datapoints, i, upgrade_idx=None): ) reporting_measures = cls.get_reporting_measures(cfg) - dpout = postprocessing.read_simulation_outputs( - fs, reporting_measures, sim_dir, upgrade_id, i - ) + dpout = postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, i) return dpout @staticmethod @@ -522,16 +464,12 @@ def queue_sampling( hipri: bool, ): cfg = get_project_configuration(project_filename) - hpc_sh = os.path.join( - os.path.dirname(os.path.abspath(__file__)), f"{cls.HPC_NAME}.sh" - ) + hpc_sh = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"{cls.HPC_NAME}.sh") assert os.path.exists(hpc_sh) out_dir = cfg["output_directory"] if os.path.exists(out_dir): raise FileExistsError( - "The output directory {} already exists. Please delete it or choose another.".format( - out_dir - ) + "The output directory {} already exists. 
Please delete it or choose another.".format(out_dir) ) logger.info("Creating output directory {}".format(out_dir)) os.makedirs(out_dir) @@ -582,9 +520,7 @@ def queue_jobs(self, array_ids=None, hipri=False): # Estimate the wall time in minutes minutes_per_sim = hpc_cfg["minutes_per_sim"] - walltime = math.ceil( - math.ceil(n_sims_per_job / self.CORES_PER_NODE) * minutes_per_sim - ) + walltime = math.ceil(math.ceil(n_sims_per_job / self.CORES_PER_NODE) * minutes_per_sim) # Queue up simulations here = os.path.dirname(os.path.abspath(__file__)) @@ -639,18 +575,10 @@ def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False) hpc_cfg = self.cfg[self.HPC_NAME] account = hpc_cfg["account"] walltime = hpc_cfg.get("postprocessing", {}).get("time", "1:30:00") - memory = hpc_cfg.get("postprocessing", {}).get( - "node_memory_mb", self.DEFAULT_POSTPROCESSING_NODE_MEMORY_MB - ) - n_procs = hpc_cfg.get("postprocessing", {}).get( - "n_procs", self.DEFAULT_POSTPROCESSING_N_PROCS - ) - n_workers = hpc_cfg.get("postprocessing", {}).get( - "n_workers", self.DEFAULT_POSTPROCESSING_N_WORKERS - ) - print( - f"Submitting job to {n_workers} {memory}MB memory nodes using {n_procs} cores in each." - ) + memory = hpc_cfg.get("postprocessing", {}).get("node_memory_mb", self.DEFAULT_POSTPROCESSING_NODE_MEMORY_MB) + n_procs = hpc_cfg.get("postprocessing", {}).get("n_procs", self.DEFAULT_POSTPROCESSING_N_PROCS) + n_workers = hpc_cfg.get("postprocessing", {}).get("n_workers", self.DEFAULT_POSTPROCESSING_N_WORKERS) + print(f"Submitting job to {n_workers} {memory}MB memory nodes using {n_procs} cores in each.") # Throw an error if the files already exist. if not upload_only: @@ -673,8 +601,7 @@ def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False) last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath)) shutil.move( filepath, - filepath.parent - / f"{filepath.stem}_{last_mod_date:%Y%m%d%H%M}{filepath.suffix}", + filepath.parent / f"{filepath.stem}_{last_mod_date:%Y%m%d%H%M}{filepath.suffix}", ) env_export = { @@ -730,22 +657,14 @@ def get_dask_client(self): # from dask.distributed import LocalCluster # cluster = LocalCluster(local_directory="/tmp/scratch/dask", n_workers=90, memory_limit="16GiB") # return Client(cluster) - return Client( - scheduler_file=os.path.join(self.output_dir, "dask_scheduler.json") - ) + return Client(scheduler_file=os.path.join(self.output_dir, "dask_scheduler.json")) def process_results(self, *args, **kwargs): # Check that all the jobs succeeded before proceeding failed_job_array_ids = self.get_failed_job_array_ids() if failed_job_array_ids: - logger.error( - "The following simulation jobs failed: {}".format( - ", ".join(map(str, failed_job_array_ids)) - ) - ) - logger.error( - "Please inspect those jobs and fix any problems before resubmitting." 
- ) + logger.error("The following simulation jobs failed: {}".format(", ".join(map(str, failed_job_array_ids)))) + logger.error("Please inspect those jobs and fix any problems before resubmitting.") logger.critical("Postprocessing cancelled.") return False @@ -797,8 +716,7 @@ def rerun_failed_jobs(self, hipri=False): last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath)) shutil.move( filepath, - prev_failed_job_out_dir - / f"{filepath.name}_{last_mod_date:%Y%m%d%H%M}", + prev_failed_job_out_dir / f"{filepath.name}_{last_mod_date:%Y%m%d%H%M}", ) # Delete simulation results for jobs we're about to rerun @@ -834,8 +752,7 @@ def validate_output_directory_eagle(cls, project_file): output_dir = path_rel_to_file(project_file, cfg["output_directory"]) if not re.match(r"/(lustre/eaglefs/)?(scratch|projects)", output_dir): raise ValidationError( - f"`output_directory` must be in /scratch or /projects," - f" `output_directory` = {output_dir}" + f"`output_directory` must be in /scratch or /projects," f" `output_directory` = {output_dir}" ) @classmethod @@ -866,8 +783,7 @@ def validate_output_directory_kestrel(cls, project_file): output_dir = path_rel_to_file(project_file, cfg["output_directory"]) if not re.match(r"/(kfs\d/)?(scratch|projects)", output_dir): raise ValidationError( - f"`output_directory` must be in /scratch or /projects," - f" `output_directory` = {output_dir}" + f"`output_directory` must be in /scratch or /projects," f" `output_directory` = {output_dir}" ) @classmethod @@ -959,21 +875,15 @@ def user_cli(Batch: SlurmBatch, argv: list): help="Only validate the project YAML file and references. Nothing is executed", action="store_true", ) - group.add_argument( - "--samplingonly", help="Run the sampling only.", action="store_true" - ) - group.add_argument( - "--rerun_failed", help="Rerun the failed jobs", action="store_true" - ) + group.add_argument("--samplingonly", help="Run the sampling only.", action="store_true") + group.add_argument("--rerun_failed", help="Rerun the failed jobs", action="store_true") # parse CLI arguments args = parser.parse_args(argv) # load the yaml project file if not os.path.isfile(args.project_filename): - raise FileNotFoundError( - "The project file {} doesn't exist".format(args.project_filename) - ) + raise FileNotFoundError("The project file {} doesn't exist".format(args.project_filename)) project_filename = os.path.abspath(args.project_filename) # validate the project, and in case of the --validateonly flag return True if validation passes @@ -994,9 +904,7 @@ def user_cli(Batch: SlurmBatch, argv: list): # otherwise, queue up the whole buildstockbatch process # the main work of the first job is to run the sampling script ... 
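# Worked example (hypothetical numbers, not part of the patch) of the walltime
# estimate in queue_jobs() earlier in this file:
#     walltime = ceil(ceil(n_sims_per_job / CORES_PER_NODE) * minutes_per_sim)
import math

n_sims_per_job, cores_per_node, minutes_per_sim = 360, 104, 3
walltime = math.ceil(math.ceil(n_sims_per_job / cores_per_node) * minutes_per_sim)
assert walltime == 12  # ceil(360 / 104) = 4 passes per node; 4 * 3 = 12 minutes requested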
- Batch.queue_sampling( - project_filename, args.measuresonly, args.samplingonly, args.hipri - ) + Batch.queue_sampling(project_filename, args.measuresonly, args.samplingonly, args.hipri) @log_error_details() diff --git a/buildstockbatch/local.py b/buildstockbatch/local.py index 70a72025..1a47f36b 100644 --- a/buildstockbatch/local.py +++ b/buildstockbatch/local.py @@ -46,9 +46,7 @@ def __init__(self, project_filename): self._weather_dir = None # Create simulation_output dir - sim_out_ts_dir = os.path.join( - self.results_dir, "simulation_output", "timeseries" - ) + sim_out_ts_dir = os.path.join(self.results_dir, "simulation_output", "timeseries") os.makedirs(sim_out_ts_dir, exist_ok=True) for i in range(0, len(self.cfg.get("upgrades", [])) + 1): os.makedirs(os.path.join(sim_out_ts_dir, f"up{i:02d}"), exist_ok=True) @@ -57,26 +55,18 @@ def __init__(self, project_filename): # FIXME: Get working without docker if self.cfg.get("baseline", dict()).get("custom_gems", False): # TODO: Fix this stuff to work without docker - logger.info( - "Installing custom gems to docker volume: buildstockbatch_custom_gems" - ) + logger.info("Installing custom gems to docker volume: buildstockbatch_custom_gems") docker_client = docker.client.from_env() # Create a volume to store the custom gems - docker_client.volumes.create( - name="buildstockbatch_custom_gems", driver="local" - ) - simdata_vol = docker_client.volumes.create( - name="buildstockbatch_simdata_temp", driver="local" - ) + docker_client.volumes.create(name="buildstockbatch_custom_gems", driver="local") + simdata_vol = docker_client.volumes.create(name="buildstockbatch_simdata_temp", driver="local") # Define directories to be mounted in the container mnt_gem_dir = "/var/oscli/gems" # Install custom gems to be used in the docker container - local_gemfile_path = os.path.join( - self.buildstock_dir, "resources", "Gemfile" - ) + local_gemfile_path = os.path.join(self.buildstock_dir, "resources", "Gemfile") mnt_gemfile_path_orig = "/var/oscli/gemfile/Gemfile" docker_volume_mounts = { "buildstockbatch_custom_gems": {"bind": mnt_gem_dir, "mode": "rw"}, @@ -87,14 +77,10 @@ def __init__(self, project_filename): # Check that the Gemfile exists if not os.path.exists(local_gemfile_path): print(f"local_gemfile_path = {local_gemfile_path}") - raise AttributeError( - "baseline:custom_gems = True, but did not find Gemfile in /resources directory" - ) + raise AttributeError("baseline:custom_gems = True, but did not find Gemfile in /resources directory") # Make the buildstock/resources/.custom_gems dir to store logs - local_log_dir = os.path.join( - self.buildstock_dir, "resources", ".custom_gems" - ) + local_log_dir = os.path.join(self.buildstock_dir, "resources", ".custom_gems") if not os.path.exists(local_log_dir): os.makedirs(local_log_dir) @@ -109,9 +95,7 @@ def __init__(self, project_filename): volumes=docker_volume_mounts, name="install_custom_gems", ) - with open( - os.path.join(local_log_dir, "bundle_install_output.log"), "wb" - ) as f_out: + with open(os.path.join(local_log_dir, "bundle_install_output.log"), "wb") as f_out: f_out.write(container_output) # Report out custom gems loaded by OpenStudio CLI @@ -160,33 +144,25 @@ def run_building( upgrade_id = 0 if upgrade_idx is None else upgrade_idx + 1 try: - sim_id, sim_dir = cls.make_sim_dir( - i, upgrade_idx, os.path.join(results_dir, "simulation_output") - ) + sim_id, sim_dir = cls.make_sim_dir(i, upgrade_idx, os.path.join(results_dir, "simulation_output")) except SimulationExists: return sim_path = 
pathlib.Path(sim_dir) buildstock_path = pathlib.Path(buildstock_dir) # Make symlinks to project and buildstock stuff - (sim_path / "measures").symlink_to( - buildstock_path / "measures", target_is_directory=True - ) + (sim_path / "measures").symlink_to(buildstock_path / "measures", target_is_directory=True) (sim_path / "lib").symlink_to(buildstock_path / "lib", target_is_directory=True) (sim_path / "weather").symlink_to(weather_dir, target_is_directory=True) hpxml_measures_path = buildstock_path / "resources" / "hpxml-measures" if hpxml_measures_path.exists(): resources_path = sim_path / "resources" resources_path.mkdir() - (resources_path / "hpxml-measures").symlink_to( - hpxml_measures_path, target_is_directory=True - ) + (resources_path / "hpxml-measures").symlink_to(hpxml_measures_path, target_is_directory=True) else: resources_path = None - osw = cls.create_osw( - cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx - ) + osw = cls.create_osw(cfg, n_datapoints, sim_id, building_id=i, upgrade_idx=upgrade_idx) with open(sim_path / "in.osw", "w") as f: json.dump(osw, f, indent=4) @@ -276,9 +252,7 @@ def run_building( # Read data_point_out.json reporting_measures = cls.get_reporting_measures(cfg) - dpout = postprocessing.read_simulation_outputs( - fs, reporting_measures, sim_dir, upgrade_id, i - ) + dpout = postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, i) return dpout def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): @@ -317,9 +291,7 @@ def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): ) upgrade_sims = [] for i in range(len(self.cfg.get("upgrades", []))): - upgrade_sims.append( - map(functools.partial(run_building_d, upgrade_idx=i), building_ids) - ) + upgrade_sims.append(map(functools.partial(run_building_d, upgrade_idx=i), building_ids)) if not self.skip_baseline_sims: baseline_sims = map(run_building_d, building_ids) all_sims = itertools.chain(baseline_sims, *upgrade_sims) @@ -353,18 +325,14 @@ def output_dir(self): @property def results_dir(self): - results_dir = self.cfg.get( - "output_directory", os.path.join(self.project_dir, "localResults") - ) + results_dir = self.cfg.get("output_directory", os.path.join(self.project_dir, "localResults")) results_dir = self.path_rel_to_projectfile(results_dir) if not os.path.isdir(results_dir): os.makedirs(results_dir) return results_dir def get_dask_client(self): - cluster = LocalCluster( - local_directory=os.path.join(self.results_dir, "dask-tmp") - ) + cluster = LocalCluster(local_directory=os.path.join(self.results_dir, "dask-tmp")) return Client(cluster) @@ -425,8 +393,7 @@ def main(): ) group.add_argument( "--uploadonly", - help="Only upload to S3, useful when postprocessing is already done. Ignores the " - "upload flag in yaml", + help="Only upload to S3, useful when postprocessing is already done. Ignores the " "upload flag in yaml", action="store_true", ) group.add_argument( @@ -434,14 +401,10 @@ def main(): help="Only validate the project YAML file and references. 
Nothing is executed", action="store_true", ) - group.add_argument( - "--samplingonly", help="Run the sampling only.", action="store_true" - ) + group.add_argument("--samplingonly", help="Run the sampling only.", action="store_true") args = parser.parse_args() if not os.path.isfile(args.project_filename): - raise FileNotFoundError( - f"The project file {args.project_filename} doesn't exist" - ) + raise FileNotFoundError(f"The project file {args.project_filename} doesn't exist") # Validate the project, and in case of the --validateonly flag return True if validation passes LocalBatch.validate_project(args.project_filename) diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py index 8138cee9..0937185c 100644 --- a/buildstockbatch/postprocessing.py +++ b/buildstockbatch/postprocessing.py @@ -134,9 +134,7 @@ def read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, buildin :return: dpout [dict] """ - dpout = read_data_point_out_json( - fs, reporting_measures, f"{sim_dir}/run/data_point_out.json" - ) + dpout = read_data_point_out_json(fs, reporting_measures, f"{sim_dir}/run/data_point_out.json") if dpout is None: dpout = {} else: @@ -168,15 +166,10 @@ def clean_up_results_df(df, cfg, keep_upgrade_id=False): del results_df[col] for col in ("started_at", "completed_at"): if col in results_df.columns: - results_df[col] = pd.to_datetime( - results_df[col], format="%Y%m%dT%H%M%SZ" - ).astype(pd.ArrowDtype(pa.timestamp("s"))) - reference_scenarios = dict( - [ - (i, x.get("reference_scenario")) - for i, x in enumerate(cfg.get("upgrades", []), 1) - ] - ) + results_df[col] = pd.to_datetime(results_df[col], format="%Y%m%dT%H%M%SZ").astype( + pd.ArrowDtype(pa.timestamp("s")) + ) + reference_scenarios = dict([(i, x.get("reference_scenario")) for i, x in enumerate(cfg.get("upgrades", []), 1)]) results_df["apply_upgrade.reference_scenario"] = ( results_df["upgrade"].map(reference_scenarios).fillna("").astype(str) ) @@ -196,26 +189,10 @@ def clean_up_results_df(df, cfg, keep_upgrade_id=False): if "job_id" in results_df.columns: first_few_cols.insert(2, "job_id") - build_existing_model_cols = sorted( - [col for col in results_df.columns if col.startswith("build_existing_model")] - ) - sim_output_report_cols = sorted( - [ - col - for col in results_df.columns - if col.startswith("simulation_output_report") - ] - ) - report_sim_output_cols = sorted( - [ - col - for col in results_df.columns - if col.startswith("report_simulation_output") - ] - ) - upgrade_costs_cols = sorted( - [col for col in results_df.columns if col.startswith("upgrade_costs")] - ) + build_existing_model_cols = sorted([col for col in results_df.columns if col.startswith("build_existing_model")]) + sim_output_report_cols = sorted([col for col in results_df.columns if col.startswith("simulation_output_report")]) + report_sim_output_cols = sorted([col for col in results_df.columns if col.startswith("report_simulation_output")]) + upgrade_costs_cols = sorted([col for col in results_df.columns if col.startswith("upgrade_costs")]) sorted_cols = ( first_few_cols + build_existing_model_cols @@ -282,9 +259,7 @@ def read_enduse_timeseries_parquet(fs, all_cols, src_path, bldg_id): return df -def concat_and_normalize( - fs, all_cols, src_path, dst_path, partition_columns, indx, bldg_ids, partition_vals -): +def concat_and_normalize(fs, all_cols, src_path, dst_path, partition_columns, indx, bldg_ids, partition_vals): dfs = [] for bldg_id in sorted(bldg_ids): df = read_enduse_timeseries_parquet(fs, all_cols, 
src_path, bldg_id) @@ -358,22 +333,12 @@ def get_partitioned_bldg_groups(partition_df, partition_columns, files_per_parti """ total_building = len(partition_df) if partition_columns: - bldg_id_list_df = ( - partition_df.reset_index() - .groupby(partition_columns)["building_id"] - .apply(list) - ) + bldg_id_list_df = partition_df.reset_index().groupby(partition_columns)["building_id"].apply(list) ngroups = len(bldg_id_list_df) bldg_id_list = bldg_id_list_df.sum() - nfiles_in_each_group = [ - nfiles for nfiles in bldg_id_list_df.map(lambda x: len(x)) - ] - files_groups = [ - split_into_groups(n, files_per_partition) for n in nfiles_in_each_group - ] - flat_groups = [ - n for group in files_groups for n in group - ] # flatten list of list into a list (maintain order) + nfiles_in_each_group = [nfiles for nfiles in bldg_id_list_df.map(lambda x: len(x))] + files_groups = [split_into_groups(n, files_per_partition) for n in nfiles_in_each_group] + flat_groups = [n for group in files_groups for n in group] # flatten list of list into a list (maintain order) else: # no partitioning by a column. Just put buildings into groups of files_per_partition ngroups = 1 @@ -413,9 +378,7 @@ def write_metadata_files(fs, parquet_root_dir, partition_columns): concat_files = fs.glob(glob_str) logger.info(f"Gathered {len(concat_files)} files. Now writing _metadata") parquet_root_dir = Path(parquet_root_dir).as_posix() - create_metadata_file( - concat_files, root_dir=parquet_root_dir, engine="pyarrow", fs=fs - ) + create_metadata_file(concat_files, root_dir=parquet_root_dir, engine="pyarrow", fs=fs) logger.info(f"_metadata file written to {parquet_root_dir}") @@ -449,9 +412,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): if not results_json_files: raise ValueError("No simulation results found to post-process.") - logger.info( - "Collecting all the columns and datatypes in results_job*.json.gz parquet files." - ) + logger.info("Collecting all the columns and datatypes in results_job*.json.gz parquet files.") all_schema_dict = ( db.from_sequence(results_json_files) .map(partial(get_schema_dict, fs)) @@ -460,13 +421,10 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ) logger.info(f"Got {len(all_schema_dict)} columns") all_results_cols = list(all_schema_dict.keys()) - all_schema_dict = { - to_camelcase(key): value for key, value in all_schema_dict.items() - } + all_schema_dict = {to_camelcase(key): value for key, value in all_schema_dict.items()} logger.info(f"Got this schema: {all_schema_dict}\n") delayed_results_dfs = [ - dask.delayed(partial(read_results_json, fs, all_cols=all_results_cols))(x) - for x in results_json_files + dask.delayed(partial(read_results_json, fs, all_cols=all_results_cols))(x) for x in results_json_files ] results_df = dd.from_delayed(delayed_results_dfs, verify_meta=False) @@ -479,25 +437,15 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ts_filenames = fs.ls(upgrade_folder) if ts_filenames: do_timeseries = True - logger.info( - f"Found {len(ts_filenames)} files for upgrade {Path(upgrade_folder).name}." 
- ) + logger.info(f"Found {len(ts_filenames)} files for upgrade {Path(upgrade_folder).name}.") files_bag = db.from_sequence(ts_filenames, partition_size=100) - all_ts_cols |= ( - files_bag.map(partial(get_cols, fs)) - .fold(lambda x, y: x.union(y)) - .compute() - ) + all_ts_cols |= files_bag.map(partial(get_cols, fs)).fold(lambda x, y: x.union(y)).compute() logger.info("Collected all the columns") else: - logger.info( - f"There are no timeseries files for upgrade {Path(upgrade_folder).name}." - ) + logger.info(f"There are no timeseries files for upgrade {Path(upgrade_folder).name}.") # Sort the columns - all_ts_cols_sorted = ["building_id"] + sorted( - x for x in all_ts_cols if x.startswith("time") - ) + all_ts_cols_sorted = ["building_id"] + sorted(x for x in all_ts_cols if x.startswith("time")) all_ts_cols.difference_update(all_ts_cols_sorted) all_ts_cols_sorted.extend(sorted(x for x in all_ts_cols if not x.endswith("]"))) all_ts_cols.difference_update(all_ts_cols_sorted) @@ -514,9 +462,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): df_partition_columns = [f"build_existing_model.{c}" for c in partition_columns] missing_cols = set(df_partition_columns) - set(all_schema_dict.keys()) if missing_cols: - raise ValueError( - f"The following partitioning columns are not found in results.json: {missing_cols}" - ) + raise ValueError(f"The following partitioning columns are not found in results.json: {missing_cols}") if partition_columns: logger.info(f"The timeseries files will be partitioned by {partition_columns}.") @@ -533,16 +479,12 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): schema = None partition_df = df[df_partition_columns].copy() partition_df.rename( - columns={ - df_c: c for df_c, c in zip(df_partition_columns, partition_columns) - }, + columns={df_c: c for df_c, c in zip(df_partition_columns, partition_columns)}, inplace=True, ) if upgrade_id > 0: # Remove building characteristics for upgrade scenarios. - cols_to_keep = list( - filter(lambda x: not x.startswith("build_existing_model."), df.columns) - ) + cols_to_keep = list(filter(lambda x: not x.startswith("build_existing_model."), df.columns)) df = df[cols_to_keep] null_cols = get_null_cols(df) # If certain column datatype is null (happens when it doesn't have any data), the datatype @@ -551,13 +493,9 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): logger.info(f"Upgrade {upgrade_id} has null cols: {null_cols}") schema, unresolved = correct_schema(all_schema_dict, df) if unresolved: - logger.info( - f"The types for {unresolved} columns couldn't be determined." - ) + logger.info(f"The types for {unresolved} columns couldn't be determined.") else: - logger.info( - "All columns were successfully assigned a datatype based on other upgrades." 
- ) + logger.info("All columns were successfully assigned a datatype based on other upgrades.") # Write CSV csv_filename = f"{results_csvs_dir}/results_up{upgrade_id:02d}.csv.gz" logger.info(f"Writing {csv_filename}") @@ -574,63 +512,37 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): fs.makedirs(results_parquet_dir) parquet_filename = f"{results_parquet_dir}/results_up{upgrade_id:02d}.parquet" logger.info(f"Writing {parquet_filename}") - write_dataframe_as_parquet( - df.reset_index(), fs, parquet_filename, schema=schema - ) + write_dataframe_as_parquet(df.reset_index(), fs, parquet_filename, schema=schema) if do_timeseries: # Get the names of the timeseries file for each simulation in this upgrade ts_upgrade_path = f"{ts_in_dir}/up{upgrade_id:02d}" try: - ts_filenames = [ - ts_upgrade_path + ts_filename - for ts_filename in fs.ls(ts_upgrade_path) - ] + ts_filenames = [ts_upgrade_path + ts_filename for ts_filename in fs.ls(ts_upgrade_path)] except FileNotFoundError: # Upgrade directories may be empty if the upgrade is invalid. In some cloud # filesystems, there aren't actual directories, and trying to list a directory with # no files in it can fail. Just continue post-processing (other upgrades). - logger.warning( - f"Listing '{ts_upgrade_path}' failed. Skipping this upgrade." - ) + logger.warning(f"Listing '{ts_upgrade_path}' failed. Skipping this upgrade.") continue - ts_bldg_ids = [ - int(re.search(r"bldg(\d+).parquet", flname).group(1)) - for flname in ts_filenames - ] + ts_bldg_ids = [int(re.search(r"bldg(\d+).parquet", flname).group(1)) for flname in ts_filenames] if not ts_filenames: - logger.warning( - f"There are no timeseries files for upgrade{upgrade_id}." - ) + logger.warning(f"There are no timeseries files for upgrade{upgrade_id}.") continue - logger.info( - f"There are {len(ts_filenames)} timeseries files for upgrade{upgrade_id}." 
- ) + logger.info(f"There are {len(ts_filenames)} timeseries files for upgrade{upgrade_id}.") # Calculate the mean and estimate the total memory usage - read_ts_parquet = partial( - read_enduse_timeseries_parquet, fs, all_ts_cols_sorted, ts_upgrade_path - ) - get_ts_mem_usage_d = dask.delayed( - lambda x: read_ts_parquet(x).memory_usage(deep=True).sum() - ) + read_ts_parquet = partial(read_enduse_timeseries_parquet, fs, all_ts_cols_sorted, ts_upgrade_path) + get_ts_mem_usage_d = dask.delayed(lambda x: read_ts_parquet(x).memory_usage(deep=True).sum()) sample_size = min(len(ts_bldg_ids), 36 * 3) - mean_mem = np.mean( - dask.compute( - map(get_ts_mem_usage_d, random.sample(ts_bldg_ids, sample_size)) - )[0] - ) + mean_mem = np.mean(dask.compute(map(get_ts_mem_usage_d, random.sample(ts_bldg_ids, sample_size)))[0]) # Determine how many files should be in each partition and group the files parquet_memory = int( - cfg.get("eagle", {}) - .get("postprocessing", {}) - .get("parquet_memory_mb", MAX_PARQUET_MEMORY) + cfg.get("eagle", {}).get("postprocessing", {}).get("parquet_memory_mb", MAX_PARQUET_MEMORY) ) logger.info(f"Max parquet memory: {parquet_memory} MB") - max_files_per_partition = max( - 1, math.floor(parquet_memory / (mean_mem / 1e6)) - ) + max_files_per_partition = max(1, math.floor(parquet_memory / (mean_mem / 1e6))) partition_df = partition_df.loc[ts_bldg_ids].copy() logger.info(f"partition_df for the upgrade has {len(partition_df)} rows.") bldg_id_groups, bldg_id_list, ngroup = get_partitioned_bldg_groups( @@ -649,9 +561,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ts_out_loc = f"s3://{ts_dir}/upgrade={upgrade_id}" fs.makedirs(ts_out_loc) - logger.info( - f"Created directory {ts_out_loc} for writing. Now concatenating ..." - ) + logger.info(f"Created directory {ts_out_loc} for writing. Now concatenating ...") src_path = f"{ts_in_dir}/up{upgrade_id:02d}" concat_partial = dask.delayed( @@ -665,11 +575,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): ) ) partition_vals_list = [ - ( - list(partition_df.loc[bldg_id_list[0]].values) - if partition_columns - else [] - ) + (list(partition_df.loc[bldg_id_list[0]].values) if partition_columns else []) for bldg_id_list in bldg_id_groups ] @@ -689,9 +595,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): f"{results_dir}/dask_combine_report{upgrade_id}.html", ) - logger.info( - f"Finished combining and saving timeseries for upgrade{upgrade_id}." - ) + logger.info(f"Finished combining and saving timeseries for upgrade{upgrade_id}.") logger.info("All aggregation completed. ") if do_timeseries: logger.info("Writing timeseries metadata files") @@ -717,9 +621,7 @@ def upload_results(aws_conf, output_dir, results_dir, buildstock_csv_filename): parquet_dir = Path(results_dir).joinpath("parquet") ts_dir = parquet_dir / "timeseries" if not parquet_dir.is_dir(): - logger.error( - f"{parquet_dir} does not exist. Please make sure postprocessing has been done." - ) + logger.error(f"{parquet_dir} does not exist. Please make sure postprocessing has been done.") raise FileNotFoundError(parquet_dir) all_files = [] @@ -731,9 +633,7 @@ def upload_results(aws_conf, output_dir, results_dir, buildstock_csv_filename): s3_prefix = aws_conf.get("s3", {}).get("prefix", "").rstrip("/") s3_bucket = aws_conf.get("s3", {}).get("bucket", None) if not (s3_prefix and s3_bucket): - logger.error( - "YAML file missing postprocessing:aws:s3:prefix and/or bucket entry." 
- ) + logger.error("YAML file missing postprocessing:aws:s3:prefix and/or bucket entry.") return s3_prefix_output = s3_prefix + "/" + output_folder_name + "/" @@ -741,15 +641,11 @@ def upload_results(aws_conf, output_dir, results_dir, buildstock_csv_filename): bucket = s3.Bucket(s3_bucket) n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix_output))) if n_existing_files > 0: - logger.error( - f"There are already {n_existing_files} files in the s3 folder {s3_bucket}/{s3_prefix_output}." - ) + logger.error(f"There are already {n_existing_files} files in the s3 folder {s3_bucket}/{s3_prefix_output}.") raise FileExistsError(f"s3://{s3_bucket}/{s3_prefix_output}") def upload_file(filepath, s3key=None): - full_path = ( - filepath if filepath.is_absolute() else parquet_dir.joinpath(filepath) - ) + full_path = filepath if filepath.is_absolute() else parquet_dir.joinpath(filepath) s3 = boto3.resource("s3") bucket = s3.Bucket(s3_bucket) if s3key is None: @@ -769,9 +665,7 @@ def upload_file(filepath, s3key=None): else: logger.warning(f"{buildstock_csv_filename} doesn't exist, can't upload.") dask.compute(tasks) - logger.info( - f"Upload to S3 completed. The files are uploaded to: {s3_bucket}/{s3_prefix_output}" - ) + logger.info(f"Upload to S3 completed. The files are uploaded to: {s3_bucket}/{s3_prefix_output}") return s3_bucket, s3_prefix_output @@ -780,9 +674,7 @@ def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix): region_name = aws_conf.get("region_name", "us-west-2") db_name = aws_conf.get("athena", {}).get("database_name", None) - role = aws_conf.get("athena", {}).get( - "glue_service_role", "service-role/AWSGlueServiceRole-default" - ) + role = aws_conf.get("athena", {}).get("glue_service_role", "service-role/AWSGlueServiceRole-default") max_crawling_time = aws_conf.get("athena", {}).get("max_crawling_time", 600) assert db_name, "athena:database_name not supplied" @@ -792,20 +684,12 @@ def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix): s3_path = f"s3://{s3_bucket}/{s3_prefix}" n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix))) if n_existing_files == 0: - logger.warning( - f"There are no files in {s3_path}, Athena tables will not be created as intended" - ) + logger.warning(f"There are no files in {s3_path}, Athena tables will not be created as intended") return glueClient = boto3.client("glue", region_name=region_name) crawlTarget = { - "S3Targets": [ - { - "Path": s3_path, - "Exclusions": ["**_metadata", "**_common_metadata"], - "SampleSize": 2, - } - ] + "S3Targets": [{"Path": s3_path, "Exclusions": ["**_metadata", "**_common_metadata"], "SampleSize": 2}] } crawler_name = db_name + "_" + tbl_prefix tbl_prefix = tbl_prefix + "_" @@ -824,26 +708,18 @@ def create_crawler(): except glueClient.exceptions.AlreadyExistsException: logger.info(f"Deleting existing crawler: {crawler_name}. 
And creating new one.") glueClient.delete_crawler(Name=crawler_name) - time.sleep( - 1 - ) # A small delay after deleting is required to prevent AlreadyExistsException again + time.sleep(1) # A small delay after deleting is required to prevent AlreadyExistsException again create_crawler() try: - existing_tables = [ - x["Name"] for x in glueClient.get_tables(DatabaseName=db_name)["TableList"] - ] + existing_tables = [x["Name"] for x in glueClient.get_tables(DatabaseName=db_name)["TableList"]] except glueClient.exceptions.EntityNotFoundException: existing_tables = [] to_be_deleted_tables = [x for x in existing_tables if x.startswith(tbl_prefix)] if to_be_deleted_tables: - logger.info( - f"Deleting existing tables in db {db_name}: {to_be_deleted_tables}. And creating new ones." - ) - glueClient.batch_delete_table( - DatabaseName=db_name, TablesToDelete=to_be_deleted_tables - ) + logger.info(f"Deleting existing tables in db {db_name}: {to_be_deleted_tables}. And creating new ones.") + glueClient.batch_delete_table(DatabaseName=db_name, TablesToDelete=to_be_deleted_tables) glueClient.start_crawler(Name=crawler_name) logger.info("Crawler started") @@ -863,9 +739,7 @@ def create_crawler(): logger.debug("Waiting for crawler to stop") else: assert crawler_state == "READY" - metrics = glueClient.get_crawler_metrics(CrawlerNameList=[crawler_name])[ - "CrawlerMetricsList" - ][0] + metrics = glueClient.get_crawler_metrics(CrawlerNameList=[crawler_name])["CrawlerMetricsList"][0] logger.info(f"Crawler has completed running. It is {crawler_state}.") logger.info( f"TablesCreated: {metrics['TablesCreated']} " @@ -879,7 +753,5 @@ def create_crawler(): try: glueClient.delete_crawler(Name=crawler_name) except botocore.exceptions.ClientError as error: - logger.error( - f"Could not delete crawler {crawler_name}. Please delete it manually from the AWS console." - ) + logger.error(f"Could not delete crawler {crawler_name}. Please delete it manually from the AWS console.") raise error diff --git a/buildstockbatch/sampler/base.py b/buildstockbatch/sampler/base.py index 41585a0a..4757a837 100644 --- a/buildstockbatch/sampler/base.py +++ b/buildstockbatch/sampler/base.py @@ -46,20 +46,14 @@ def __init__(self, parent): :param parent: The BuildStockBatchBase object that owns this sampler. """ - self.parent = weakref.ref( - parent - ) # This removes circular references and allows garbage collection to work. + self.parent = weakref.ref(parent) # This removes circular references and allows garbage collection to work. 
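# A minimal, self-contained sketch of why the weak back-reference above works:
# the parent owns the sampler, the sampler only weakly references the parent,
# so dropping the parent actually frees it. Batch/Sampler here are
# hypothetical stand-ins for the real classes.
import weakref

class Sampler:
    def __init__(self, parent):
        self.parent = weakref.ref(parent)  # dereference later via self.parent()

class Batch:
    def __init__(self):
        self.sampler = Sampler(self)  # strong parent -> child reference only

b = Batch()
s = b.sampler
assert s.parent() is b     # while the parent is alive, parent() returns it
del b                      # drop the only strong reference to the Batch
assert s.parent() is None  # on CPython it is reclaimed at once; no leak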
if self.container_runtime in ( ContainerRuntime.DOCKER, ContainerRuntime.LOCAL_OPENSTUDIO, ): - self.csv_path = os.path.join( - self.project_dir, "housing_characteristics", "buildstock.csv" - ) + self.csv_path = os.path.join(self.project_dir, "housing_characteristics", "buildstock.csv") elif self.container_runtime == ContainerRuntime.APPTAINER: - self.csv_path = os.path.join( - self.parent().output_dir, "housing_characteristics", "buildstock.csv" - ) + self.csv_path = os.path.join(self.parent().output_dir, "housing_characteristics", "buildstock.csv") else: self.csv_path = None diff --git a/buildstockbatch/sampler/commercial_sobol.py b/buildstockbatch/sampler/commercial_sobol.py index aff5563e..e7ac35cd 100644 --- a/buildstockbatch/sampler/commercial_sobol.py +++ b/buildstockbatch/sampler/commercial_sobol.py @@ -62,10 +62,7 @@ def validate_args(cls, project_filename, **kw): else: raise ValidationError(f"Unknown argument for sampler: {k}") if len(expected_args) > 0: - raise ValidationError( - "The following sampler arguments are required: " - + ", ".join(expected_args) - ) + raise ValidationError("The following sampler arguments are required: " + ", ".join(expected_args)) return True def run_sampling(self): @@ -87,15 +84,11 @@ def run_sampling(self): for tsv_file in os.listdir(self.buildstock_dir): if ".tsv" in tsv_file: tsv_df = read_csv(os.path.join(self.buildstock_dir, tsv_file), sep="\t") - dependency_columns = [ - item for item in list(tsv_df) if "Dependency=" in item - ] + dependency_columns = [item for item in list(tsv_df) if "Dependency=" in item] tsv_df[dependency_columns] = tsv_df[dependency_columns].astype("str") tsv_hash[tsv_file.replace(".tsv", "")] = tsv_df dependency_hash, attr_order = self._com_order_tsvs(tsv_hash) - sample_matrix = self._com_execute_sobol_sampling( - attr_order.__len__(), sample_number - ) + sample_matrix = self._com_execute_sobol_sampling(attr_order.__len__(), sample_number) csv_path = self.csv_path header = "Building," for item in attr_order: @@ -131,9 +124,7 @@ def _com_execute_sobol_sampling(n_dims, n_samples): :param n_samples: Number of samples to calculate :return: Pandas DataFrame object which contains the low discrepancy result of the sobol algorithm """ - return pd.DataFrame(i4_sobol_generate(n_dims, n_samples, 0)).replace( - 1.0, 0.999999 - ) + return pd.DataFrame(i4_sobol_generate(n_dims, n_samples, 0)).replace(1.0, 0.999999) @staticmethod def _com_order_tsvs(tsv_hash): @@ -146,9 +137,7 @@ def _com_order_tsvs(tsv_hash): dependency_hash = {} for attr in tsv_hash.keys(): dependency_hash[attr] = [ - item.replace("Dependency=", "") - for item in list(tsv_hash[attr]) - if "Dependency=" in item + item.replace("Dependency=", "") for item in list(tsv_hash[attr]) if "Dependency=" in item ] attr_order = [] for attr in dependency_hash.keys(): @@ -170,9 +159,7 @@ def _com_order_tsvs(tsv_hash): elif max_iterations > 0: max_iterations -= 1 else: - raise RuntimeError( - "Unable to resolve the dependency tree within the set iteration limit" - ) + raise RuntimeError("Unable to resolve the dependency tree within the set iteration limit") return dependency_hash, attr_order @staticmethod @@ -206,8 +193,7 @@ def _com_execute_sample( tsv_dist_val = sample_vector[attr_index] for dependency in sample_dependency_hash[attr]: tsv_lkup = tsv_lkup.loc[ - tsv_lkup.loc[:, "Dependency=" + dependency] - == sample_dependency_hash[dependency] + tsv_lkup.loc[:, "Dependency=" + dependency] == sample_dependency_hash[dependency] ] tsv_lkup = tsv_lkup.drop("Dependency=" + dependency, 
axis=1) if tsv_lkup.shape[0] == 0: @@ -218,17 +204,9 @@ def _com_execute_sample( ) return if tsv_lkup.shape[0] != 1: - raise RuntimeError( - "Unable to reduce tsv for {} to 1 row, index {}".format( - attr, sample_index - ) - ) + raise RuntimeError("Unable to reduce tsv for {} to 1 row, index {}".format(attr, sample_index)) tsv_lkup_cdf = tsv_lkup.values.cumsum() > tsv_dist_val - option_values = [ - item.replace("Option=", "") - for item in list(tsv_lkup) - if "Option=" in item - ] + option_values = [item.replace("Option=", "") for item in list(tsv_lkup) if "Option=" in item] attr_result = list(compress(option_values, tsv_lkup_cdf))[0] sample_dependency_hash[attr] = attr_result result_vector.append(attr_result) diff --git a/buildstockbatch/sampler/downselect.py b/buildstockbatch/sampler/downselect.py index a634ea4e..64375ab7 100644 --- a/buildstockbatch/sampler/downselect.py +++ b/buildstockbatch/sampler/downselect.py @@ -43,11 +43,7 @@ def __init__(self, parent, n_datapoints, logic, resample=True, **kw): """ super().__init__(parent) self.validate_args( - self.parent().project_filename, - n_datapoints=n_datapoints, - logic=logic, - resample=resample, - **kw + self.parent().project_filename, n_datapoints=n_datapoints, logic=logic, resample=resample, **kw ) self.logic = logic self.resample = resample @@ -70,10 +66,7 @@ def validate_args(cls, project_filename, **kw): else: extra_kw[k] = v if len(expected_args) > 0: - raise ValidationError( - "The following sampler arguments are required: " - + ", ".join(expected_args) - ) + raise ValidationError("The following sampler arguments are required: " + ", ".join(expected_args)) cls.SUB_SAMPLER_CLASS.validate_args(project_filename, **extra_kw) return True @@ -106,31 +99,21 @@ def downselect_logic(cls, df, logic): def run_sampling(self): if self.resample: - logger.debug( - "Performing initial sampling to figure out number of samples for downselect" - ) + logger.debug("Performing initial sampling to figure out number of samples for downselect") n_samples_init = 350000 - init_sampler = self.SUB_SAMPLER_CLASS( - self.parent(), n_datapoints=n_samples_init, **self.sub_kw - ) + init_sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples_init, **self.sub_kw) buildstock_csv_filename = init_sampler.run_sampling() df = read_csv(buildstock_csv_filename, index_col=0, dtype=str) df_new = df[self.downselect_logic(df, self.logic)] downselected_n_samples_init = df_new.shape[0] - n_samples = math.ceil( - self.n_datapoints * n_samples_init / downselected_n_samples_init - ) + n_samples = math.ceil(self.n_datapoints * n_samples_init / downselected_n_samples_init) os.remove(buildstock_csv_filename) del init_sampler else: n_samples = self.n_datapoints - sampler = self.SUB_SAMPLER_CLASS( - self.parent(), n_datapoints=n_samples, **self.sub_kw - ) + sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples, **self.sub_kw) buildstock_csv_filename = sampler.run_sampling() - with gzip.open( - os.path.splitext(buildstock_csv_filename)[0] + "_orig.csv.gz", "wb" - ) as f_out: + with gzip.open(os.path.splitext(buildstock_csv_filename)[0] + "_orig.csv.gz", "wb") as f_out: with open(buildstock_csv_filename, "rb") as f_in: shutil.copyfileobj(f_in, f_out) df = read_csv(buildstock_csv_filename, index_col=0, dtype="str") diff --git a/buildstockbatch/sampler/residential_quota.py b/buildstockbatch/sampler/residential_quota.py index 7eb270e8..73d9b185 100644 --- a/buildstockbatch/sampler/residential_quota.py +++ b/buildstockbatch/sampler/residential_quota.py @@ 
-50,10 +50,7 @@ def validate_args(cls, project_filename, **kw): else: raise ValidationError(f"Unknown argument for sampler: {k}") if len(expected_args) > 0: - raise ValidationError( - "The following sampler arguments are required: " - + ", ".join(expected_args) - ) + raise ValidationError("The following sampler arguments are required: " + ", ".join(expected_args)) return True def _run_sampling_docker(self): @@ -75,9 +72,7 @@ def _run_sampling_docker(self): "buildstock.csv", ], remove=True, - volumes={ - self.buildstock_dir: {"bind": "/var/simdata/openstudio", "mode": "rw"} - }, + volumes={self.buildstock_dir: {"bind": "/var/simdata/openstudio", "mode": "rw"}}, name="buildstock_sampling", **extra_kws, ) diff --git a/buildstockbatch/test/conftest.py b/buildstockbatch/test/conftest.py index 53fc02aa..6a33e507 100644 --- a/buildstockbatch/test/conftest.py +++ b/buildstockbatch/test/conftest.py @@ -11,9 +11,7 @@ def basic_residential_project_file(): with tempfile.TemporaryDirectory() as test_directory: - def _basic_residential_project_file( - update_args={}, raw=False, hpc_name="eagle" - ): + def _basic_residential_project_file(update_args={}, raw=False, hpc_name="eagle"): output_dir = "simulations_job0" if raw else "simulation_output" buildstock_directory = os.path.join(test_directory, "openstudio_buildstock") shutil.copytree( @@ -37,22 +35,14 @@ def _basic_residential_project_file( ) # move the job*.json file to appropriate location - if os.path.exists( - os.path.join(output_directory, "simulation_output", "job0.json") - ): + if os.path.exists(os.path.join(output_directory, "simulation_output", "job0.json")): shutil.move( os.path.join(output_directory, "simulation_output", "job0.json"), - os.path.join( - output_directory, "simulation_output", "..", "..", "job0.json" - ), + os.path.join(output_directory, "simulation_output", "..", "..", "job0.json"), ) os.mkdir(os.path.join(output_directory, "housing_characteristics")) - os.mkdir( - os.path.join( - buildstock_directory, project_directory, "housing_characteristics" - ) - ) + os.mkdir(os.path.join(buildstock_directory, project_directory, "housing_characteristics")) weather_file_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), "test_inputs", diff --git a/buildstockbatch/test/shared_testing_stuff.py b/buildstockbatch/test/shared_testing_stuff.py index 479f439e..93470776 100644 --- a/buildstockbatch/test/shared_testing_stuff.py +++ b/buildstockbatch/test/shared_testing_stuff.py @@ -11,9 +11,7 @@ pathlib.Path(__file__).resolve().parent.parent.parent.parent / "resstock", ) ) -resstock_required = pytest.mark.skipif( - not resstock_directory.exists(), reason="ResStock checkout is not found" -) +resstock_required = pytest.mark.skipif(not resstock_directory.exists(), reason="ResStock checkout is not found") def check_docker_available(): @@ -25,6 +23,4 @@ def check_docker_available(): return True -docker_available = pytest.mark.skipif( - not check_docker_available(), reason="Docker isn't running on this machine" -) +docker_available = pytest.mark.skipif(not check_docker_available(), reason="Docker isn't running on this machine") diff --git a/buildstockbatch/test/test_base.py b/buildstockbatch/test/test_base.py index 52019ce9..5dbefcbd 100644 --- a/buildstockbatch/test/test_base.py +++ b/buildstockbatch/test/test_base.py @@ -45,25 +45,16 @@ def test_reference_scenario(basic_residential_project_file): with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( BuildStockBatchBase, "get_dask_client" - ) as get_dask_client_mock, 
patch.object( - BuildStockBatchBase, "results_dir", results_dir - ): + ) as get_dask_client_mock, patch.object(BuildStockBatchBase, "results_dir", results_dir): bsb = BuildStockBatchBase(project_filename) bsb.process_results() get_dask_client_mock.assert_called_once() # test results.csv files test_path = os.path.join(results_dir, "results_csvs") - test_csv = ( - read_csv(os.path.join(test_path, "results_up01.csv.gz")) - .set_index("building_id") - .sort_index() - ) + test_csv = read_csv(os.path.join(test_path, "results_up01.csv.gz")).set_index("building_id").sort_index() assert len(test_csv["apply_upgrade.reference_scenario"].unique()) == 1 - assert ( - test_csv["apply_upgrade.reference_scenario"].iloc[0] - == "example_reference_scenario" - ) + assert test_csv["apply_upgrade.reference_scenario"].iloc[0] == "example_reference_scenario" def test_downselect_integer_options(basic_residential_project_file, mocker): @@ -80,9 +71,7 @@ def test_downselect_integer_options(basic_residential_project_file, mocker): col_idx = row.index("Days Shifted") else: # Convert values from "Day1" to "1.10" so we hit the bug - row[col_idx] = "{0}.{0}0".format( - re.search(r"Day(\d+)", row[col_idx]).group(1) - ) + row[col_idx] = "{0}.{0}0".format(re.search(r"Day(\d+)", row[col_idx]).group(1)) valid_option_values.add(row[col_idx]) cf_out.writerow(row) @@ -100,9 +89,7 @@ def test_downselect_integer_options(basic_residential_project_file, mocker): ) mocker.patch.object(BuildStockBatchBase, "weather_dir", None) mocker.patch.object(BuildStockBatchBase, "results_dir", results_dir) - sampler_property_mock = mocker.patch.object( - BuildStockBatchBase, "sampler", new_callable=PropertyMock - ) + sampler_property_mock = mocker.patch.object(BuildStockBatchBase, "sampler", new_callable=PropertyMock) sampler_mock = mocker.MagicMock() sampler_property_mock.return_value = sampler_mock sampler_mock.run_sampling = MagicMock(return_value=buildstock_csv) @@ -143,9 +130,7 @@ def test_upload_files(mocker, basic_residential_project_file): } mocked_glueclient = MagicMock() mocked_glueclient.get_crawler = MagicMock( - return_value={ - "Crawler": {"State": "READY", "LastCrawl": {"Status": "SUCCEEDED"}} - } + return_value={"Crawler": {"State": "READY", "LastCrawl": {"Status": "SUCCEEDED"}}} ) mocked_boto3.client = MagicMock(return_value=mocked_glueclient) mocked_boto3.resource().Bucket().objects.filter.side_effect = [[], ["a", "b", "c"]] @@ -158,19 +143,14 @@ def test_upload_files(mocker, basic_residential_project_file): / "buildstock.csv" ) # noqa: E501 shutil.copy2( - Path(__file__).parent - / "test_results" - / "housing_characteristics" - / "buildstock.csv", + Path(__file__).parent / "test_results" / "housing_characteristics" / "buildstock.csv", buildstock_csv_path, ) mocker.patch.object(BuildStockBatchBase, "weather_dir", None) mocker.patch.object(BuildStockBatchBase, "output_dir", results_dir) get_dask_client_mock = mocker.patch.object(BuildStockBatchBase, "get_dask_client") mocker.patch.object(BuildStockBatchBase, "results_dir", results_dir) - mocker.patch.object( - BuildStockBatchBase, "CONTAINER_RUNTIME", ContainerRuntime.LOCAL_OPENSTUDIO - ) + mocker.patch.object(BuildStockBatchBase, "CONTAINER_RUNTIME", ContainerRuntime.LOCAL_OPENSTUDIO) bsb = BuildStockBatchBase(project_filename) bsb.process_results() @@ -193,25 +173,13 @@ def test_upload_files(mocker, basic_residential_project_file): if call_function == "create_crawler": crawler_para = call[2] # 2 is for the keyword arguments crawler_created = True - assert ( - 
crawler_para["DatabaseName"] - == upload_config["postprocessing"]["aws"]["athena"]["database_name"] - ) - assert ( - crawler_para["Role"] - == upload_config["postprocessing"]["aws"]["athena"]["glue_service_role"] - ) + assert crawler_para["DatabaseName"] == upload_config["postprocessing"]["aws"]["athena"]["database_name"] + assert crawler_para["Role"] == upload_config["postprocessing"]["aws"]["athena"]["glue_service_role"] assert crawler_para["TablePrefix"] == OUTPUT_FOLDER_NAME + "_" assert crawler_para["Name"] == db_name + "_" + OUTPUT_FOLDER_NAME assert ( crawler_para["Targets"]["S3Targets"][0]["Path"] - == "s3://" - + s3_bucket - + "/" - + s3_prefix - + "/" - + OUTPUT_FOLDER_NAME - + "/" + == "s3://" + s3_bucket + "/" + s3_prefix + "/" + OUTPUT_FOLDER_NAME + "/" ) if call_function == "start_crawler": assert crawler_created, "crawler attempted to start before creating" @@ -231,23 +199,17 @@ def test_upload_files(mocker, basic_residential_project_file): files_uploaded.remove((source_file_path, s3_file_path)) s3_file_path = s3_path + "upgrades/upgrade=1/results_up01.parquet" - source_file_path = os.path.join( - source_path, "upgrades", "upgrade=1", "results_up01.parquet" - ) + source_file_path = os.path.join(source_path, "upgrades", "upgrade=1", "results_up01.parquet") assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) s3_file_path = s3_path + "timeseries/upgrade=0/group0.parquet" - source_file_path = os.path.join( - source_path, "timeseries", "upgrade=0", "group0.parquet" - ) + source_file_path = os.path.join(source_path, "timeseries", "upgrade=0", "group0.parquet") assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) s3_file_path = s3_path + "timeseries/upgrade=1/group0.parquet" - source_file_path = os.path.join( - source_path, "timeseries", "upgrade=1", "group0.parquet" - ) + source_file_path = os.path.join(source_path, "timeseries", "upgrade=1", "group0.parquet") assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) @@ -266,9 +228,7 @@ def test_upload_files(mocker, basic_residential_project_file): assert (source_file_path, s3_file_path) in files_uploaded files_uploaded.remove((source_file_path, s3_file_path)) - assert ( - len(files_uploaded) == 0 - ), f"These files shouldn't have been uploaded: {files_uploaded}" + assert len(files_uploaded) == 0, f"These files shouldn't have been uploaded: {files_uploaded}" def test_write_parquet_no_index(): @@ -289,9 +249,7 @@ def test_skipping_baseline(basic_residential_project_file): ) sim_output_path = os.path.join(results_dir, "simulation_output") - shutil.rmtree( - os.path.join(sim_output_path, "timeseries", "up00") - ) # remove timeseries results for baseline + shutil.rmtree(os.path.join(sim_output_path, "timeseries", "up00")) # remove timeseries results for baseline # remove results.csv data for baseline from results_jobx.json.gz results_json_filename = os.path.join(sim_output_path, "results_job0.json.gz") @@ -311,21 +269,15 @@ def test_skipping_baseline(basic_residential_project_file): # run postprocessing with patch.object(BuildStockBatchBase, "weather_dir", None), patch.object( BuildStockBatchBase, "get_dask_client" - ) as get_dask_client_mock, patch.object( - BuildStockBatchBase, "results_dir", results_dir - ): + ) as get_dask_client_mock, patch.object(BuildStockBatchBase, "results_dir", results_dir): bsb = BuildStockBatchBase(project_filename) 
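# A minimal, self-contained sketch of the patch.object pattern these tests
# lean on: the attribute is swapped for a MagicMock inside the with-block and
# restored on exit, and its calls can be asserted afterwards (compare
# get_dask_client_mock.assert_called_once() below). `Service` is a
# hypothetical stand-in for classes like BuildStockBatchBase.
from unittest.mock import patch

class Service:
    def client(self):
        raise RuntimeError("would hit the network")

    def run(self):
        return self.client()

svc = Service()
with patch.object(Service, "client") as client_mock:
    client_mock.return_value = "stub result"
    assert svc.run() == "stub result"  # patched method, no network access
client_mock.assert_called_once()       # original method is restored on exit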
bsb.process_results() get_dask_client_mock.assert_called_once() - up00_parquet = os.path.join( - results_dir, "parquet", "baseline", "results_up00.parquet" - ) + up00_parquet = os.path.join(results_dir, "parquet", "baseline", "results_up00.parquet") assert not os.path.exists(up00_parquet) - up01_parquet = os.path.join( - results_dir, "parquet", "upgrades", "upgrade=1", "results_up01.parquet" - ) + up01_parquet = os.path.join(results_dir, "parquet", "upgrades", "upgrade=1", "results_up01.parquet") assert os.path.exists(up01_parquet) up00_csv_gz = os.path.join(results_dir, "results_csvs", "results_up00.csv.gz") @@ -348,9 +300,7 @@ def test_provide_buildstock_csv(basic_residential_project_file, mocker): sampling_output_csv = bsb.sampler.run_sampling() df2 = read_csv(sampling_output_csv, dtype=str) pd.testing.assert_frame_equal(df, df2) - assert ( - df["Geometry Shared Walls"] == "None" - ).all() # Verify None is being read properly + assert (df["Geometry Shared Walls"] == "None").all() # Verify None is being read properly # Test file missing with open(project_filename, "r") as f: cfg = yaml.safe_load(f) diff --git a/buildstockbatch/test/test_docker_base.py b/buildstockbatch/test/test_docker_base.py index 4ac1a4fb..88e70f1a 100644 --- a/buildstockbatch/test/test_docker_base.py +++ b/buildstockbatch/test/test_docker_base.py @@ -15,9 +15,7 @@ from buildstockbatch.utils import get_project_configuration here = os.path.dirname(os.path.abspath(__file__)) -resources_dir = os.path.join( - here, "test_inputs", "test_openstudio_buildstock", "resources" -) +resources_dir = os.path.join(here, "test_inputs", "test_openstudio_buildstock", "resources") @docker_available @@ -26,15 +24,11 @@ def test_run_batch_prep(basic_residential_project_file, mocker): project_filename, results_dir = basic_residential_project_file() mocker.patch.object(DockerBatchBase, "results_dir", results_dir) - sampler_property_mock = mocker.patch.object( - DockerBatchBase, "sampler", new_callable=PropertyMock - ) + sampler_property_mock = mocker.patch.object(DockerBatchBase, "sampler", new_callable=PropertyMock) sampler_mock = mocker.MagicMock() sampler_property_mock.return_value = sampler_mock # Hard-coded sampling output includes 5 buildings. - sampler_mock.run_sampling = MagicMock( - return_value=os.path.join(resources_dir, "buildstock_good.csv") - ) + sampler_mock.run_sampling = MagicMock(return_value=os.path.join(resources_dir, "buildstock_good.csv")) dbb = DockerBatchBase(project_filename) dbb.batch_array_size = 3 @@ -50,9 +44,9 @@ def test_run_batch_prep(basic_residential_project_file, mocker): # * "G2601210.epw" and "G2601390.epw" are dupes. 
One should be in # tmppath; one should be copied to the other according to ``epws_to_copy`` assert os.path.isfile(tmppath / "weather" / "G2500210.epw.gz") - assert os.path.isfile( - tmppath / "weather" / "G2601210.epw.gz" - ) or os.path.isfile(tmppath / "weather" / "G2601390.epw.gz") + assert os.path.isfile(tmppath / "weather" / "G2601210.epw.gz") or os.path.isfile( + tmppath / "weather" / "G2601390.epw.gz" + ) src, dest = epws_to_copy[0] assert src in ("G2601210.epw.gz", "G2601390.epw.gz") assert dest in ("G2601210.epw.gz", "G2601390.epw.gz") @@ -65,22 +59,14 @@ def test_run_batch_prep(basic_residential_project_file, mocker): assert batch_info.job_count == 3 jobs_file_path = tmppath / "jobs.tar.gz" with tarfile.open(jobs_file_path, "r") as tar_f: - all_job_files = [ - "jobs", - "jobs/job00000.json", - "jobs/job00001.json", - "jobs/job00002.json", - ] + all_job_files = ["jobs", "jobs/job00000.json", "jobs/job00001.json", "jobs/job00002.json"] assert tar_f.getnames() == all_job_files simulations = [] for filename in all_job_files[1:]: job = json.load(tar_f.extractfile(filename)) assert filename == f"jobs/job{job['job_num']:05d}.json" assert job["n_datapoints"] == 5 # Total number of buildings - assert len(job["batch"]) in ( - 2, - 4, - ) # Number of simulations in this batch + assert len(job["batch"]) in (2, 4) # Number of simulations in this batch simulations.extend(job["batch"]) # Check that all 10 expected simulations are present @@ -102,10 +88,7 @@ def test_get_epws_to_download(): os.makedirs(sim_dir / "lib" / "resources") os.makedirs(sim_dir / "lib" / "housing_characteristics") shutil.copy(options_file, sim_dir / "lib" / "resources") - shutil.copy( - buildstock_file, - sim_dir / "lib" / "housing_characteristics" / "buildstock.csv", - ) + shutil.copy(buildstock_file, sim_dir / "lib" / "housing_characteristics" / "buildstock.csv") jobs_d = { "job_num": 0, @@ -150,15 +133,10 @@ def test_run_simulations(basic_residential_project_file): bucket = temp_path / "bucket" os.makedirs(bucket / "test_prefix" / "results" / "simulation_output") - DockerBatchBase.run_simulations( - cfg, 0, jobs_d, sim_dir, fs, f"{bucket}/test_prefix" - ) + DockerBatchBase.run_simulations(cfg, 0, jobs_d, sim_dir, fs, f"{bucket}/test_prefix") output_dir = bucket / "test_prefix" / "results" / "simulation_output" - assert sorted(os.listdir(output_dir)) == [ - "results_job0.json.gz", - "simulations_job0.tar.gz", - ] + assert sorted(os.listdir(output_dir)) == ["results_job0.json.gz", "simulations_job0.tar.gz"] # Check that buildings 1 and 5 (specified in jobs_d) are in the results with gzip.open(output_dir / "results_job0.json.gz", "r") as f: diff --git a/buildstockbatch/test/test_hpc.py b/buildstockbatch/test/test_hpc.py index e84ca70c..c57a2d88 100644 --- a/buildstockbatch/test/test_hpc.py +++ b/buildstockbatch/test/test_hpc.py @@ -9,13 +9,7 @@ from unittest.mock import patch import gzip -from buildstockbatch.hpc import ( - eagle_cli, - kestrel_cli, - EagleBatch, - KestrelBatch, - SlurmBatch, -) # noqa: F401 +from buildstockbatch.hpc import eagle_cli, kestrel_cli, EagleBatch, KestrelBatch, SlurmBatch # noqa: F401 from buildstockbatch.base import BuildStockBatchBase from buildstockbatch.utils import get_project_configuration, read_csv @@ -25,15 +19,10 @@ @patch("buildstockbatch.hpc.subprocess") def test_hpc_run_building(mock_subprocess, monkeypatch, basic_residential_project_file): tar_filename = ( - pathlib.Path(__file__).resolve().parent - / "test_results" - / "simulation_output" - / "simulations_job0.tar.gz" + 
pathlib.Path(__file__).resolve().parent / "test_results" / "simulation_output" / "simulations_job0.tar.gz" ) # noqa E501 with tarfile.open(tar_filename, "r") as tarf: - osw_dict = json.loads( - tarf.extractfile("up00/bldg0000001/in.osw").read().decode("utf-8") - ) + osw_dict = json.loads(tarf.extractfile("up00/bldg0000001/in.osw").read().decode("utf-8")) project_filename, results_dir = basic_residential_project_file() tmp_path = pathlib.Path(results_dir).parent @@ -44,9 +33,7 @@ def test_hpc_run_building(mock_subprocess, monkeypatch, basic_residential_projec with patch.object(KestrelBatch, "weather_dir", None), patch.object( KestrelBatch, "create_osw", return_value=osw_dict - ), patch.object( - KestrelBatch, "make_sim_dir", return_value=("bldg0000001up00", sim_path) - ), patch.object( + ), patch.object(KestrelBatch, "make_sim_dir", return_value=("bldg0000001up00", sim_path)), patch.object( KestrelBatch, "local_scratch", tmp_path ): # Normal run @@ -126,15 +113,11 @@ def _test_env_vars_passed(mock_subprocess, hpc_name): @pytest.mark.parametrize("hpc_name", ["eagle", "kestrel"]) def test_user_cli(basic_residential_project_file, monkeypatch, mocker, hpc_name): mock_subprocess = mocker.patch("buildstockbatch.hpc.subprocess") - mock_validate_apptainer_image = mocker.patch( - "buildstockbatch.hpc.SlurmBatch.validate_apptainer_image_hpc" - ) + mock_validate_apptainer_image = mocker.patch("buildstockbatch.hpc.SlurmBatch.validate_apptainer_image_hpc") mock_validate_output_directory = mocker.patch( f"buildstockbatch.hpc.{hpc_name.capitalize()}Batch.validate_output_directory_{hpc_name}" ) - mock_validate_options = mocker.patch( - "buildstockbatch.base.BuildStockBatchBase.validate_options_lookup" - ) + mock_validate_options = mocker.patch("buildstockbatch.base.BuildStockBatchBase.validate_options_lookup") mock_validate_options.return_value = True mock_validate_output_directory.return_value = True @@ -206,15 +189,11 @@ def test_user_cli(basic_residential_project_file, monkeypatch, mocker, hpc_name) @pytest.mark.parametrize("hpc_name", ["eagle", "kestrel"]) -def test_qos_high_job_submit( - basic_residential_project_file, monkeypatch, mocker, hpc_name -): +def test_qos_high_job_submit(basic_residential_project_file, monkeypatch, mocker, hpc_name): mock_subprocess = mocker.patch("buildstockbatch.hpc.subprocess") mock_subprocess.run.return_value.stdout = "Submitted batch job 1\n" mock_subprocess.PIPE = None - mocker.patch.object( - SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif" - ) + mocker.patch.object(SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif") Batch = eval(f"{hpc_name.capitalize()}Batch") mocker.patch.object(SlurmBatch, "weather_dir", None) project_filename, results_dir = basic_residential_project_file(hpc_name=hpc_name) @@ -246,15 +225,11 @@ def test_qos_high_job_submit( @pytest.mark.parametrize("hpc_name", ["eagle", "kestrel"]) -def test_queue_jobs_minutes_per_sim( - mocker, basic_residential_project_file, monkeypatch, hpc_name -): +def test_queue_jobs_minutes_per_sim(mocker, basic_residential_project_file, monkeypatch, hpc_name): mock_subprocess = mocker.patch("buildstockbatch.hpc.subprocess") Batch = eval(f"{hpc_name.capitalize()}Batch") mocker.patch.object(Batch, "weather_dir", None) - mocker.patch.object( - SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif" - ) + mocker.patch.object(SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif") mock_subprocess.run.return_value.stdout = "Submitted batch job 
1\n" mock_subprocess.PIPE = None project_filename, results_dir = basic_residential_project_file( @@ -305,14 +280,10 @@ def test_run_building_process(mocker, basic_residential_project_file): with open(results_dir / "job001.json", "w") as f: json.dump(job_json, f) - sample_buildstock_csv = pd.DataFrame.from_records( - [{"Building": i, "Dummy Column": i * i} for i in range(10)] - ) + sample_buildstock_csv = pd.DataFrame.from_records([{"Building": i, "Dummy Column": i * i} for i in range(10)]) os.makedirs(results_dir / "housing_characteristics", exist_ok=True) os.makedirs(results_dir / "weather", exist_ok=True) - sample_buildstock_csv.to_csv( - results_dir / "housing_characteristics" / "buildstock.csv", index=False - ) + sample_buildstock_csv.to_csv(results_dir / "housing_characteristics" / "buildstock.csv", index=False) def sequential_parallel(**kwargs): kw2 = kwargs.copy() @@ -323,15 +294,9 @@ def sequential_parallel(**kwargs): rmtree_mock = mocker.patch("buildstockbatch.hpc.shutil.rmtree") mocker.patch("buildstockbatch.hpc.Parallel", sequential_parallel) mocker.patch("buildstockbatch.hpc.subprocess") - mocker.patch.object( - SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif" - ) - mocker.patch.object( - KestrelBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir" - ) - mocker.patch.object( - KestrelBatch, "local_weather_dir", results_dir / "local_weather_dir" - ) + mocker.patch.object(SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif") + mocker.patch.object(KestrelBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir") + mocker.patch.object(KestrelBatch, "local_weather_dir", results_dir / "local_weather_dir") mocker.patch.object(KestrelBatch, "local_output_dir", results_dir) mocker.patch.object( KestrelBatch, @@ -344,20 +309,14 @@ def sequential_parallel(**kwargs): def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=False): real_upgrade_idx = 0 if upgrade_idx is None else upgrade_idx + 1 sim_id = f"bldg{building_id:07d}up{real_upgrade_idx:02d}" - sim_dir = os.path.join( - base_dir, f"up{real_upgrade_idx:02d}", f"bldg{building_id:07d}" - ) + sim_dir = os.path.join(base_dir, f"up{real_upgrade_idx:02d}", f"bldg{building_id:07d}") return sim_id, sim_dir mocker.patch.object(KestrelBatch, "make_sim_dir", make_sim_dir_mock) - sampler_prop_mock = mocker.patch.object( - KestrelBatch, "sampler", new_callable=mocker.PropertyMock - ) + sampler_prop_mock = mocker.patch.object(KestrelBatch, "sampler", new_callable=mocker.PropertyMock) sampler_mock = mocker.MagicMock() sampler_prop_mock.return_value = sampler_mock - sampler_mock.csv_path = ( - results_dir.parent / "housing_characteristic2" / "buildstock.csv" - ) + sampler_mock.csv_path = results_dir.parent / "housing_characteristic2" / "buildstock.csv" sampler_mock.run_sampling = mocker.MagicMock(return_value="buildstock.csv") b = KestrelBatch(project_filename) @@ -370,19 +329,11 @@ def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=Fal rmtree_mock.assert_any_call(b.local_housing_characteristics_dir) # check results job-json - refrence_path = ( - pathlib.Path(__file__).resolve().parent / "test_results" / "reference_files" - ) + refrence_path = pathlib.Path(__file__).resolve().parent / "test_results" / "reference_files" - refrence_list = json.loads( - gzip.open(refrence_path / "results_job1.json.gz", "r").read() - ) + refrence_list = json.loads(gzip.open(refrence_path / "results_job1.json.gz", "r").read()) - output_list = json.loads( 
- gzip.open( - results_dir / "simulation_output" / "results_job1.json.gz", "r" - ).read() - ) + output_list = json.loads(gzip.open(results_dir / "simulation_output" / "results_job1.json.gz", "r").read()) refrence_list = [json.dumps(d) for d in refrence_list] output_list = [json.dumps(d) for d in output_list] @@ -392,35 +343,16 @@ def make_sim_dir_mock(building_id, upgrade_idx, base_dir, overwrite_existing=Fal ts_files = list(refrence_path.glob("**/*.parquet")) def compare_ts_parquets(source, dst): - test_pq = ( - pd.read_parquet(source) - .reset_index() - .drop(columns=["index"]) - .rename(columns=str.lower) - ) - reference_pq = ( - pd.read_parquet(dst) - .reset_index() - .drop(columns=["index"]) - .rename(columns=str.lower) - ) + test_pq = pd.read_parquet(source).reset_index().drop(columns=["index"]).rename(columns=str.lower) + reference_pq = pd.read_parquet(dst).reset_index().drop(columns=["index"]).rename(columns=str.lower) pd.testing.assert_frame_equal(test_pq, reference_pq) for file in ts_files: - results_file = ( - results_dir - / "results" - / "simulation_output" - / "timeseries" - / file.parent.name - / file.name - ) + results_file = results_dir / "results" / "simulation_output" / "timeseries" / file.parent.name / file.name compare_ts_parquets(file, results_file) # Check that buildstock.csv was trimmed properly - local_buildstock_df = read_csv( - results_dir / "local_housing_characteristics_dir" / "buildstock.csv", dtype=str - ) + local_buildstock_df = read_csv(results_dir / "local_housing_characteristics_dir" / "buildstock.csv", dtype=str) unique_buildings = {str(x[0]) for x in job_json["batch"]} assert len(unique_buildings) == len(local_buildstock_df) assert unique_buildings == set(local_buildstock_df["Building"]) @@ -434,15 +366,11 @@ def test_run_building_error_caught(mocker, basic_residential_project_file): with open(results_dir / "job001.json", "w") as f: json.dump(job_json, f) - sample_buildstock_csv = pd.DataFrame.from_records( - [{"Building": i, "Dummy Column": i * i} for i in range(10)] - ) + sample_buildstock_csv = pd.DataFrame.from_records([{"Building": i, "Dummy Column": i * i} for i in range(10)]) os.makedirs(results_dir / "housing_characteristics", exist_ok=True) os.makedirs(results_dir / "local_housing_characteristics", exist_ok=True) os.makedirs(results_dir / "weather", exist_ok=True) - sample_buildstock_csv.to_csv( - results_dir / "housing_characteristics" / "buildstock.csv", index=False - ) + sample_buildstock_csv.to_csv(results_dir / "housing_characteristics" / "buildstock.csv", index=False) def raise_error(*args, **kwargs): raise RuntimeError("A problem happened") @@ -456,18 +384,12 @@ def sequential_parallel(**kwargs): mocker.patch("buildstockbatch.hpc.shutil.rmtree") mocker.patch("buildstockbatch.hpc.Parallel", sequential_parallel) mocker.patch("buildstockbatch.hpc.subprocess") - mocker.patch.object( - SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif" - ) + mocker.patch.object(SlurmBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif") mocker.patch.object(KestrelBatch, "run_building", raise_error) mocker.patch.object(KestrelBatch, "local_output_dir", results_dir) mocker.patch.object(KestrelBatch, "results_dir", results_dir) - mocker.patch.object( - KestrelBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir" - ) - mocker.patch.object( - KestrelBatch, "local_weather_dir", results_dir / "local_weather_dir" - ) + mocker.patch.object(KestrelBatch, "local_buildstock_dir", results_dir / "local_buildstock_dir") + 
mocker.patch.object(KestrelBatch, "local_weather_dir", results_dir / "local_weather_dir") mocker.patch.object( KestrelBatch, "local_housing_characteristics_dir", @@ -491,15 +413,9 @@ def test_rerun_failed_jobs(mocker, basic_residential_project_file): mocker.patch.object(KestrelBatch, "weather_dir", None) mocker.patch.object(KestrelBatch, "results_dir", results_dir) process_results_mocker = mocker.patch.object(BuildStockBatchBase, "process_results") - queue_jobs_mocker = mocker.patch.object( - KestrelBatch, "queue_jobs", return_value=[42] - ) - queue_post_processing_mocker = mocker.patch.object( - KestrelBatch, "queue_post_processing" - ) - mocker.patch.object( - KestrelBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif" - ) + queue_jobs_mocker = mocker.patch.object(KestrelBatch, "queue_jobs", return_value=[42]) + queue_post_processing_mocker = mocker.patch.object(KestrelBatch, "queue_post_processing") + mocker.patch.object(KestrelBatch, "get_apptainer_image", return_value="/path/to/openstudio.sif") b = KestrelBatch(project_filename) diff --git a/buildstockbatch/test/test_local.py b/buildstockbatch/test/test_local.py index bf01d0c3..6c409598 100644 --- a/buildstockbatch/test/test_local.py +++ b/buildstockbatch/test/test_local.py @@ -44,11 +44,7 @@ def test_resstock_local_batch(project_filename): n_datapoints = 2 batch.cfg["sampler"]["args"]["n_datapoints"] = n_datapoints - local_weather_file = ( - resstock_directory.parent - / "weather" - / batch.cfg["weather_files_url"].split("/")[-1] - ) + local_weather_file = resstock_directory.parent / "weather" / batch.cfg["weather_files_url"].split("/")[-1] if local_weather_file.exists(): del batch.cfg["weather_files_url"] batch.cfg["weather_files_path"] = str(local_weather_file) @@ -63,12 +59,7 @@ def test_resstock_local_batch(project_filename): for upgrade_id in range(0, n_upgrades + 1): for bldg_id in range(1, n_datapoints + 1): - assert ( - simout_path - / "timeseries" - / f"up{upgrade_id:02d}" - / f"bldg{bldg_id:07d}.parquet" - ).exists() + assert (simout_path / "timeseries" / f"up{upgrade_id:02d}" / f"bldg{bldg_id:07d}.parquet").exists() batch.process_results() @@ -77,9 +68,7 @@ def test_resstock_local_batch(project_filename): assert (simout_path / "simulations_job0.tar.gz").exists() base_pq = out_path / "parquet" / "baseline" / "results_up00.parquet" assert base_pq.exists() - base = pd.read_parquet( - base_pq, columns=["completed_status", "started_at", "completed_at"] - ) + base = pd.read_parquet(base_pq, columns=["completed_status", "started_at", "completed_at"]) assert (base["completed_status"] == "Success").all() assert base.dtypes["started_at"] == "timestamp[s][pyarrow]" assert base.dtypes["completed_at"] == "timestamp[s][pyarrow]" @@ -92,17 +81,9 @@ def test_resstock_local_batch(project_filename): tsdf = pd.read_parquet(ts_pq_filename, columns=ts_time_cols) for col in tsdf.columns: assert tsdf[col].dtype == "timestamp[s][pyarrow]" - assert ( - out_path / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz" - ).exists() + assert (out_path / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz").exists() if upgrade_id >= 1: - upg_pq = ( - out_path - / "parquet" - / "upgrades" - / f"upgrade={upgrade_id}" - / f"results_up{upgrade_id:02d}.parquet" - ) + upg_pq = out_path / "parquet" / "upgrades" / f"upgrade={upgrade_id}" / f"results_up{upgrade_id:02d}.parquet" assert upg_pq.exists() upg = pd.read_parquet(upg_pq, columns=["completed_status"]) assert (upg["completed_status"] == "Success").all() @@ -122,9 +103,7 @@ def 
mocked_subprocess_run(run_cmd, **kwargs): mocker.patch("buildstockbatch.local.subprocess.run", mocked_subprocess_run) sleep_mock = mocker.patch("buildstockbatch.local.time.sleep") - cfg = get_project_configuration( - resstock_directory / "project_national" / "national_baseline.yml" - ) + cfg = get_project_configuration(resstock_directory / "project_national" / "national_baseline.yml") cfg["max_minutes_per_sim"] = 5 with tempfile.TemporaryDirectory() as tmpdir: @@ -153,9 +132,7 @@ def mocked_subprocess_run(run_cmd, **kwargs): assert out_osw["completed_status"] == "Fail" assert msg_re.search(out_osw["timeout"]) - err_log_re = re.compile( - r"\[\d\d:\d\d:\d\d ERROR\] Terminated \w+ after reaching max time" - ) + err_log_re = re.compile(r"\[\d\d:\d\d:\d\d ERROR\] Terminated \w+ after reaching max time") with open(sim_path / "run" / "run.log", "r") as run_log: err_log_re.search(run_log.read()) with open(sim_path / "run" / "failed.job", "r") as failed_job: diff --git a/buildstockbatch/test/test_postprocessing.py b/buildstockbatch/test/test_postprocessing.py index d11fdb86..667faa7f 100644 --- a/buildstockbatch/test/test_postprocessing.py +++ b/buildstockbatch/test/test_postprocessing.py @@ -19,9 +19,7 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): reporting_measures = ["ReportingMeasure1", "ReportingMeasure2"] - project_filename, results_dir = basic_residential_project_file( - {"reporting_measures": reporting_measures} - ) + project_filename, results_dir = basic_residential_project_file({"reporting_measures": reporting_measures}) fs = LocalFileSystem() @@ -42,11 +40,7 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): sim_dir = str(filename.parent.parent) upgrade_id = int(re.search(r"up(\d+)", sim_dir).group(1)) building_id = int(re.search(r"bldg(\d+)", sim_dir).group(1)) - dpouts2.append( - postprocessing.read_simulation_outputs( - fs, reporting_measures, sim_dir, upgrade_id, building_id - ) - ) + dpouts2.append(postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, building_id)) with gzip.open(sim_out_dir / "results_job0.json.gz", "wt", encoding="utf-8") as f: json.dump(dpouts2, f) @@ -56,9 +50,7 @@ def test_report_additional_results_csv_columns(basic_residential_project_file): postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False) for upgrade_id in (0, 1): - df = read_csv( - str(results_dir / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz") - ) + df = read_csv(str(results_dir / "results_csvs" / f"results_up{upgrade_id:02d}.csv.gz")) assert (df["reporting_measure1.column_1"] == 1).all() assert (df["reporting_measure1.column_2"] == 2).all() assert (df["reporting_measure2.column_3"] == 3).all() @@ -74,9 +66,7 @@ def test_empty_results_assertion(basic_residential_project_file, capsys): shutil.rmtree(sim_out_dir) # no results cfg = get_project_configuration(project_filename) - with pytest.raises( - ValueError, match=r"No simulation results found to post-process" - ): + with pytest.raises(ValueError, match=r"No simulation results found to post-process"): assert postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False) @@ -96,9 +86,7 @@ def test_large_parquet_combine(basic_residential_project_file): @pytest.mark.parametrize("keep_individual_timeseries", [True, False]) -def test_keep_individual_timeseries( - keep_individual_timeseries, basic_residential_project_file, mocker -): +def test_keep_individual_timeseries(keep_individual_timeseries, 
basic_residential_project_file, mocker): project_filename, results_dir = basic_residential_project_file( {"postprocessing": {"keep_individual_timeseries": keep_individual_timeseries}} ) @@ -122,9 +110,7 @@ def test_upgrade_missing_ts(basic_residential_project_file, mocker, caplog): project_filename, results_dir = basic_residential_project_file() results_path = pathlib.Path(results_dir) - for filename in (results_path / "simulation_output" / "timeseries" / "up01").glob( - "*.parquet" - ): + for filename in (results_path / "simulation_output" / "timeseries" / "up01").glob("*.parquet"): os.remove(filename) mocker.patch.object(BuildStockBatchBase, "weather_dir", None) diff --git a/buildstockbatch/test/test_validation.py b/buildstockbatch/test/test_validation.py index 92434751..ee915940 100644 --- a/buildstockbatch/test/test_validation.py +++ b/buildstockbatch/test/test_validation.py @@ -34,9 +34,7 @@ here = os.path.dirname(os.path.abspath(__file__)) example_yml_dir = os.path.join(here, "test_inputs") -resources_dir = os.path.join( - here, "test_inputs", "test_openstudio_buildstock", "resources" -) +resources_dir = os.path.join(here, "test_inputs", "test_openstudio_buildstock", "resources") def filter_logs(logs, level): @@ -69,15 +67,11 @@ def test_aws_batch_validation_is_static(): def test_complete_schema_passes_validation(): - assert BuildStockBatchBase.validate_project_schema( - os.path.join(example_yml_dir, "complete-schema.yml") - ) + assert BuildStockBatchBase.validate_project_schema(os.path.join(example_yml_dir, "complete-schema.yml")) def test_minimal_schema_passes_validation(): - assert BuildStockBatchBase.validate_project_schema( - os.path.join(example_yml_dir, "minimal-schema.yml") - ) + assert BuildStockBatchBase.validate_project_schema(os.path.join(example_yml_dir, "minimal-schema.yml")) @pytest.mark.parametrize( @@ -135,13 +129,9 @@ def test_xor_violations_fail(project_file, expected): ) def test_validation_integration(project_file, base_expected, eagle_expected): # patch the validate_options_lookup function to always return true for this case - with patch.object( - BuildStockBatchBase, "validate_options_lookup", lambda _: True - ), patch.object( + with patch.object(BuildStockBatchBase, "validate_options_lookup", lambda _: True), patch.object( BuildStockBatchBase, "validate_measure_references", lambda _: True - ), patch.object( - BuildStockBatchBase, "validate_workflow_generator", lambda _: True - ), patch.object( + ), patch.object(BuildStockBatchBase, "validate_workflow_generator", lambda _: True), patch.object( BuildStockBatchBase, "validate_postprocessing_spec", lambda _: True ), patch.object( SlurmBatch, "validate_apptainer_image_hpc", lambda _: True @@ -192,14 +182,10 @@ def test_bad_measures(project_file): except (ValidationError, YamaleError) as er: er = str(er) assert "'1.5' is not a int" in er - assert ( - "'huorly' not in ('none', 'timestep', 'hourly', 'daily', 'monthly')" - in er - ) + assert "'huorly' not in ('none', 'timestep', 'hourly', 'daily', 'monthly')" in er else: raise Exception( - "measures_and_arguments was supposed to raise ValidationError for" - " enforce-validate-measures-bad.yml" + "measures_and_arguments was supposed to raise ValidationError for" " enforce-validate-measures-bad.yml" ) @@ -207,9 +193,7 @@ def test_bad_measures(project_file): "project_file", [ os.path.join(example_yml_dir, "enforce-validate-measures-good-2.yml"), - os.path.join( - example_yml_dir, "enforce-validate-measures-good-2-with-anchors.yml" - ), + os.path.join(example_yml_dir, 
"enforce-validate-measures-good-2-with-anchors.yml"), ], ) def test_good_measures(project_file): @@ -274,9 +258,7 @@ def test_bad_options_validation(project_file): assert "Floor Insulation: '*' cannot be mixed with other options" in er else: - raise Exception( - "validate_options was supposed to raise ValueError for enforce-validate-options-bad.yml" - ) + raise Exception("validate_options was supposed to raise ValueError for enforce-validate-options-bad.yml") @pytest.mark.parametrize( @@ -307,8 +289,7 @@ def test_bad_measures_validation(project_file): else: raise Exception( - "validate_measure_references was supposed to raise ValueError for " - "enforce-validate-measures-bad.yml" + "validate_measure_references was supposed to raise ValueError for " "enforce-validate-measures-bad.yml" ) @@ -325,14 +306,10 @@ def test_bad_postprocessing_spec_validation(project_file): er = str(er) assert "bad_partition_column" in er else: - raise Exception( - "validate_options was supposed to raise ValidationError for enforce-validate-options-bad-2.yml" - ) + raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-bad-2.yml") -@pytest.mark.parametrize( - "project_file", [os.path.join(example_yml_dir, "enforce-validate-options-good.yml")] -) +@pytest.mark.parametrize("project_file", [os.path.join(example_yml_dir, "enforce-validate-options-good.yml")]) def test_logic_validation_fail(project_file): try: BuildStockBatchBase.validate_logic(project_file) @@ -342,9 +319,7 @@ def test_logic_validation_fail(project_file): assert "'Vintage' occurs 2 times in a 'and' block" in er assert "'Vintage' occurs 2 times in a '&&' block" in er else: - raise Exception( - "validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml" - ) + raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml") @pytest.mark.parametrize( @@ -360,9 +335,7 @@ def test_number_of_options_apply_upgrade(): proj_filename = resstock_directory / "project_national" / "national_upgrades.yml" cfg = get_project_configuration(str(proj_filename)) cfg["upgrades"][-1]["options"] = cfg["upgrades"][-1]["options"] * 10 - cfg["upgrades"][0]["options"][0]["costs"] = ( - cfg["upgrades"][0]["options"][0]["costs"] * 5 - ) + cfg["upgrades"][0]["options"][0]["costs"] = cfg["upgrades"][0]["options"][0]["costs"] * 5 with tempfile.TemporaryDirectory() as tmpdir: tmppath = pathlib.Path(tmpdir) new_proj_filename = tmppath / "project.yml" @@ -464,15 +437,11 @@ def test_validate_apptainer_image(): temp_yml = pathlib.Path(tmpdir, "temp.yml") with open(temp_yml, "w") as f: yaml.dump(cfg, f, Dumper=yaml.SafeDumper) - with pytest.raises( - ValidationError, - match=r"Could not find apptainer image: .+\.sif or .+\.simg", - ): + with pytest.raises(ValidationError, match=r"Could not find apptainer image: .+\.sif or .+\.simg"): SlurmBatch.validate_apptainer_image_hpc(str(temp_yml)) for ext in ["Apptainer.sif", "Singularity.simg"]: filename = pathlib.Path( - tmpdir, - f"OpenStudio-{SlurmBatch.DEFAULT_OS_VERSION}.{SlurmBatch.DEFAULT_OS_SHA}-{ext}", + tmpdir, f"OpenStudio-{SlurmBatch.DEFAULT_OS_VERSION}.{SlurmBatch.DEFAULT_OS_SHA}-{ext}" ) filename.touch() SlurmBatch.validate_apptainer_image_hpc(str(temp_yml)) @@ -484,11 +453,7 @@ def test_validate_sampler_good_buildstock(basic_residential_project_file): { "sampler": { "type": "precomputed", - "args": { - "sample_file": str( - os.path.join(resources_dir, "buildstock_good.csv") - ) - }, + "args": {"sample_file": 
str(os.path.join(resources_dir, "buildstock_good.csv"))}, } } ) @@ -500,11 +465,7 @@ def test_validate_sampler_bad_buildstock(basic_residential_project_file): { "sampler": { "type": "precomputed", - "args": { - "sample_file": str( - os.path.join(resources_dir, "buildstock_bad.csv") - ) - }, + "args": {"sample_file": str(os.path.join(resources_dir, "buildstock_bad.csv"))}, } } ) @@ -512,27 +473,10 @@ def test_validate_sampler_bad_buildstock(basic_residential_project_file): BuildStockBatchBase.validate_sampler(project_filename) except ValidationError as er: er = str(er) - assert ( - "Option 1940-1950 in column Vintage of buildstock_csv is not available in options_lookup.tsv" - in er - ) - assert ( - "Option TX in column State of buildstock_csv is not available in options_lookup.tsv" - in er - ) - assert ( - "Option nan in column Insulation Wall of buildstock_csv is not available in options_lookup.tsv" - in er - ) - assert ( - "Column Insulation in buildstock_csv is not available in options_lookup.tsv" - in er - ) - assert ( - "Column ZipPlusCode in buildstock_csv is not available in options_lookup.tsv" - in er - ) + assert "Option 1940-1950 in column Vintage of buildstock_csv is not available in options_lookup.tsv" in er + assert "Option TX in column State of buildstock_csv is not available in options_lookup.tsv" in er + assert "Option nan in column Insulation Wall of buildstock_csv is not available in options_lookup.tsv" in er + assert "Column Insulation in buildstock_csv is not available in options_lookup.tsv" in er + assert "Column ZipPlusCode in buildstock_csv is not available in options_lookup.tsv" in er else: - raise Exception( - "validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml" - ) + raise Exception("validate_options was supposed to raise ValidationError for enforce-validate-options-good.yml") diff --git a/buildstockbatch/utils.py b/buildstockbatch/utils.py index b975959b..e9453b38 100644 --- a/buildstockbatch/utils.py +++ b/buildstockbatch/utils.py @@ -46,16 +46,12 @@ def get_project_configuration(project_file): raise err # Set absolute paths - cfg["buildstock_directory"] = path_rel_to_file( - project_file, cfg["buildstock_directory"] - ) + cfg["buildstock_directory"] = path_rel_to_file(project_file, cfg["buildstock_directory"]) # if 'precomputed_sample' in cfg.get('baseline', {}): # cfg['baseline']['precomputed_sample'] = \ # path_rel_to_file(project_file, cfg['baseline']['precomputed_sample']) if "weather_files_path" in cfg: - cfg["weather_files_path"] = path_rel_to_file( - project_file, cfg["weather_files_path"] - ) + cfg["weather_files_path"] = path_rel_to_file(project_file, cfg["weather_files_path"]) return cfg @@ -69,35 +65,20 @@ def _str_repr(obj, list_max=20, dict_max=20, string_max=100): elif type(obj) in [int, float]: return _str_repr(str(obj), list_max, dict_max, string_max) elif type(obj) is list: - txt = "[" + ",".join( - [ - _str_repr(item, list_max, dict_max, string_max) - for item in obj[0:list_max] - ] - ) + txt = "[" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:list_max]]) if len(obj) > list_max: txt += f" ...{len(obj)}" txt += "]" return txt elif type(obj) is tuple: - txt = "(" + ",".join( - [ - _str_repr(item, list_max, dict_max, string_max) - for item in obj[0:list_max] - ] - ) + txt = "(" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:list_max]]) if len(obj) > list_max: txt += f" ...{len(obj)}" txt += ")" return txt elif type(obj) is set: obj = list(obj) 
- txt = "{" + ",".join( - [ - _str_repr(item, list_max, dict_max, string_max) - for item in obj[0:dict_max] - ] - ) + txt = "{" + ",".join([_str_repr(item, list_max, dict_max, string_max) for item in obj[0:dict_max]]) if len(obj) > dict_max: txt += f" ...{len(obj)}" txt += "}" diff --git a/buildstockbatch/workflow_generator/commercial.py b/buildstockbatch/workflow_generator/commercial.py index 2fff78a2..6495acfe 100644 --- a/buildstockbatch/workflow_generator/commercial.py +++ b/buildstockbatch/workflow_generator/commercial.py @@ -49,9 +49,7 @@ def validate(cls, cfg): workflow_generator_args = cfg["workflow_generator"]["args"] schema_yml = re.sub(r"^ {8}", "", schema_yml, flags=re.MULTILINE) schema = yamale.make_schema(content=schema_yml, parser="ruamel") - data = yamale.make_data( - content=json.dumps(workflow_generator_args), parser="ruamel" - ) + data = yamale.make_data(content=json.dumps(workflow_generator_args), parser="ruamel") return yamale.validate(schema, data, strict=True) def reporting_measures(self): @@ -116,17 +114,11 @@ def create_osw(self, sim_id, building_id, upgrade_idx): "arguments": {"run_measure": 1}, } if "upgrade_name" in measure_d: - apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d[ - "upgrade_name" - ] + apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d["upgrade_name"] for opt_num, option in enumerate(measure_d["options"], 1): - apply_upgrade_measure["arguments"][ - "option_{}".format(opt_num) - ] = option["option"] + apply_upgrade_measure["arguments"]["option_{}".format(opt_num)] = option["option"] if "lifetime" in option: - apply_upgrade_measure["arguments"][ - "option_{}_lifetime".format(opt_num) - ] = option["lifetime"] + apply_upgrade_measure["arguments"]["option_{}_lifetime".format(opt_num)] = option["lifetime"] if "apply_logic" in option: apply_upgrade_measure["arguments"][ "option_{}_apply_logic".format(opt_num) @@ -139,9 +131,9 @@ def create_osw(self, sim_id, building_id, upgrade_idx): "option_{}_cost_{}_{}".format(opt_num, cost_num, arg) ] = cost[arg] if "package_apply_logic" in measure_d: - apply_upgrade_measure["arguments"][ - "package_apply_logic" - ] = self.make_apply_logic_arg(measure_d["package_apply_logic"]) + apply_upgrade_measure["arguments"]["package_apply_logic"] = self.make_apply_logic_arg( + measure_d["package_apply_logic"] + ) build_existing_model_idx = list( map( diff --git a/buildstockbatch/workflow_generator/residential_hpxml.py b/buildstockbatch/workflow_generator/residential_hpxml.py index ee71b6a1..71cab179 100644 --- a/buildstockbatch/workflow_generator/residential_hpxml.py +++ b/buildstockbatch/workflow_generator/residential_hpxml.py @@ -145,18 +145,14 @@ def validate(cls, cfg): workflow_generator_args = cfg["workflow_generator"]["args"] schema_yml = re.sub(r"^ {8}", "", schema_yml, flags=re.MULTILINE) schema = yamale.make_schema(content=schema_yml, parser="ruamel") - data = yamale.make_data( - content=json.dumps(workflow_generator_args), parser="ruamel" - ) + data = yamale.make_data(content=json.dumps(workflow_generator_args), parser="ruamel") yamale.validate(schema, data, strict=True) return cls.validate_measures_and_arguments(cfg) def reporting_measures(self): """Return a list of reporting measures to include in outputs""" workflow_args = self.cfg["workflow_generator"].get("args", {}) - return [ - x["measure_dir_name"] for x in workflow_args.get("reporting_measures", []) - ] + return [x["measure_dir_name"] for x in workflow_args.get("reporting_measures", [])] @staticmethod def 
validate_measures_and_arguments(cfg): @@ -195,9 +191,7 @@ def get_cfg_path(cfg_path): workflow_args = cfg["workflow_generator"].get("args", {}) if "reporting_measures" in workflow_args.keys(): for reporting_measure in workflow_args["reporting_measures"]: - measure_names[ - reporting_measure["measure_dir_name"] - ] = "workflow_generator.args.reporting_measures" + measure_names[reporting_measure["measure_dir_name"]] = "workflow_generator.args.reporting_measures" error_msgs = "" warning_msgs = "" @@ -230,9 +224,7 @@ def get_cfg_path(cfg_path): error_msgs += "* The following multipliers values are invalid: \n" for multiplier, count in invalid_multipliers.items(): error_msgs += f" '{multiplier}' - Used {count} times \n" - error_msgs += ( - f" The list of valid multipliers are {valid_multipliers}.\n" - ) + error_msgs += f" The list of valid multipliers are {valid_multipliers}.\n" if warning_msgs: logger.warning(warning_msgs) @@ -274,8 +266,7 @@ def create_osw(self, sim_id, building_id, upgrade_idx): bld_exist_model_args = { "building_id": building_id, - "sample_weight": self.cfg["baseline"]["n_buildings_represented"] - / self.n_datapoints, + "sample_weight": self.cfg["baseline"]["n_buildings_represented"] / self.n_datapoints, } bld_exist_model_args.update(sim_ctl_args) @@ -298,16 +289,12 @@ def create_osw(self, sim_id, building_id, upgrade_idx): ["emissions_wood_values", "wood_value"], ] for arg, item in emissions_map: - bld_exist_model_args[arg] = ",".join( - [str(s.get(item, "")) for s in emissions] - ) + bld_exist_model_args[arg] = ",".join([str(s.get(item, "")) for s in emissions]) buildstock_dir = self.cfg["buildstock_directory"] measures_dir = os.path.join(buildstock_dir, "measures") measure_path = os.path.join(measures_dir, "BuildExistingModel") - bld_exist_model_args_avail = get_measure_arguments( - os.path.join(measure_path, "measure.xml") - ) + bld_exist_model_args_avail = get_measure_arguments(os.path.join(measure_path, "measure.xml")) if "utility_bills" in workflow_args: utility_bills = workflow_args["utility_bills"] @@ -346,9 +333,7 @@ def create_osw(self, sim_id, building_id, upgrade_idx): ] for arg, item in utility_bills_map: if arg in bld_exist_model_args_avail: - bld_exist_model_args[arg] = ",".join( - [str(s.get(item, "")) for s in utility_bills] - ) + bld_exist_model_args[arg] = ",".join([str(s.get(item, "")) for s in utility_bills]) sim_out_rep_args = { "timeseries_frequency": "none", @@ -371,9 +356,7 @@ def create_osw(self, sim_id, building_id, upgrade_idx): measures_dir = os.path.join(buildstock_dir, "resources/hpxml-measures") measure_path = os.path.join(measures_dir, "ReportSimulationOutput") - sim_out_rep_args_avail = get_measure_arguments( - os.path.join(measure_path, "measure.xml") - ) + sim_out_rep_args_avail = get_measure_arguments(os.path.join(measure_path, "measure.xml")) if "include_annual_total_consumptions" in sim_out_rep_args_avail: sim_out_rep_args["include_annual_total_consumptions"] = True @@ -436,18 +419,14 @@ def create_osw(self, sim_id, building_id, upgrade_idx): if "output_variables" in sim_out_rep_args: output_variables = sim_out_rep_args["output_variables"] - sim_out_rep_args["user_output_variables"] = ",".join( - [str(s.get("name")) for s in output_variables] - ) + sim_out_rep_args["user_output_variables"] = ",".join([str(s.get("name")) for s in output_variables]) sim_out_rep_args.pop("output_variables") util_bills_rep_args = {} measures_dir = os.path.join(buildstock_dir, "resources/hpxml-measures") measure_path = os.path.join(measures_dir, 
"ReportUtilityBills") - util_bills_rep_args_avail = get_measure_arguments( - os.path.join(measure_path, "measure.xml") - ) + util_bills_rep_args_avail = get_measure_arguments(os.path.join(measure_path, "measure.xml")) if "include_annual_bills" in util_bills_rep_args_avail: util_bills_rep_args["include_annual_bills"] = True @@ -538,17 +517,11 @@ def create_osw(self, sim_id, building_id, upgrade_idx): "arguments": {"run_measure": 1}, } if "upgrade_name" in measure_d: - apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d[ - "upgrade_name" - ] + apply_upgrade_measure["arguments"]["upgrade_name"] = measure_d["upgrade_name"] for opt_num, option in enumerate(measure_d["options"], 1): - apply_upgrade_measure["arguments"][ - "option_{}".format(opt_num) - ] = option["option"] + apply_upgrade_measure["arguments"]["option_{}".format(opt_num)] = option["option"] if "lifetime" in option: - apply_upgrade_measure["arguments"][ - "option_{}_lifetime".format(opt_num) - ] = option["lifetime"] + apply_upgrade_measure["arguments"]["option_{}_lifetime".format(opt_num)] = option["lifetime"] if "apply_logic" in option: apply_upgrade_measure["arguments"][ "option_{}_apply_logic".format(opt_num) @@ -561,13 +534,11 @@ def create_osw(self, sim_id, building_id, upgrade_idx): "option_{}_cost_{}_{}".format(opt_num, cost_num, arg) ] = cost[arg] if "package_apply_logic" in measure_d: - apply_upgrade_measure["arguments"][ - "package_apply_logic" - ] = self.make_apply_logic_arg(measure_d["package_apply_logic"]) + apply_upgrade_measure["arguments"]["package_apply_logic"] = self.make_apply_logic_arg( + measure_d["package_apply_logic"] + ) - build_existing_model_idx = [ - x["measure_dir_name"] == "BuildExistingModel" for x in osw["steps"] - ].index(True) + build_existing_model_idx = [x["measure_dir_name"] == "BuildExistingModel" for x in osw["steps"]].index(True) osw["steps"].insert(build_existing_model_idx + 1, apply_upgrade_measure) if "reporting_measures" in workflow_args: @@ -575,8 +546,6 @@ def create_osw(self, sim_id, building_id, upgrade_idx): if "arguments" not in reporting_measure: reporting_measure["arguments"] = {} reporting_measure["measure_type"] = "ReportingMeasure" - osw["steps"].insert( - -1, reporting_measure - ) # right before ServerDirectoryCleanup + osw["steps"].insert(-1, reporting_measure) # right before ServerDirectoryCleanup return osw diff --git a/buildstockbatch/workflow_generator/test_workflow_generator.py b/buildstockbatch/workflow_generator/test_workflow_generator.py index 9a49eaea..bd61c46a 100644 --- a/buildstockbatch/workflow_generator/test_workflow_generator.py +++ b/buildstockbatch/workflow_generator/test_workflow_generator.py @@ -12,14 +12,10 @@ def test_apply_logic_recursion(): apply_logic = WorkflowGeneratorBase.make_apply_logic_arg(["one", "two", "three"]) assert apply_logic == "(one&&two&&three)" - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg( - {"and": ["one", "two", "three"]} - ) + apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({"and": ["one", "two", "three"]}) assert apply_logic == "(one&&two&&three)" - apply_logic = WorkflowGeneratorBase.make_apply_logic_arg( - {"or": ["four", "five", "six"]} - ) + apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({"or": ["four", "five", "six"]}) assert apply_logic == "(four||five||six)" apply_logic = WorkflowGeneratorBase.make_apply_logic_arg({"not": "seven"}) @@ -76,36 +72,11 @@ def test_residential_hpxml(mocker): build_existing_model_step = steps[0] assert build_existing_model_step["measure_dir_name"] == 
"BuildExistingModel" - assert ( - build_existing_model_step["arguments"][ - "simulation_control_run_period_begin_month" - ] - == 2 - ) - assert ( - build_existing_model_step["arguments"][ - "simulation_control_run_period_begin_day_of_month" - ] - == 1 - ) - assert ( - build_existing_model_step["arguments"][ - "simulation_control_run_period_end_month" - ] - == 2 - ) - assert ( - build_existing_model_step["arguments"][ - "simulation_control_run_period_end_day_of_month" - ] - == 28 - ) - assert ( - build_existing_model_step["arguments"][ - "simulation_control_run_period_calendar_year" - ] - == 2010 - ) + assert build_existing_model_step["arguments"]["simulation_control_run_period_begin_month"] == 2 + assert build_existing_model_step["arguments"]["simulation_control_run_period_begin_day_of_month"] == 1 + assert build_existing_model_step["arguments"]["simulation_control_run_period_end_month"] == 2 + assert build_existing_model_step["arguments"]["simulation_control_run_period_end_day_of_month"] == 28 + assert build_existing_model_step["arguments"]["simulation_control_run_period_calendar_year"] == 2010 apply_upgrade_step = steps[1] assert apply_upgrade_step["measure_dir_name"] == "ApplyUpgrade" @@ -116,25 +87,13 @@ def test_residential_hpxml(mocker): simulation_output_step = steps[3] assert simulation_output_step["measure_dir_name"] == "ReportSimulationOutput" assert simulation_output_step["arguments"]["timeseries_frequency"] == "hourly" - assert ( - simulation_output_step["arguments"]["include_annual_total_consumptions"] is True - ) - assert ( - simulation_output_step["arguments"]["include_annual_fuel_consumptions"] is True - ) - assert ( - simulation_output_step["arguments"]["include_annual_end_use_consumptions"] - is True - ) - assert ( - simulation_output_step["arguments"]["include_annual_system_use_consumptions"] - is False - ) + assert simulation_output_step["arguments"]["include_annual_total_consumptions"] is True + assert simulation_output_step["arguments"]["include_annual_fuel_consumptions"] is True + assert simulation_output_step["arguments"]["include_annual_end_use_consumptions"] is True + assert simulation_output_step["arguments"]["include_annual_system_use_consumptions"] is False assert simulation_output_step["arguments"]["include_annual_emissions"] is True assert simulation_output_step["arguments"]["include_annual_emission_fuels"] is True - assert ( - simulation_output_step["arguments"]["include_annual_emission_end_uses"] is True - ) + assert simulation_output_step["arguments"]["include_annual_emission_end_uses"] is True assert simulation_output_step["arguments"]["include_annual_total_loads"] is True assert simulation_output_step["arguments"]["include_annual_unmet_hours"] is True assert simulation_output_step["arguments"]["include_annual_peak_fuels"] is True @@ -143,55 +102,22 @@ def test_residential_hpxml(mocker): assert simulation_output_step["arguments"]["include_annual_hot_water_uses"] is True assert simulation_output_step["arguments"]["include_annual_hvac_summary"] is True assert simulation_output_step["arguments"]["include_annual_resilience"] is True - assert ( - simulation_output_step["arguments"]["include_timeseries_total_consumptions"] - is True - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_fuel_consumptions"] - is False - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_end_use_consumptions"] - is True - ) - assert ( - simulation_output_step["arguments"][ - "include_timeseries_system_use_consumptions" - ] - is False - ) + 
assert simulation_output_step["arguments"]["include_timeseries_total_consumptions"] is True + assert simulation_output_step["arguments"]["include_timeseries_fuel_consumptions"] is False + assert simulation_output_step["arguments"]["include_timeseries_end_use_consumptions"] is True + assert simulation_output_step["arguments"]["include_timeseries_system_use_consumptions"] is False assert simulation_output_step["arguments"]["include_timeseries_emissions"] is False - assert ( - simulation_output_step["arguments"]["include_timeseries_emission_fuels"] - is False - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_emission_end_uses"] - is False - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_hot_water_uses"] - is False - ) + assert simulation_output_step["arguments"]["include_timeseries_emission_fuels"] is False + assert simulation_output_step["arguments"]["include_timeseries_emission_end_uses"] is False + assert simulation_output_step["arguments"]["include_timeseries_hot_water_uses"] is False assert simulation_output_step["arguments"]["include_timeseries_total_loads"] is True - assert ( - simulation_output_step["arguments"]["include_timeseries_component_loads"] - is False - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_unmet_hours"] is False - ) - assert ( - simulation_output_step["arguments"]["include_timeseries_zone_temperatures"] - is False - ) + assert simulation_output_step["arguments"]["include_timeseries_component_loads"] is False + assert simulation_output_step["arguments"]["include_timeseries_unmet_hours"] is False + assert simulation_output_step["arguments"]["include_timeseries_zone_temperatures"] is False assert simulation_output_step["arguments"]["include_timeseries_airflows"] is False assert simulation_output_step["arguments"]["include_timeseries_weather"] is False assert simulation_output_step["arguments"]["include_timeseries_resilience"] is False - assert ( - simulation_output_step["arguments"]["timeseries_timestamp_convention"] == "end" - ) + assert simulation_output_step["arguments"]["timeseries_timestamp_convention"] == "end" assert simulation_output_step["arguments"]["timeseries_num_decimal_places"] == 3 assert simulation_output_step["arguments"]["add_timeseries_dst_column"] is True assert simulation_output_step["arguments"]["add_timeseries_utc_column"] is True @@ -333,9 +259,7 @@ def test_com_default_workflow_generator_extended(mocker): assert reporting_measure_step["measure_type"] == "ReportingMeasure" assert reporting_measure_step["arguments"] == {} # Should only be one instance of SimulationOutputReport - assert [ - d["measure_dir_name"] == "SimulationOutputReport" for d in osw["steps"] - ].count(True) == 1 + assert [d["measure_dir_name"] == "SimulationOutputReport" for d in osw["steps"]].count(True) == 1 # Should get TimeseriesCSVExport if included in args reporting_measure_step = osw["steps"][1] assert reporting_measure_step["measure_dir_name"] == "TimeseriesCSVExport" @@ -344,10 +268,7 @@ def test_com_default_workflow_generator_extended(mocker): assert reporting_measure_step["arguments"]["inc_output_variables"] == "true" # Should have the openstudio report reporting_measure_step = osw["steps"][2] - assert ( - reporting_measure_step["measure_dir_name"] - == "f8e23017-894d-4bdf-977f-37e3961e6f42" - ) + assert reporting_measure_step["measure_dir_name"] == "f8e23017-894d-4bdf-977f-37e3961e6f42" assert reporting_measure_step["measure_type"] == "ReportingMeasure" assert 
reporting_measure_step["arguments"]["building_summary_section"] == "true" assert reporting_measure_step["arguments"]["schedules_overview_section"] == "true" diff --git a/docs/conf.py b/docs/conf.py index 45c44c52..94ca7931 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,9 +20,7 @@ here = os.path.abspath(os.path.dirname(__file__)) metadata = {} -with open( - os.path.join(here, "..", "buildstockbatch", "__version__.py"), "r", encoding="utf-8" -) as f: +with open(os.path.join(here, "..", "buildstockbatch", "__version__.py"), "r", encoding="utf-8") as f: exec(f.read(), metadata) # -- Project information ----------------------------------------------------- @@ -75,9 +73,7 @@ # how to render changelog links changelog_render_ticket = "http://www.github.com/nrel/buildstockbatch/issues/%s" -changelog_render_pullreq = { - "default": "https://www.github.com/nrel/buildstockbatch/pull/%s" -} +changelog_render_pullreq = {"default": "https://www.github.com/nrel/buildstockbatch/pull/%s"} # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -180,9 +176,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, "buildstockbatch", "BuildStock Batch Documentation", [author], 1) -] +man_pages = [(master_doc, "buildstockbatch", "BuildStock Batch Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- diff --git a/setup.py b/setup.py index 669dd707..fd06bdc0 100644 --- a/setup.py +++ b/setup.py @@ -8,9 +8,7 @@ here = os.path.abspath(os.path.dirname(__file__)) metadata = {} -with open( - os.path.join(here, "buildstockbatch", "__version__.py"), "r", encoding="utf-8" -) as f: +with open(os.path.join(here, "buildstockbatch", "__version__.py"), "r", encoding="utf-8") as f: exec(f.read(), metadata) with open("README.md", "r", "utf-8") as f: From 7d2603cdb1431179167c844ee33ef47ac09947fc Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Tue, 6 Feb 2024 16:56:10 -0700 Subject: [PATCH 43/53] updating black in pre-commit to 24.1.1 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d5bdeb08..5fa30beb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: - id: end-of-file-fixer exclude_types: ["csv","tsv"] - repo: https://github.com/psf/black-pre-commit-mirror - rev: 23.10.1 + rev: 24.1.1 hooks: - id: black language_version: python3.11 From 49aac754290e2ed3cab6518f42117f0eaeb82853 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Tue, 6 Feb 2024 16:57:08 -0700 Subject: [PATCH 44/53] fixing black for really reals --- buildstockbatch/workflow_generator/commercial.py | 12 ++++++------ .../workflow_generator/residential_hpxml.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/buildstockbatch/workflow_generator/commercial.py b/buildstockbatch/workflow_generator/commercial.py index 6495acfe..b0b1c4cd 100644 --- a/buildstockbatch/workflow_generator/commercial.py +++ b/buildstockbatch/workflow_generator/commercial.py @@ -120,16 +120,16 @@ def create_osw(self, sim_id, building_id, upgrade_idx): if "lifetime" in option: apply_upgrade_measure["arguments"]["option_{}_lifetime".format(opt_num)] = option["lifetime"] if "apply_logic" in option: - apply_upgrade_measure["arguments"][ - "option_{}_apply_logic".format(opt_num) - ] = self.make_apply_logic_arg(option["apply_logic"]) + 
apply_upgrade_measure["arguments"]["option_{}_apply_logic".format(opt_num)] = ( + self.make_apply_logic_arg(option["apply_logic"]) + ) for cost_num, cost in enumerate(option.get("costs", []), 1): for arg in ("value", "multiplier"): if arg not in cost: continue - apply_upgrade_measure["arguments"][ - "option_{}_cost_{}_{}".format(opt_num, cost_num, arg) - ] = cost[arg] + apply_upgrade_measure["arguments"]["option_{}_cost_{}_{}".format(opt_num, cost_num, arg)] = ( + cost[arg] + ) if "package_apply_logic" in measure_d: apply_upgrade_measure["arguments"]["package_apply_logic"] = self.make_apply_logic_arg( measure_d["package_apply_logic"] diff --git a/buildstockbatch/workflow_generator/residential_hpxml.py b/buildstockbatch/workflow_generator/residential_hpxml.py index 71cab179..f4442bd2 100644 --- a/buildstockbatch/workflow_generator/residential_hpxml.py +++ b/buildstockbatch/workflow_generator/residential_hpxml.py @@ -523,16 +523,16 @@ def create_osw(self, sim_id, building_id, upgrade_idx): if "lifetime" in option: apply_upgrade_measure["arguments"]["option_{}_lifetime".format(opt_num)] = option["lifetime"] if "apply_logic" in option: - apply_upgrade_measure["arguments"][ - "option_{}_apply_logic".format(opt_num) - ] = self.make_apply_logic_arg(option["apply_logic"]) + apply_upgrade_measure["arguments"]["option_{}_apply_logic".format(opt_num)] = ( + self.make_apply_logic_arg(option["apply_logic"]) + ) for cost_num, cost in enumerate(option.get("costs", []), 1): for arg in ("value", "multiplier"): if arg not in cost: continue - apply_upgrade_measure["arguments"][ - "option_{}_cost_{}_{}".format(opt_num, cost_num, arg) - ] = cost[arg] + apply_upgrade_measure["arguments"]["option_{}_cost_{}_{}".format(opt_num, cost_num, arg)] = ( + cost[arg] + ) if "package_apply_logic" in measure_d: apply_upgrade_measure["arguments"]["package_apply_logic"] = self.make_apply_logic_arg( measure_d["package_apply_logic"] From 1f4d8e528bace5fe59959b8872e0cca047ddb7c0 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Tue, 6 Feb 2024 16:59:11 -0700 Subject: [PATCH 45/53] pinning black to ~= 24.0 in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fd06bdc0..cd48c8a6 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ "sphinx_paramlinks", "changelog", "flake8", - "black", + "black~=24.0", "rope", "doc8", "pre-commit", From a254c19dc41b4886dcf6892b9aad4a42a04226b6 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Thu, 8 Feb 2024 15:12:37 -0700 Subject: [PATCH 46/53] fixing testing --- buildstockbatch/aws/aws.py | 16 ++++++---------- buildstockbatch/base.py | 4 ++-- buildstockbatch/test/test_aws.py | 8 -------- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index ea0a5e2a..0d602931 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -17,8 +17,6 @@ import csv from dask.distributed import Client from dask_cloudprovider.aws import FargateCluster -import docker -from fsspec.implementations.local import LocalFileSystem import gzip from joblib import Parallel, delayed import json @@ -27,6 +25,7 @@ import pathlib import random from s3fs import S3FileSystem +import shutil import tarfile import re import tempfile @@ -35,12 +34,10 @@ import io import yaml -from buildstockbatch.base import ValidationError, BuildStockBatchBase +from buildstockbatch.base import ValidationError from buildstockbatch.aws.awsbase import AwsJobBase, boto_client_config from 
buildstockbatch.cloud.docker_base import DockerBatchBase -from buildstockbatch import postprocessing from buildstockbatch.utils import ( - ContainerRuntime, log_error_details, get_project_configuration, get_bool_env_var, @@ -1074,20 +1071,19 @@ def build_image(self): raise RuntimeError(f"The needs to be run from the root of the repo, found {root_path}") # Make the buildstock/resources/.aws_docker_image dir to store logs - local_log_dir = os.path.join(self.buildstock_dir, "resources", ".aws_docker_image") + local_log_dir = pathlib.Path(self.buildstock_dir, "resources", ".aws_docker_image") if not os.path.exists(local_log_dir): os.makedirs(local_log_dir) # Determine whether or not to build the image with custom gems bundled in if self.cfg.get("baseline", dict()).get("custom_gems", False): # Ensure the custom Gemfile exists in the buildstock dir - local_gemfile_path = os.path.join(self.buildstock_dir, "resources", "Gemfile") - if not os.path.exists(local_gemfile_path): + local_gemfile_path = pathlib.Path(self.buildstock_dir, "resources", "Gemfile") + if not local_gemfile_path.exists(): raise AttributeError(f"baseline:custom_gems = True, but did not find Gemfile at {local_gemfile_path}") # Copy the custom Gemfile into the buildstockbatch repo - bsb_root = os.path.join(os.path.abspath(__file__), os.pardir, os.pardir, os.pardir) - new_gemfile_path = os.path.join(bsb_root, "Gemfile") + new_gemfile_path = root_path / "Gemfile" shutil.copyfile(local_gemfile_path, new_gemfile_path) logger.info(f"Copying custom Gemfile from {local_gemfile_path}") diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index b6ded0a6..57de30d6 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -939,5 +939,5 @@ def process_results(self, skip_combine=False, use_dask_cluster=True): if use_dask_cluster: self.cleanup_dask() - # keep_individual_timeseries = self.cfg.get('postprocessing', {}).get('keep_individual_timeseries', False) - # postprocessing.remove_intermediate_files(fs, self.results_dir, keep_individual_timeseries) + keep_individual_timeseries = self.cfg.get("postprocessing", {}).get("keep_individual_timeseries", False) + postprocessing.remove_intermediate_files(fs, self.results_dir, keep_individual_timeseries) diff --git a/buildstockbatch/test/test_aws.py b/buildstockbatch/test/test_aws.py index 322885e0..3577f613 100644 --- a/buildstockbatch/test/test_aws.py +++ b/buildstockbatch/test/test_aws.py @@ -23,10 +23,6 @@ def test_custom_gem_install(basic_residential_project_file): cfg["aws"]["s3"] = {} cfg["aws"]["s3"]["bucket"] = "resbldg-datasets" cfg["aws"]["s3"]["prefix"] = "testing/external_demo_project" - cfg["aws"]["emr"] = {} - cfg["aws"]["emr"]["manager_instance_type"] = "m5.xlarge" - cfg["aws"]["emr"]["worker_instance_type"] = "r5.4xlarge" - cfg["aws"]["emr"]["worker_instance_count"] = 1 cfg["aws"]["region"] = "us-west-2" cfg["aws"]["use_spot"] = True cfg["aws"]["batch_array_size"] = 100 @@ -64,10 +60,6 @@ def test_no_custom_gem_install(basic_residential_project_file): cfg["aws"]["s3"] = {} cfg["aws"]["s3"]["bucket"] = "resbldg-datasets" cfg["aws"]["s3"]["prefix"] = "testing/external_demo_project" - cfg["aws"]["emr"] = {} - cfg["aws"]["emr"]["manager_instance_type"] = "m5.xlarge" - cfg["aws"]["emr"]["worker_instance_type"] = "r5.4xlarge" - cfg["aws"]["emr"]["worker_instance_count"] = 1 cfg["aws"]["region"] = "us-west-2" cfg["aws"]["use_spot"] = True cfg["aws"]["batch_array_size"] = 100 From 906c8e0ea6cffc8cd4ecc3fd0b112d8af0ac0043 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: 
Thu, 8 Feb 2024 15:52:10 -0700 Subject: [PATCH 47/53] consolidating as per @lathanh's comment --- buildstockbatch/postprocessing.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py index 0937185c..37ed938c 100644 --- a/buildstockbatch/postprocessing.py +++ b/buildstockbatch/postprocessing.py @@ -554,11 +554,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): f"partitions which go into {ngroup} column group(s) of {partition_columns}" ) - if isinstance(fs, LocalFileSystem): - ts_out_loc = f"{ts_dir}/upgrade={upgrade_id}/" - else: - assert isinstance(fs, S3FileSystem) - ts_out_loc = f"s3://{ts_dir}/upgrade={upgrade_id}" + ts_out_loc = f"{ts_dir}/upgrade={upgrade_id}" fs.makedirs(ts_out_loc) logger.info(f"Created directory {ts_out_loc} for writing. Now concatenating ...") From a015e3f74d01a33ca65efdb9598a0016e63f9ff8 Mon Sep 17 00:00:00 2001 From: Noel Merket Date: Mon, 19 Feb 2024 15:59:49 -0700 Subject: [PATCH 48/53] switching some error checking to our backoff function --- buildstockbatch/aws/aws.py | 48 ++++++++++++-------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index 0d602931..0bcd4f5b 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -175,20 +175,11 @@ def create_vpc(self): self.vpc_id = response["Vpc"]["VpcId"] logger.info(f"VPC {self.vpc_id} created") - while True: - try: - self.ec2.create_tags( - Resources=[self.vpc_id], - Tags=self.get_tags_uppercase(Name=self.job_identifier), - ) - break - except Exception as e: - if "InvalidVpcID.NotFound" in str(e): - logger.info("Cannot tag VPC. VPC not yet created. Sleeping...") - time.sleep(5) - else: - raise - + backoff( + self.ec2.create_tags, + Resources=[self.vpc_id], + Tags=self.get_tags_uppercase(Name=self.job_identifier), + ) # Find the default security group sec_response = self.ec2.describe_security_groups( @@ -728,26 +719,17 @@ def submit_job(self, array_size=4): Submits the created job definition and version to be run. 
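(For context: the `backoff(fn, **kwargs)` calls this patch introduces wrap boto3 operations that can fail transiently right after a resource is created. A minimal sketch of such a retry helper, assuming linear delays with jitter — the actual `backoff` in buildstockbatch lives elsewhere in the codebase and differs, e.g. in which errors it treats as retryable:)

    import random
    import time

    def backoff(fn, *args, max_tries=5, base_delay=5, **kwargs):
        # Call fn, retrying with a growing delay on failure.
        for attempt in range(1, max_tries + 1):
            try:
                return fn(*args, **kwargs)
            except Exception:
                if attempt == max_tries:
                    raise
                # The real helper inspects the error before deciding to retry.
                time.sleep(base_delay * attempt + random.random())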
""" - while True: - try: - resp = self.batch.submit_job( - jobName=self.job_identifier, - jobQueue=self.batch_job_queue_name, - arrayProperties={"size": array_size}, - jobDefinition=self.job_definition_arn, - tags=self.get_tags(), - ) - - logger.info(f"Job {self.job_identifier} submitted.") - return resp + resp = backoff( + self.batch.submit_job, + jobName=self.job_identifier, + jobQueue=self.batch_job_queue_name, + arrayProperties={"size": array_size}, + jobDefinition=self.job_definition_arn, + tags=self.get_tags(), + ) - except Exception as e: - if "not in VALID state" in str(e): - # Need to wait a second for the compute environment to complete registration - logger.warning("5 second sleep initiated to wait for job queue creation due to error: " + str(e)) - time.sleep(5) - else: - raise + logger.info(f"Job {self.job_identifier} submitted.") + return resp def clean(self): # Get our vpc: From 863e92790294e54f7bc1dedb7e28856d9d072e9e Mon Sep 17 00:00:00 2001 From: Natalie Weires Date: Wed, 6 Mar 2024 13:42:56 -0500 Subject: [PATCH 49/53] Add script to sample a single location (#70) --- buildstockbatch/sample_one_county.py | 215 +++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 buildstockbatch/sample_one_county.py diff --git a/buildstockbatch/sample_one_county.py b/buildstockbatch/sample_one_county.py new file mode 100644 index 00000000..f2550fcd --- /dev/null +++ b/buildstockbatch/sample_one_county.py @@ -0,0 +1,215 @@ +"""Runs the residental quota sampler for a single county+PUMA. + +Usage: + python3 sample_one_county.py --help + + python3 sample_one_county.py G1900030 G19001800 100,200 path/to/resstock path/to/output_dir + + - Generates two files where every building has county=G1900030 and PUMA=G19001800: + path/to/output_dir/buildstock_G1900030_G19001800_100.csv with 100 samples + path/to/output_dir/buildstock_G1900030_G19001800_200.csv with 200 samples + +Methodology: + This modifies the conditional probability distributions from the standard ResStock national project + to create a sample limited to a single county+PUMA. (For example, the selected location may normally + be used for 1% of buildings in a national sample, but we update it to get 100% of buildings while + every other location gets 0%.) + + To do this, we modify two files: + - ASHRAE IECC Climate Zone 2004.tsv + - Make 100% of the samples fall into the climate zone of the selected location. + - County and PUMA.tsv + - Make 100% of samples (within the chosen climate zone) fall into the selected county + PUMA + + All other housing characteristics are downstream of these (or don't depend on them) and are unchanged. + +Assumptions: + This logic is only guaranteed to work for the current ResStock national project. Other changes + to the dependencies between the variables can break it! + + In particular, this code assumes: + - ASHRAE climate zone has no dependencies + - County and PUMA depends only on the ASHRAE climate zone + - Each County+PUMA fall entirely in one climate zone +""" +import argparse +import csv +import os +import shutil +import tempfile + +from buildstockbatch.utils import ContainerRuntime +from sampler import residential_quota + + +class SampleOnly: + CONTAINER_RUNTIME = ContainerRuntime.DOCKER + + def __init__(self, buildstock_dir, output_dir): + # Sampler uses this to find the sampling scripts + self.buildstock_dir = os.path.abspath(buildstock_dir) + + # ResStock national project. 
Could use a different project, but `County and PUMA.tsv` and + # `ASHRAE IECC Climate Zone 2004.tsv` must exist in the expected format. + self.project_dir = os.path.join(self.buildstock_dir, "project_national") + + # Directory containing the conditional probability distributions we plan to modify + self.housing_characteristics_dir = os.path.join(self.project_dir, "housing_characteristics") + self.output_dir = output_dir + os.makedirs(output_dir, exist_ok=True) + + @property + def docker_image(self): + return "nrel/openstudio:{}".format(self.os_version) + + @property + def os_version(self): + return "3.7.0" + + @property + def project_filename(self): + """Sampler expects this property to exist, but it can be None.""" + return None + + def get_climate_zone(self, county, PUMA): + """Given a county and PUMA, find the climate zone that contains them. + + :param county: GISJOIN ID of county (e.g. "G1900030") + :param PUMA: GISJOIN ID of PUMA (e.g. "G19001800") + + :return: Climate zone string (e.g. "3A") + """ + with open(os.path.join(self.housing_characteristics_dir, "County and PUMA.tsv")) as f: + reader = csv.reader(f, delimiter="\t") + headers = next(reader) + # Index of the column with the county and PUMA we're looking for. + try: + location_col = headers.index(f"Option={county}, {PUMA}") + except ValueError as e: + raise ValueError(f"Could not find 'Option={county}, {PUMA}' column in 'County and PUMA.tsv'") from e + + zone = None + for row in reader: + # Skip comments + if row[0].strip()[0] == "#": + continue + + # Find the zone with a non-zero chance of producing this county + PUMA + if row[location_col] != "0": + if zone: + raise ValueError(f"Found multiple climate zones for {county}, {PUMA}") + zone = row[0] + + if not zone: + raise ValueError(f"No climate zone found for {county}, {PUMA}") + return zone + + def run_sampler(self, county, PUMA, n_samples): + """ + Create the requested number of buildings, all contained in the given county and PUMA. + + This function: + - Updates the conditional probability distributions for climate zone and county + PUMA. + - Runs the ResidentialQuotaSampler. + - Renames and copies the resulting building.csv file into the output directory. + + :param county: GISJOIN ID of county (e.g. "G1900030") + :param PUMA: GISJOIN ID of PUMA (e.g. "G19001800") + :param n_samples: Number of building samples to produce. + """ + + climate_zone = self.get_climate_zone(county, PUMA) + # Create a new copy of the probability distribution TSV files, so we can change them without + # affecting the originals. + with tempfile.TemporaryDirectory(prefix="sampling_", dir=self.buildstock_dir) as tmpdir: + temp_housing_characteristics_dir = os.path.join(tmpdir, "housing_characteristics") + shutil.copytree(self.housing_characteristics_dir, temp_housing_characteristics_dir) + + # Update climate zone TSV + climate_zone_filename = "ASHRAE IECC Climate Zone 2004.tsv" + zone_tsv = os.path.join(self.housing_characteristics_dir, climate_zone_filename) + new_zone_tsv = os.path.join(temp_housing_characteristics_dir, climate_zone_filename) + with open(zone_tsv) as old_f: + reader = csv.reader(old_f, delimiter="\t") + with open(new_zone_tsv, "w") as new_f: + writer = csv.writer(new_f, delimiter="\t") + headers = next(reader) + writer.writerow(headers) + + # This file has a single row of probabilities, which we replace with 0s and a single 1. 
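(To make the rewrite concrete, a hypothetical walk-through of the row construction that follows; the column names are invented for illustration:)

    # Hypothetical example of the rewrite below:
    #   headers      = ["Option=1A", "Option=2A", "Option=3A"]
    #   climate_zone = "2A"  ->  zone_header == "Option=2A"
    #   row written  = ["0", "1", "0"]   (all probability on the chosen zone)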
From e57bc995a6e6bc05d9a8c2fc46e2ac787ea734c0 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Fri, 15 Mar 2024 10:07:27 -0600
Subject: [PATCH 50/53] fixing typo

---
 buildstockbatch/aws/aws.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py
index 0bcd4f5b..5909e35b 100644
--- a/buildstockbatch/aws/aws.py
+++ b/buildstockbatch/aws/aws.py
@@ -679,7 +679,7 @@ def create_job_queue(self):

             elif "is not valid" in str(e):
                 # Need to wait a second for the compute environment to complete registration
-                logger.warning("wating a few seconds for compute environment creation: " + str(e))
+                logger.warning("waiting a few seconds for compute environment creation: " + str(e))
                 time.sleep(5)

             else:
From 54fca58a3c72ca89b9783c3026fefb6b4949e559 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Fri, 15 Mar 2024 10:24:53 -0600
Subject: [PATCH 51/53] backoffs in the create_vpc function

---
 buildstockbatch/aws/aws.py | 96 +++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 54 deletions(-)

diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py
index 5909e35b..0b417c8d 100644
--- a/buildstockbatch/aws/aws.py
+++ b/buildstockbatch/aws/aws.py
@@ -167,7 +167,8 @@ def create_vpc(self):

         # Create the VPC

-        response = self.ec2.create_vpc(
+        response = backoff(
+            self.ec2.create_vpc,
             CidrBlock=self.vpc_cidr,
             AmazonProvidedIpv6CidrBlock=False,
             InstanceTenancy="default",
@@ -192,7 +193,8 @@ def create_vpc(self):

         logger.info(f"Security group {self.batch_security_group} created for vpc/job.")

-        response = self.ec2.authorize_security_group_ingress(
+        response = backoff(
+            self.ec2.authorize_security_group_ingress,
             GroupId=self.batch_security_group,
             IpPermissions=[
                 {
@@ -208,7 +210,8 @@ def create_vpc(self):

         # Create the private subnets

-        priv_response_1 = self.ec2.create_subnet(
+        priv_response_1 = backoff(
+            self.ec2.create_subnet,
             CidrBlock=self.priv_subnet_cidr_1,
             AvailabilityZone=f"{self.region}a",
             VpcId=self.vpc_id,
@@ -218,7 +221,8 @@ def create_vpc(self):

         logger.info("Private subnet created.")

-        priv_response_2 = self.ec2.create_subnet(
+        priv_response_2 = backoff(
+            self.ec2.create_subnet,
             CidrBlock=self.priv_subnet_cidr_2,
             AvailabilityZone=f"{self.region}b",
             VpcId=self.vpc_id,
@@ -254,7 +258,7 @@ def create_vpc(self):

         # Create the public subnet

-        pub_response = self.ec2.create_subnet(CidrBlock=self.pub_subnet_cidr, VpcId=self.vpc_id)
+        pub_response = backoff(self.ec2.create_subnet, CidrBlock=self.pub_subnet_cidr, VpcId=self.vpc_id)

         logger.info("EIP allocated.")

@@ -268,25 +272,21 @@ def create_vpc(self):

         # Create and elastic IP for the NAT Gateway

-        try:
-            ip_response = self.ec2.allocate_address(Domain="vpc")
-
-            self.nat_ip_allocation = ip_response["AllocationId"]
+        ip_response = backoff(self.ec2.allocate_address, Domain="vpc")

-            logger.info("EIP allocated.")
+        self.nat_ip_allocation = ip_response["AllocationId"]

-            self.ec2.create_tags(
-                Resources=[self.nat_ip_allocation],
-                Tags=self.get_tags_uppercase(Name=self.job_identifier),
-            )
+        logger.info("EIP allocated.")

-        except Exception as e:
-            if "AddressLimitExceeded" in str(e):
-                raise
+        backoff(
+            self.ec2.create_tags,
+            Resources=[self.nat_ip_allocation],
+            Tags=self.get_tags_uppercase(Name=self.job_identifier),
+        )

         # Create an internet gateway

-        self.ec2.attach_internet_gateway(InternetGatewayId=self.internet_gateway_id, VpcId=self.vpc_id)
+        backoff(self.ec2.attach_internet_gateway, InternetGatewayId=self.internet_gateway_id, VpcId=self.vpc_id)

         logger.info("Internet Gateway attached.")

@@ -302,26 +302,18 @@ def create_vpc(self):

         # Modify the default route table to be used as the public route

-        while True:
-            try:
-                self.ec2.create_route(
-                    DestinationCidrBlock="0.0.0.0/0",
-                    GatewayId=self.internet_gateway_id,
-                    RouteTableId=self.pub_route_table_id,
-                )
-                logger.info("Route created for Internet Gateway.")
-                break
-
-            except Exception as e:
-                if "NotFound" in str(e):
-                    time.sleep(5)
-                    logger.info("Internet Gateway not yet created. Sleeping...")
-                else:
-                    raise
+        backoff(
+            self.ec2.create_route,
+            DestinationCidrBlock="0.0.0.0/0",
+            GatewayId=self.internet_gateway_id,
+            RouteTableId=self.pub_route_table_id,
+        )

         # Create a NAT Gateway

-        nat_response = self.ec2.create_nat_gateway(AllocationId=self.nat_ip_allocation, SubnetId=self.pub_vpc_subnet_id)
+        nat_response = backoff(
+            self.ec2.create_nat_gateway, AllocationId=self.nat_ip_allocation, SubnetId=self.pub_vpc_subnet_id
+        )

         self.nat_gateway_id = nat_response["NatGateway"]["NatGatewayId"]

@@ -335,7 +327,7 @@ def create_vpc(self):

         # Create a new private route table

-        prt_response = self.ec2.create_route_table(VpcId=self.vpc_id)
+        prt_response = backoff(self.ec2.create_route_table, VpcId=self.vpc_id)

         self.priv_route_table_id = prt_response["RouteTable"]["RouteTableId"]

@@ -349,31 +341,27 @@ def create_vpc(self):

         # Associate the private route to the private subnet

-        self.ec2.associate_route_table(RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_1)
+        backoff(
+            self.ec2.associate_route_table, RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_1
+        )

         logger.info("Route table associated with subnet.")

-        self.ec2.associate_route_table(RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_2)
+        backoff(
+            self.ec2.associate_route_table, RouteTableId=self.priv_route_table_id, SubnetId=self.priv_vpc_subnet_id_2
+        )

         logger.info("Route table associated with subnet.")

         # Associate the NAT gateway with the private route

-        while True:
-            try:
-                self.ec2.create_route(
-                    DestinationCidrBlock="0.0.0.0/0",
-                    NatGatewayId=self.nat_gateway_id,
-                    RouteTableId=self.priv_route_table_id,
-                )
-                logger.info("Route created for subnet.")
-                break
-            except Exception as e:
-                if "InvalidNatGatewayID.NotFound" in str(e):
-                    time.sleep(5)
-                    logger.info("Nat Gateway not yet created. Sleeping...")
-                else:
-                    raise
+        backoff(
+            self.ec2.create_route,
+            DestinationCidrBlock="0.0.0.0/0",
+            NatGatewayId=self.nat_gateway_id,
+            RouteTableId=self.priv_route_table_id,
+        )

-        gateway_response = self.ec2.create_vpc_endpoint(
+        gateway_response = backoff(
+            self.ec2.create_vpc_endpoint,
             VpcId=self.vpc_id,
             ServiceName=f"com.amazonaws.{self.region}.s3",
             RouteTableIds=[self.priv_route_table_id, self.pub_route_table_id],
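The ``backoff`` calls above retry ``create_route`` on error strings until the NAT gateway exists. An alternative worth noting is boto3's built-in waiter for this resource. A sketch assuming a boto3 EC2 client and the IDs created earlier in ``create_vpc``; the literal IDs below are placeholders::

    import boto3

    ec2 = boto3.client("ec2")
    nat_gateway_id = "nat-0123456789abcdef0"  # placeholder for self.nat_gateway_id
    priv_route_table_id = "rtb-0123456789abcdef0"  # placeholder for self.priv_route_table_id

    # Block until the NAT gateway is usable instead of string-matching on errors.
    waiter = ec2.get_waiter("nat_gateway_available")
    waiter.wait(NatGatewayIds=[nat_gateway_id], WaiterConfig={"Delay": 5, "MaxAttempts": 60})

    ec2.create_route(
        DestinationCidrBlock="0.0.0.0/0",
        NatGatewayId=nat_gateway_id,
        RouteTableId=priv_route_table_id,
    )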
From 1cdeb5ecfeb41a5f63a2d473b5fa4001a6cfd348 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Fri, 15 Mar 2024 11:47:46 -0600
Subject: [PATCH 52/53] update docs

---
 buildstockbatch/schemas/v0.3.yaml |  7 ----
 docs/installation.rst             | 50 +++++++++++++++++++++++----
 docs/project_defn.rst             | 56 ++++++++++++++++++++-----------
 docs/run_sims.rst                 | 26 ++++++++++----
 4 files changed, 100 insertions(+), 39 deletions(-)

diff --git a/buildstockbatch/schemas/v0.3.yaml b/buildstockbatch/schemas/v0.3.yaml
index 5ccdf78c..f0ab72e2 100644
--- a/buildstockbatch/schemas/v0.3.yaml
+++ b/buildstockbatch/schemas/v0.3.yaml
@@ -26,7 +26,6 @@ aws-spec:
   spot_bid_percent: num(min=1, max=100, required=False)
   batch_array_size: num(min=1, max=10000, required=True)
   notifications_email: regex('^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$', name='email', required=True)
-  emr: include('aws-emr-spec', required=False)
   dask: include('aws-dask-spec', required=True)
   job_environment: include('aws-job-environment', required=False)
   tags: map(str(), str(), required=False)

@@ -35,12 +34,6 @@ aws-job-environment:
   vcpus: int(min=1, max=36, required=False)
   memory: int(min=1024, required=False)

-aws-emr-spec:
-  manager_instance_type: str(required=False)
-  worker_instance_type: str(required=False)
-  worker_instance_count: int(min=1, required=False)
-  dask_worker_vcores: int(min=1, required=False)
-
 aws-dask-spec:
   scheduler_cpu: enum(1024, 2048, 4096, 8192, 16384, required=False)
   scheduler_memory: int(min=1024, required=False)
diff --git a/docs/installation.rst b/docs/installation.rst
index 50cf380a..a6d19b52 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -65,6 +65,8 @@ For Windows, the process is similar.

 .. _OpenStudio release: https://github.com/NREL/OpenStudio/releases

+.. _bsb-python:
+
 BuildStockBatch Python Library
 ..............................

@@ -231,10 +233,46 @@ Amazon Web Services (Beta)

 .. warning::

-   The AWS version of buildstockbatch is currently broken. A remedy is in
-   progress. Thanks for your patience.
+   The AWS version of buildstockbatch is in active development. Use at your own
+   risk. It's provided as-is with no promise of support.
+
+Docker
+......
+
+Install either `Docker Desktop `_ or
+`Docker Engine `_ for your platform.
+
+BuildStockBatch Python Library
+..............................
+
+Install the buildstockbatch python library as described in :ref:`bsb-python` for
+the local installation. You'll need to install with the ``aws`` extra as follows.
+
+For a standard installation
+
+::
+
+    cd /path/to/buildstockbatch
+    python -m pip install -e ".[aws]"
+
+For developer installation
+
+::
+
+    cd /path/to/buildstockbatch
+    python -m pip install -e ".[dev,aws]"
+    pre-commit install
+
+AWS User Configuration
+......................
+
+Follow the instructions for :ref:`aws-user-config-local` on the local install.
+Your AWS user account or role needs to have pretty expansive permissions to
+create IAM roles, VPCs, compute resources, etc.
+
+.. todo::
+
+   Define permission set needed.

-The installation instructions are the same as the :ref:`local-install`
-installation. You will need to use an AWS account with appropriate permissions.
-The first time you run ``buildstock_aws`` it may take several minutes,
-especially over a slower internet connection as it is downloading and building a docker image.
+   For NREL users, the typical ``resbldg-user`` or ``developers`` role in the
+   nrel-aws-resbldg account is probably insufficient.
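Since the permissions described above attach to whatever identity your local credentials resolve to, it can be worth confirming that identity before launching a batch. A minimal check with boto3 (the printed ARN is whatever your default AWS profile provides)::

    import boto3

    # Prints the ARN of the IAM user or role the default credentials map to.
    print(boto3.client("sts").get_caller_identity()["Arn"])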
diff --git a/docs/project_defn.rst b/docs/project_defn.rst
index c592c1b8..b85c8413 100644
--- a/docs/project_defn.rst
+++ b/docs/project_defn.rst
@@ -218,42 +218,59 @@ on the `AWS Batch `_ service.
    on AWS. In a future version we will break backwards compatibility in the
    config file and have more consistent options.

-* ``job_identifier``: A unique string that starts with an alphabetical character,
+* ``job_identifier``: (required) A unique string that starts with an alphabetical character,
   is up to 10 characters long, and only has letters, numbers or underscore. This is
   used to name all the AWS service objects to be created and differentiate it
   from other jobs.
-* ``s3``: Configuration for project data storage on s3. When running on AWS,
+* ``s3``: (required) Configuration for project data storage on s3. When running on AWS,
   this overrides the s3 configuration in the :ref:`post-config-opts`.

   * ``bucket``: The s3 bucket this project will use for simulation output and
     processed data storage.
   * ``prefix``: The s3 prefix at which the data will be stored.

-* ``region``: The AWS region in which the batch will be run and data stored.
-* ``use_spot``: true or false. Defaults to false if missing. This tells the project
+* ``region``: (required) The AWS region in which the batch will be run and data stored. Probably "us-west-2" if you're at NREL.
+* ``use_spot``: (optional) true or false. Defaults to true if missing. This tells the project
   to use the `Spot Market `_ for data
   simulations, which typically yields about 60-70% cost savings.
-* ``spot_bid_percent``: Percent of on-demand price you're willing to pay for
+* ``spot_bid_percent``: (optional) Percent of on-demand price you're willing to pay for
   your simulations. The batch will wait to run until the price drops below this
-  level.
-* ``batch_array_size``: Number of concurrent simulations to run. Max: 10000.
-* ``notifications_email``: Email to notify you of simulation completion.
+  level. Usually leave this one blank.
+* ``batch_array_size``: (required) Number of concurrent simulations to run. Max: 10,000.
+  Unless this is a small run with fewer than 100,000 simulations, just set this
+  to 10,000.
+* ``notifications_email``: (required) Email to notify you of simulation completion.
   You'll receive an email at the beginning where you'll need to accept the
-  subscription to receive further notification emails.
-* ``emr``: Optional key to specify options for postprocessing using an EMR cluster. Generally the defaults should work fine.
-
-  * ``manager_instance_type``: The `instance type`_ to use for the EMR master node. Default: ``m5.xlarge``.
-  * ``worker_instance_type``: The `instance type`_ to use for the EMR worker nodes. Default: ``r5.4xlarge``.
-  * ``worker_instance_count``: The number of worker nodes to use. Same as ``eagle.postprocessing.n_workers``.
-    Increase this for a large dataset. Default: 2.
-  * ``dask_worker_vcores``: The number of cores for each dask worker. Increase this if your dask workers are running out of memory. Default: 2.
+  subscription to receive further notification emails. This doesn't work right now.
+* ``dask``: (required) Dask configuration for postprocessing
+
+  * ``n_workers``: (required) Number of dask workers to use.
+  * ``scheduler_cpu``: (optional) One of ``[1024, 2048, 4096, 8192, 16384]``.
+    Default: 2048. CPU to allocate for the scheduler task. 1024 = 1 VCPU. See
+    `Fargate Task CPU and memory`_ for allowable combinations of CPU and
+    memory.
+  * ``scheduler_memory``: (optional) Amount of memory to allocate to the
+    scheduler task. Default: 8192. See `Fargate Task CPU and memory`_ for
+    allowable combinations of CPU and memory.
+  * ``worker_cpu``: (optional) One of ``[1024, 2048, 4096, 8192, 16384]``.
+    Default: 2048. CPU to allocate for the worker tasks. 1024 = 1 VCPU. See
+    `Fargate Task CPU and memory`_ for allowable combinations of CPU and
+    memory.
+  * ``worker_memory``: (optional) Amount of memory to allocate to the worker
+    tasks. Default: 8192. See `Fargate Task CPU and memory`_ for allowable
+    combinations of CPU and memory.

 * ``job_environment``: Specifies the computing requirements for each simulation.

-  * ``vcpus``: Number of CPUs needed. default: 1.
-  * ``memory``: Amount of RAM memory needed for each simulation in MiB. default 1024. For large multifamily buildings
+  * ``vcpus``: (optional) Number of CPUs needed. Default: 1. This probably doesn't need to be changed.
+  * ``memory``: (optional) Amount of RAM memory needed for each simulation in MiB. Default: 1024. For large multifamily buildings
     this works better if set to 2048.
+* ``tags``: (optional) This is a list of key-value pairs to attach as tags to
+  all the AWS objects created in the process of running the simulation. If you
+  are at NREL, please fill out the following tags so we can track and allocate
+  costs: ``billingId``, ``org``, and ``owner``.

 .. _instance type: https://aws.amazon.com/ec2/instance-types/
+.. _Fargate Task CPU and memory: https://docs.aws.amazon.com/AmazonECS/latest/developerguide/fargate-tasks-services.html#fargate-tasks-size

 .. _postprocessing:

@@ -288,7 +305,8 @@ Athena. This process requires appropriate access to an AWS account to be
 configured on your machine. You will need to set this up wherever you use
 buildstockbatch. If you don't have keys, consult your AWS administrator to get
 them set up. The appropriate keys are already installed on Eagle and Kestrel, so
-no action is required.
+no action is required. If you run on AWS, this step is already done since the
+simulation outputs are already on S3.

 * :ref:`Local AWS setup instructions `
 * `Detailed instructions from AWS `_
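The bullets above describe the ``aws`` section key by key. A hypothetical project-file fragment pulling them together, including the ``dask`` sizing, ``job_environment``, and ``tags`` keys that the shorter example in ``run_sims.rst`` below omits (all values are illustrative)::

    aws:
      job_identifier: national01
      s3:
        bucket: myorg-resstock
        prefix: national01_run01
      region: us-west-2
      use_spot: true
      batch_array_size: 10000
      notifications_email: your_email@somewhere.com
      dask:
        n_workers: 8
        scheduler_memory: 8192
        worker_memory: 8192
      job_environment:
        vcpus: 1
        memory: 2048
      tags:
        billingId: "000000"  # placeholder value
        org: myorg
        owner: your_username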
diff --git a/docs/run_sims.rst b/docs/run_sims.rst
index 6d99ca51..91d5aafb 100644
--- a/docs/run_sims.rst
+++ b/docs/run_sims.rst
@@ -84,6 +84,11 @@ tool.
 .. command-output:: buildstock_aws --help
    :ellipsis: 0,8

+The first time you run it may take several minutes to build and upload the
+docker image. ``buildstock_aws`` needs to stay running and connected to the
+internet while the batch simulation is running on AWS. We have found it useful
+to run from an EC2 instance for convenience, but that is not strictly necessary.
+
 AWS Specific Project configuration
 ..................................

@@ -93,7 +98,7 @@ file, something like this:

 .. code-block:: yaml

     aws:
-      # The job_identifier should be unique, start with alpha, and limited to 10 chars or data loss can occur
+      # The job_identifier should be unique, start with alpha, and limited to 10 chars
       job_identifier: national01
       s3:
         bucket: myorg-resstock
@@ -101,15 +106,22 @@ file, something like this:
       region: us-west-2
       use_spot: true
       batch_array_size: 10000
-      # To receive email updates on job progress accept the request to receive emails that will be sent from Amazon
-      notifications_email: your_email@somewhere.com
+      dask:
+        n_workers: 8
+      notifications_email: your_email@somewhere.com # doesn't work right now

 See :ref:`aws-config` for details.

 Cleaning up after yourself
 ..........................

-When the simulation and postprocessing is all complete, run ``buildstock_aws
---clean your_project_file.yml``. This will clean up all the AWS resources that
-were created on your behalf to run the simulations. Your results will still be
-on S3 and queryable in Athena.
+When the batch is done, ``buildstock_aws`` should clean up after itself.
+However, if something goes wrong, the cleanup script can be run with the
+``--clean`` option like so:
+
+::
+
+    buildstock_aws --clean your_project_file.yml
+
+This will clean up all the AWS resources that were created on your behalf to run
+the simulations. Your results will still be on S3 and queryable in Athena.
From 377c75855d9f32bda1ef609f8718ca45415f1f4a Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Fri, 15 Mar 2024 11:50:12 -0600
Subject: [PATCH 53/53] update changelog [skip ci]

---
 docs/changelog/changelog_dev.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/changelog/changelog_dev.rst b/docs/changelog/changelog_dev.rst
index bd619d14..d13ed5cd 100644
--- a/docs/changelog/changelog_dev.rst
+++ b/docs/changelog/changelog_dev.rst
@@ -35,3 +35,9 @@ Development Changelog
     :pullreq: 426

     A bugfix for gracefully handling empty data_point_out.json files.
+
+    .. change::
+        :tags: aws, feature
+        :pullreq: 345
+
+        Major update to get AWS Batch run environment working.