From 3231b329d99eff19dff986ace6cc152d7482a06e Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Thu, 1 Nov 2018 17:05:00 -0700 Subject: [PATCH 01/28] first pass at ssh submit log streaming --- aztk/client/base/base_operations.py | 2 +- .../base/helpers/get_application_log.py | 6 ++- aztk/internal/cluster_data/blob_data.py | 6 +-- aztk/internal/cluster_data/cluster_data.py | 21 +++++---- aztk/node_scripts/core/config.py | 43 ----------------- aztk/node_scripts/scheduling/common.py | 46 +++++++++++++++++-- aztk/node_scripts/scheduling/submit.py | 4 +- aztk/utils/azure_api.py | 3 +- 8 files changed, 63 insertions(+), 68 deletions(-) diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index a1cff5be..5dca32ce 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -12,7 +12,7 @@ class BaseOperations: Attributes: batch_client (:obj:`azure.batch.batch_service_client.BatchServiceClient`): Client used to interact with the Azure Batch service. - blob_client (:obj:`azure.storage.blob.BlockBlobService`): Client used to interact with the Azure Storage + blob_client (:obj:`azure.storage.blob.CloudStorageAccount`): Client used to interact with the Azure Storage Blob service. secrets_configuration (:obj:`aztk.models.SecretsConfiguration`): Model that holds AZTK secrets used to authenticate with Azure and the clusters. diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py index b992f2b0..dc8ffc8d 100644 --- a/aztk/client/base/helpers/get_application_log.py +++ b/aztk/client/base/helpers/get_application_log.py @@ -51,7 +51,7 @@ def __get_output_file_properties(batch_client, cluster_id: str, application_name def get_log_from_storage(blob_client, container_name, application_name, task): """ Args: - blob_client (:obj:`azure.storage.blob.BlockBlobService`): Client used to interact with the Azure Storage + block_blob_client (:obj:`azure.storage.blob.CloudStorageAccount`): Client used to interact with the Azure Storage Blob service. container_name (:obj:`str`): the name of the Azure Blob storage container to get data from application_name (:obj:`str`): the name of the application to get logs for @@ -59,7 +59,9 @@ def get_log_from_storage(blob_client, container_name, application_name, task): """ try: - blob = blob_client.get_blob_to_text(container_name, application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE) + block_blob_client = blob_client.create_block_blob_client() + blob = block_blob_client.get_blob_to_text(container_name, + application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE) except azure.common.AzureMissingResourceHttpError: raise error.AztkError("Logs not found in your storage account. 
They were either deleted or never existed.") diff --git a/aztk/internal/cluster_data/blob_data.py b/aztk/internal/cluster_data/blob_data.py index 6e3fe117..699d8eed 100644 --- a/aztk/internal/cluster_data/blob_data.py +++ b/aztk/internal/cluster_data/blob_data.py @@ -13,16 +13,16 @@ def __init__(self, blob_client: BlockBlobService, container: str, blob: str): self.container = container self.blob = blob self.dest = blob - self.blob_client = blob_client + self.block_blob_client = blob_client def to_resource_file(self, dest: str = None) -> batch_models.ResourceFile: - sas_token = self.blob_client.generate_blob_shared_access_signature( + sas_token = self.block_blob_client.generate_blob_shared_access_signature( self.container, self.blob, permission=BlobPermissions.READ, expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365), ) - sas_url = self.blob_client.make_blob_url(self.container, self.blob, sas_token=sas_token) + sas_url = self.block_blob_client.make_blob_url(self.container, self.blob, sas_token=sas_token) return batch_models.ResourceFile(file_path=dest or self.dest, blob_source=sas_url) diff --git a/aztk/internal/cluster_data/cluster_data.py b/aztk/internal/cluster_data/cluster_data.py index adc10d5c..aae5a8de 100644 --- a/aztk/internal/cluster_data/cluster_data.py +++ b/aztk/internal/cluster_data/cluster_data.py @@ -3,6 +3,7 @@ import azure.common import yaml +from azure.storage.common import CloudStorageAccount from msrest.exceptions import ClientRequestError from aztk import error @@ -23,8 +24,8 @@ class ClusterData: APPLICATIONS_DIR = "applications" CLUSTER_CONFIG_FILE = "config.yaml" - def __init__(self, blob_client, cluster_id: str): - self.blob_client = blob_client + def __init__(self, blob_client: CloudStorageAccount, cluster_id: str): + self.block_blob_client = blob_client.create_block_blob_service() self.cluster_id = cluster_id self._ensure_container() @@ -33,13 +34,13 @@ def save_cluster_config(self, cluster_config): blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE content = yaml.dump(cluster_config) container_name = cluster_config.cluster_id - self.blob_client.create_blob_from_text(container_name, blob_path, content) + self.block_blob_client.create_blob_from_text(container_name, blob_path, content) @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def read_cluster_config(self): blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE try: - result = self.blob_client.get_blob_to_text(self.cluster_id, blob_path) + result = self.block_blob_client.get_blob_to_text(self.cluster_id, blob_path) return yaml.load(result.content) except azure.common.AzureMissingResourceHttpError: raise error.AztkError("Cluster {} doesn't have cluster configuration in storage".format(self.cluster_id)) @@ -48,13 +49,13 @@ def read_cluster_config(self): @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def upload_file(self, blob_path: str, local_path: str) -> BlobData: - self.blob_client.create_blob_from_path(self.cluster_id, blob_path, local_path) - return BlobData(self.blob_client, self.cluster_id, blob_path) + self.block_blob_client.create_blob_from_path(self.cluster_id, blob_path, local_path) + return BlobData(self.block_blob_client, self.cluster_id, blob_path) @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def upload_bytes(self, blob_path: str, bytes_io: io.BytesIO) -> BlobData: 
- self.blob_client.create_blob_from_bytes(self.cluster_id, blob_path, bytes_io.getvalue()) - return BlobData(self.blob_client, self.cluster_id, blob_path) + self.block_blob_client.create_blob_from_bytes(self.cluster_id, blob_path, bytes_io.getvalue()) + return BlobData(self.block_blob_client, self.cluster_id, blob_path) def upload_cluster_file(self, blob_path: str, local_path: str) -> BlobData: blob_data = self.upload_bytes(self.CLUSTER_DIR + "/" + blob_path, local_path) @@ -71,8 +72,8 @@ def upload_node_data(self, node_data: NodeData) -> BlobData: @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def _ensure_container(self): - self.blob_client.create_container(self.cluster_id, fail_on_exist=False) + self.block_blob_client.create_container(self.cluster_id, fail_on_exist=False) @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def delete_container(self, container_name: str): - self.blob_client.delete_container(container_name) + self.block_blob_client.delete_container(container_name) diff --git a/aztk/node_scripts/core/config.py b/aztk/node_scripts/core/config.py index 7a1727c9..7766cb70 100644 --- a/aztk/node_scripts/core/config.py +++ b/aztk/node_scripts/core/config.py @@ -1,14 +1,6 @@ import os import re -import azure.batch.batch_auth as batchauth -import azure.batch.batch_service_client as batch -import azure.storage.blob as blob -from azure.common.credentials import ServicePrincipalCredentials -from azure.mgmt.batch import BatchManagementClient -from azure.mgmt.storage import StorageManagementClient -from azure.storage.common import CloudStorageAccount - from aztk.node_scripts.core import log from aztk.spark import Client, models @@ -40,41 +32,6 @@ storage_account_suffix = os.environ.get("STORAGE_ACCOUNT_SUFFIX") -def get_blob_client() -> blob.BlockBlobService: - if not storage_resource_id: - return blob.BlockBlobService( - account_name=storage_account_name, account_key=storage_account_key, endpoint_suffix=storage_account_suffix) - else: - credentials = ServicePrincipalCredentials( - client_id=client_id, secret=credential, tenant=tenant_id, resource="https://management.core.windows.net/") - m = RESOURCE_ID_PATTERN.match(storage_resource_id) - accountname = m.group("account") - subscription = m.group("subscription") - resourcegroup = m.group("resourcegroup") - mgmt_client = StorageManagementClient(credentials, subscription) - key = (mgmt_client.storage_accounts.list_keys(resource_group_name=resourcegroup, account_name=accountname) - .keys[0].value) - storage_client = CloudStorageAccount(accountname, key) - return storage_client.create_block_blob_service() - - -def get_batch_client() -> batch.BatchServiceClient: - if not batch_resource_id: - base_url = batch_service_url - credentials = batchauth.SharedKeyCredentials(batch_account_name, batch_account_key) - else: - credentials = ServicePrincipalCredentials( - client_id=client_id, secret=credential, tenant=tenant_id, resource="https://management.core.windows.net/") - m = RESOURCE_ID_PATTERN.match(batch_resource_id) - batch_client = BatchManagementClient(credentials, m.group("subscription")) - account = batch_client.batch_account.get(m.group("resourcegroup"), m.group("account")) - base_url = "https://%s/" % account.account_endpoint - credentials = ServicePrincipalCredentials( - client_id=client_id, secret=credential, tenant=tenant_id, resource="https://batch.core.windows.net/") - - return 
batch.BatchServiceClient(credentials, base_url=base_url) - - def get_spark_client(): if all([batch_resource_id, client_id, credential, storage_resource_id, tenant_id]): serice_principle_configuration = models.ServicePrincipalConfiguration( diff --git a/aztk/node_scripts/scheduling/common.py b/aztk/node_scripts/scheduling/common.py index 40593296..7ddace78 100644 --- a/aztk/node_scripts/scheduling/common.py +++ b/aztk/node_scripts/scheduling/common.py @@ -1,10 +1,15 @@ import datetime import os +import shlex +import subprocess +import sys import azure.batch.models as batch_models import azure.storage.blob as blob import requests import yaml +from azure.storage.common import CloudStorageAccount +from tests.integration_tests.spark.sdk.get_client import get_spark_client from aztk.node_scripts.core import config from aztk.node_scripts.scheduling import scheduling_target @@ -42,7 +47,7 @@ def upload_file_to_container(container_name, """ Uploads a local file to an Azure Blob storage container. :param blob_client: A blob service client. - :type blocblob_clientk_blob_client: `azure.storage.blob.BlockBlobService` + :type blob_client: `azure.storage.common.CloudStorageAccount` :param str container_name: The name of the Azure Blob storage container. :param str file_path: The local path to the file. :param str node_path: Path on the local node. By default will be the same as file_path @@ -50,6 +55,7 @@ def upload_file_to_container(container_name, :return: A ResourceFile initialized with a SAS URL appropriate for Batch tasks. """ + block_blob_client = blob_client.create_block_blob_service() file_path = file_path blob_name = None if use_full_path: @@ -61,18 +67,18 @@ def upload_file_to_container(container_name, if not node_path: node_path = blob_name - blob_client.create_container(container_name, fail_on_exist=False) + block_blob_client.create_container(container_name, fail_on_exist=False) - blob_client.create_blob_from_path(container_name, blob_path, file_path) + block_blob_client.create_blob_from_path(container_name, blob_path, file_path) - sas_token = blob_client.generate_blob_shared_access_signature( + sas_token = block_blob_client.generate_blob_shared_access_signature( container_name, blob_path, permission=blob.BlobPermissions.READ, expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7), ) - sas_url = blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token) + sas_url = block_blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token) return batch_models.ResourceFile(file_path=node_path, blob_source=sas_url) @@ -99,3 +105,33 @@ def download_task_definition(task_sas_url): response = scheduling_target.http_request_wrapper(requests.get, task_sas_url, timeout=10) yaml_serialized_task = response.content return yaml.load(yaml_serialized_task) + + +def stream_upload_to_storage( + blob_client: CloudStorageAccount, + stream, + application_name, +): + """ + Args: + blob_client (`azure.storage.common.CloudStorageAccount`) + stream (`obj:IOBase`): opened stream to upload as the blob content + """ + append_blob_client = blob_client.create_append_blob_service() + append_blob_client.create_blob( + container_name=os.environ["STORAGE_LOGS_CONTAINER"], + blob_name=application_name + "/output.log", + if_none_match="*", + ) + append_blob_client.append_blob_from_stream( + container_name=os.environ["STORAGE_LOGS_CONTAINER"], + blob_name=application_name + "/output.log", + stream=stream, + ) + + +def run_command(spark_client, command, application_name): + process = 
subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) + stream_upload_to_storage(spark_client.blob_client, process.stdout, application_name) + rc = process.poll() + return rc diff --git a/aztk/node_scripts/scheduling/submit.py b/aztk/node_scripts/scheduling/submit.py index f6eb26ed..b435997d 100644 --- a/aztk/node_scripts/scheduling/submit.py +++ b/aztk/node_scripts/scheduling/submit.py @@ -86,8 +86,8 @@ def ssh_submit(task_sas_url): # update task table before running task = insert_task_into_task_table(aztk_cluster_id, task_definition) # run task and upload log - exit_code = subprocess.call(cmd.to_str(), shell=True) - common.upload_log(config.blob_client, application) + exit_code = common.run_command(config.spark_client, cmd.to_str(), application.application_name) + # common.upload_log(config.blob_client, application) #TODO: enable logging # print("completed application, updating storage table") mark_task_complete(aztk_cluster_id, task.id, exit_code) diff --git a/aztk/utils/azure_api.py b/aztk/utils/azure_api.py index 7f49745a..3b4f1f4a 100644 --- a/aztk/utils/azure_api.py +++ b/aztk/utils/azure_api.py @@ -108,9 +108,8 @@ def make_blob_client(secrets): account_name=accountname, ).keys[0].value) storage_client = CloudStorageAccount(accountname, key) - blob_client = storage_client.create_block_blob_service() - return blob_client + return storage_client def make_table_service(secrets): From 68bfaa88cf2c65cf01b70bf7a37a1b156510ea22 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Thu, 1 Nov 2018 17:14:01 -0700 Subject: [PATCH 02/28] fix some bugs --- aztk/node_scripts/scheduling/common.py | 1 - aztk/utils/helpers.py | 22 +++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/aztk/node_scripts/scheduling/common.py b/aztk/node_scripts/scheduling/common.py index 7ddace78..528810a7 100644 --- a/aztk/node_scripts/scheduling/common.py +++ b/aztk/node_scripts/scheduling/common.py @@ -9,7 +9,6 @@ import requests import yaml from azure.storage.common import CloudStorageAccount -from tests.integration_tests.spark.sdk.get_client import get_spark_client from aztk.node_scripts.core import config from aztk.node_scripts.scheduling import scheduling_target diff --git a/aztk/utils/helpers.py b/aztk/utils/helpers.py index 461c3756..0c213ffa 100644 --- a/aztk/utils/helpers.py +++ b/aztk/utils/helpers.py @@ -65,19 +65,21 @@ def wait_for_task_to_complete(job_id: str, task_id: str, batch_client): def upload_text_to_container(container_name: str, application_name: str, content: str, file_path: str, blob_client=None) -> batch_models.ResourceFile: + block_blob_client = blob_client.create_block_blob_service() + blob_name = file_path blob_path = application_name + "/" + blob_name # + '/' + time_stamp + '/' + blob_name - blob_client.create_container(container_name, fail_on_exist=False) - blob_client.create_blob_from_text(container_name, blob_path, content) + block_blob_client.create_container(container_name, fail_on_exist=False) + block_blob_client.create_blob_from_text(container_name, blob_path, content) - sas_token = blob_client.generate_blob_shared_access_signature( + sas_token = block_blob_client.generate_blob_shared_access_signature( container_name, blob_path, permission=blob.BlobPermissions.READ, expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365), ) - sas_url = blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token) + sas_url = block_blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token) return 
batch_models.ResourceFile(file_path=blob_name, blob_source=sas_url) @@ -91,7 +93,7 @@ def upload_file_to_container(container_name, """ Uploads a local file to an Azure Blob storage container. :param blob_client: A blob service client. - :type blocblob_clientk_blob_client: `azure.storage.blob.BlockBlobService` + :type blob_client: `azure.storage.common.CloudStorageAccount` :param str container_name: The name of the Azure Blob storage container. :param str file_path: The local path to the file. :param str node_path: Path on the local node. By default will be the same as file_path @@ -99,6 +101,8 @@ def upload_file_to_container(container_name, :return: A ResourceFile initialized with a SAS URL appropriate for Batch tasks. """ + block_blob_client = blob_client.create_block_blob_service() + file_path = normalize_path(file_path) blob_name = None if use_full_path: @@ -110,18 +114,18 @@ def upload_file_to_container(container_name, if not node_path: node_path = blob_name - blob_client.create_container(container_name, fail_on_exist=False) + block_blob_client.create_container(container_name, fail_on_exist=False) - blob_client.create_blob_from_path(container_name, blob_path, file_path) + block_blob_client.create_blob_from_path(container_name, blob_path, file_path) - sas_token = blob_client.generate_blob_shared_access_signature( + sas_token = block_blob_client.generate_blob_shared_access_signature( container_name, blob_path, permission=blob.BlobPermissions.READ, expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7), ) - sas_url = blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token) + sas_url = block_blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token) return batch_models.ResourceFile(file_path=node_path, blob_source=sas_url) From f43fe740ef0785821a4e77f2c08fe5d6505f874f Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 2 Nov 2018 11:04:43 -0700 Subject: [PATCH 03/28] typo in function name --- aztk/client/base/helpers/get_application_log.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py index dc8ffc8d..96ba3c25 100644 --- a/aztk/client/base/helpers/get_application_log.py +++ b/aztk/client/base/helpers/get_application_log.py @@ -59,7 +59,7 @@ def get_log_from_storage(blob_client, container_name, application_name, task): """ try: - block_blob_client = blob_client.create_block_blob_client() + block_blob_client = blob_client.create_block_blob_service() blob = block_blob_client.get_blob_to_text(container_name, application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE) except azure.common.AzureMissingResourceHttpError: From c0f26395c8b2083b1d8dd74a363b6b443504bc4a Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 2 Nov 2018 14:34:05 -0700 Subject: [PATCH 04/28] refactor some functions, monkeypatch max_block_size --- aztk/node_scripts/scheduling/common.py | 7 ++- .../scheduling/scheduling_target.py | 50 ++++++++++++++++ aztk/node_scripts/scheduling/submit.py | 58 ++----------------- examples/src/main/python/pi.py | 5 +- 4 files changed, 63 insertions(+), 57 deletions(-) diff --git a/aztk/node_scripts/scheduling/common.py b/aztk/node_scripts/scheduling/common.py index 528810a7..6fc9ad50 100644 --- a/aztk/node_scripts/scheduling/common.py +++ b/aztk/node_scripts/scheduling/common.py @@ -116,7 +116,11 @@ def stream_upload_to_storage( blob_client (`azure.storage.common.CloudStorageAccount`) stream (`obj:IOBase`): opened stream to upload as the 
blob content """ + from azure.storage.blob.appendblobservice import AppendBlobService + AppendBlobService.MAX_BLOCK_SIZE = 10 * 1024 + append_blob_client = blob_client.create_append_blob_service() + append_blob_client.create_blob( container_name=os.environ["STORAGE_LOGS_CONTAINER"], blob_name=application_name + "/output.log", @@ -130,7 +134,8 @@ def stream_upload_to_storage( def run_command(spark_client, command, application_name): - process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) + print("running common.py run_command") + process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stream_upload_to_storage(spark_client.blob_client, process.stdout, application_name) rc = process.poll() return rc diff --git a/aztk/node_scripts/scheduling/scheduling_target.py b/aztk/node_scripts/scheduling/scheduling_target.py index 2500ee44..602561e4 100644 --- a/aztk/node_scripts/scheduling/scheduling_target.py +++ b/aztk/node_scripts/scheduling/scheduling_target.py @@ -1,10 +1,13 @@ import concurrent.futures +import datetime import os import time import requests from aztk import error +from aztk.models import Task, TaskState +from aztk.node_scripts.core import config def http_request_wrapper(func, *args, timeout=None, max_execution_time=300, **kwargs): @@ -51,3 +54,50 @@ def download_task_resource_files(task_id, resource_files): raise error.AztkError(errors) else: return [result.result() for result in done] + + +def insert_task_into_task_table(cluster_id, task_definition): + current_time = datetime.datetime.utcnow() + task = Task( + id=task_definition.id, + node_id=os.environ.get("AZ_BATCH_NODE_ID", None), + state=TaskState.Running, + state_transition_time=current_time, + command_line=task_definition.command_line, + start_time=current_time, + end_time=None, + exit_code=None, + failure_info=None, + ) + + config.spark_client.cluster._core_cluster_operations.insert_task_into_task_table(cluster_id, task) + return task + + +def get_task(cluster_id, task_id): + return config.spark_client.cluster._core_cluster_operations.get_task_from_table(cluster_id, task_id) + + +def mark_task_complete(cluster_id, task_id, exit_code): + current_time = datetime.datetime.utcnow() + + task = get_task(cluster_id, task_id) + task.end_time = current_time + task.exit_code = exit_code + task.state = TaskState.Completed + task.state_transition_time = current_time + + config.spark_client.cluster._core_cluster_operations.update_task_in_task_table(cluster_id, task) + + +def mark_task_failure(cluster_id, task_id, exit_code, failure_info): + current_time = datetime.datetime.utcnow() + + task = get_task(cluster_id, task_id) + task.end_time = current_time + task.exit_code = exit_code + task.state = TaskState.Failed + task.state_transition_time = current_time + task.failure_info = failure_info + + config.spark_client.cluster._core_cluster_operations.update_task_in_task_table(cluster_id, task) diff --git a/aztk/node_scripts/scheduling/submit.py b/aztk/node_scripts/scheduling/submit.py index b435997d..8e08518e 100644 --- a/aztk/node_scripts/scheduling/submit.py +++ b/aztk/node_scripts/scheduling/submit.py @@ -1,10 +1,8 @@ -import datetime import logging import os import subprocess import sys -from aztk.models import Task, TaskState from aztk.node_scripts.core import config from aztk.node_scripts.scheduling import common, scheduling_target from aztk.utils.command_builder import CommandBuilder @@ -84,68 +82,22 @@ def ssh_submit(task_sas_url): aztk_cluster_id = 
os.environ.get("AZTK_CLUSTER_ID") try: # update task table before running - task = insert_task_into_task_table(aztk_cluster_id, task_definition) + task = scheduling_target.insert_task_into_task_table(aztk_cluster_id, task_definition) # run task and upload log - exit_code = common.run_command(config.spark_client, cmd.to_str(), application.application_name) + exit_code = common.run_command(config.spark_client, cmd.to_str(), application.name) # common.upload_log(config.blob_client, application) #TODO: enable logging # print("completed application, updating storage table") - mark_task_complete(aztk_cluster_id, task.id, exit_code) + scheduling_target.mark_task_complete(aztk_cluster_id, task.id, exit_code) except Exception as e: #TODO: enable logging # print("application failed, updating storage table") - mark_task_failure(aztk_cluster_id, task_definition.id, exit_code, str(e)) + import traceback + scheduling_target.mark_task_failure(aztk_cluster_id, task_definition.id, exit_code, traceback.format_exc(e)) return exit_code -def insert_task_into_task_table(cluster_id, task_definition): - current_time = datetime.datetime.utcnow() - task = Task( - id=task_definition.id, - node_id=os.environ.get("AZ_BATCH_NODE_ID", None), - state=TaskState.Running, - state_transition_time=current_time, - command_line=task_definition.command_line, - start_time=current_time, - end_time=None, - exit_code=None, - failure_info=None, - ) - - config.spark_client.cluster._core_cluster_operations.insert_task_into_task_table(cluster_id, task) - return task - - -def get_task(cluster_id, task_id): - return config.spark_client.cluster._core_cluster_operations.get_task_from_table(cluster_id, task_id) - - -def mark_task_complete(cluster_id, task_id, exit_code): - current_time = datetime.datetime.utcnow() - - task = get_task(cluster_id, task_id) - task.end_time = current_time - task.exit_code = exit_code - task.state = TaskState.Completed - task.state_transition_time = current_time - - config.spark_client.cluster._core_cluster_operations.update_task_in_task_table(cluster_id, task) - - -def mark_task_failure(cluster_id, task_id, exit_code, failure_info): - current_time = datetime.datetime.utcnow() - - task = get_task(cluster_id, task_id) - task.end_time = current_time - task.exit_code = exit_code - task.state = TaskState.Failed - task.state_transition_time = current_time - task.failure_info = failure_info - - config.spark_client.cluster._core_cluster_operations.update_task_in_task_table(cluster_id, task) - - if __name__ == "__main__": exit_code = 1 diff --git a/examples/src/main/python/pi.py b/examples/src/main/python/pi.py index 5839cc28..1b05a8e3 100755 --- a/examples/src/main/python/pi.py +++ b/examples/src/main/python/pi.py @@ -18,12 +18,11 @@ from __future__ import print_function import sys -from random import random from operator import add +from random import random from pyspark.sql import SparkSession - if __name__ == "__main__": """ Usage: pi [partitions] @@ -39,7 +38,7 @@ def f(_): x = random() * 2 - 1 y = random() * 2 - 1 - return 1 if x ** 2 + y ** 2 <= 1 else 0 + return 1 if x**2 + y**2 <= 1 else 0 count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add) print("Pi is roughly %f" % (4.0 * count / n)) From 0b4bdf39f9f82ddb5145b66354d4a3b13c1ef146 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 2 Nov 2018 14:37:42 -0700 Subject: [PATCH 05/28] docstring, fix bad traceback call --- aztk/node_scripts/scheduling/common.py | 2 +- aztk/node_scripts/scheduling/submit.py | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/aztk/node_scripts/scheduling/common.py b/aztk/node_scripts/scheduling/common.py index 6fc9ad50..3d5ba941 100644 --- a/aztk/node_scripts/scheduling/common.py +++ b/aztk/node_scripts/scheduling/common.py @@ -115,6 +115,7 @@ def stream_upload_to_storage( Args: blob_client (`azure.storage.common.CloudStorageAccount`) stream (`obj:IOBase`): opened stream to upload as the blob content + application_name (`str`): the name of the application to uploads logs for """ from azure.storage.blob.appendblobservice import AppendBlobService AppendBlobService.MAX_BLOCK_SIZE = 10 * 1024 @@ -134,7 +135,6 @@ def stream_upload_to_storage( def run_command(spark_client, command, application_name): - print("running common.py run_command") process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stream_upload_to_storage(spark_client.blob_client, process.stdout, application_name) rc = process.poll() diff --git a/aztk/node_scripts/scheduling/submit.py b/aztk/node_scripts/scheduling/submit.py index 8e08518e..2ce35453 100644 --- a/aztk/node_scripts/scheduling/submit.py +++ b/aztk/node_scripts/scheduling/submit.py @@ -93,7 +93,7 @@ def ssh_submit(task_sas_url): #TODO: enable logging # print("application failed, updating storage table") import traceback - scheduling_target.mark_task_failure(aztk_cluster_id, task_definition.id, exit_code, traceback.format_exc(e)) + scheduling_target.mark_task_failure(aztk_cluster_id, task_definition.id, exit_code, traceback.format_exc()) return exit_code From 2d2223707f259f9feff9487591021f781fab90e8 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 2 Nov 2018 15:52:36 -0700 Subject: [PATCH 06/28] start shift from print to logger --- aztk/node_scripts/install/create_user.py | 14 +++++++++----- aztk/node_scripts/install/install.py | 13 +++++++------ aztk/node_scripts/install/pick_master.py | 12 ++++++------ aztk/node_scripts/scheduling/submit.py | 18 ++++++------------ 4 files changed, 28 insertions(+), 29 deletions(-) diff --git a/aztk/node_scripts/install/create_user.py b/aztk/node_scripts/install/create_user.py index 650f0e26..12e63523 100644 --- a/aztk/node_scripts/install/create_user.py +++ b/aztk/node_scripts/install/create_user.py @@ -1,10 +1,13 @@ import os +from datetime import datetime, timedelta, timezone + import azure.batch.models as batch_models +import yaml from azure.batch.models import BatchErrorException -from Cryptodome.PublicKey import RSA from Cryptodome.Cipher import AES, PKCS1_OAEP -from datetime import datetime, timezone, timedelta -import yaml +from Cryptodome.PublicKey import RSA + +from aztk.node_scripts.core import log """ Creates a user if the user configuration file at $AZTK_WORKING_DIR/user.yaml exists """ @@ -14,7 +17,7 @@ def create_user(batch_client): path = os.path.join(os.environ["AZTK_WORKING_DIR"], "user.yaml") if not os.path.isfile(path): - print("No user to create.") + log.info("No user to create.") return with open(path, "r", encoding="UTF-8") as file: @@ -35,7 +38,8 @@ def create_user(batch_client): ), ) except BatchErrorException as e: - print(e) + import traceback + log.info(traceback.format_exc()) def decrypt_password(user_conf): diff --git a/aztk/node_scripts/install/install.py b/aztk/node_scripts/install/install.py index 11ade3be..e3b7f30b 100644 --- a/aztk/node_scripts/install/install.py +++ b/aztk/node_scripts/install/install.py @@ -3,14 +3,15 @@ from aztk.internal import cluster_data from aztk.models.plugins import PluginTarget from aztk.node_scripts 
import wait_until_master_selected -from aztk.node_scripts.core import config -from aztk.node_scripts.install import (create_user, pick_master, plugins, spark, spark_container) +from aztk.node_scripts.core import config, log +from aztk.node_scripts.install import (create_user, pick_master, plugins, + spark, spark_container) def read_cluster_config(): data = cluster_data.ClusterData(config.blob_client, config.cluster_id) cluster_config = data.read_cluster_config() - print("Got cluster config", cluster_config) + log.info("Got cluster config: %s", cluster_config) return cluster_config @@ -62,11 +63,11 @@ def setup_spark_container(): """ is_master = os.environ.get("AZTK_IS_MASTER") == "true" is_worker = os.environ.get("AZTK_IS_WORKER") == "true" - print("Setting spark container. Master: ", is_master, ", Worker: ", is_worker) + log.info("Setting spark container. Master: %s, Worker: %s", is_master, is_worker) - print("Copying spark setup config") + log.info("Copying spark setup config") spark.setup_conf() - print("Done copying spark setup config") + log.info("Done copying spark setup config") spark.setup_connection() diff --git a/aztk/node_scripts/install/pick_master.py b/aztk/node_scripts/install/pick_master.py index a02f9a13..5046191e 100644 --- a/aztk/node_scripts/install/pick_master.py +++ b/aztk/node_scripts/install/pick_master.py @@ -6,7 +6,7 @@ from azure.batch.models import BatchErrorException from msrest.exceptions import ClientRequestError -from aztk.node_scripts.core import config +from aztk.node_scripts.core import config, log MASTER_NODE_METADATA_KEY = "_spark_master_node" @@ -41,7 +41,7 @@ def try_assign_self_as_master(client: batch.BatchServiceClient, pool: batchmodel ) return True except (BatchErrorException, ClientRequestError): - print("Couldn't assign itself as master the pool because the pool was modified since last get.") + log.info("Couldn't assign itself as master the pool because the pool was modified since last get.") return False @@ -61,17 +61,17 @@ def find_master(client: batch.BatchServiceClient) -> bool: if master: if master == config.node_id: - print("Node is already the master '{0}'".format(master)) + log.info("Node is already the master '{0}'".format(master)) return True else: - print("Pool already has a master '{0}'. This node will be a worker".format(master)) + log.info("Pool already has a master '{0}'. This node will be a worker".format(master)) return False else: - print("Pool has no master. Trying to assign itself! ({0}/5)".format(i + 1)) + log.info("Pool has no master. Trying to assign itself! ({0}/5)".format(i + 1)) result = try_assign_self_as_master(client, pool) if result: - print("Assignment was successful! Node {0} is the new master.".format(config.node_id)) + log.info("Assignment was successful! 
Node {0} is the new master.".format(config.node_id))
             return True
 
     raise CannotAllocateMasterError("Unable to assign node as a master in 5 tries")
diff --git a/aztk/node_scripts/scheduling/submit.py b/aztk/node_scripts/scheduling/submit.py
index 2ce35453..865a6239 100644
--- a/aztk/node_scripts/scheduling/submit.py
+++ b/aztk/node_scripts/scheduling/submit.py
@@ -3,7 +3,7 @@
 import subprocess
 import sys
 
-from aztk.node_scripts.core import config
+from aztk.node_scripts.core import config, log
 from aztk.node_scripts.scheduling import common, scheduling_target
 from aztk.utils.command_builder import CommandBuilder
 
@@ -47,9 +47,7 @@ def __app_submit_cmd(application):
         os.path.expandvars(application.application) + " " +
         " ".join(["'" + str(app_arg) + "'" for app_arg in (application.application_args or [])]))
 
-    with open("spark-submit.txt", mode="w", encoding="UTF-8") as stream:
-        stream.write(spark_submit_cmd.to_str())
-
+    log.info("Spark submit cmd: %s", spark_submit_cmd.to_str())
     return spark_submit_cmd
 
 
@@ -63,8 +61,7 @@ def receive_submit_request(application_file_path):
     cmd = __app_submit_cmd(application)
     exit_code = -1
     try:
-        exit_code = subprocess.call(cmd.to_str(), shell=True)
-        common.upload_log(blob_client, application)
+        exit_code = common.run_command(config.spark_client, cmd.to_str(), application.name)
     except Exception as e:
         common.upload_error_log(str(e), os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml"))
     return exit_code
@@ -85,13 +82,10 @@ def ssh_submit(task_sas_url):
         task = scheduling_target.insert_task_into_task_table(aztk_cluster_id, task_definition)
         # run task and upload log
         exit_code = common.run_command(config.spark_client, cmd.to_str(), application.name)
-        # common.upload_log(config.blob_client, application)
-        #TODO: enable logging
-        # print("completed application, updating storage table")
+        log.info("completed application, updating storage table")
         scheduling_target.mark_task_complete(aztk_cluster_id, task.id, exit_code)
     except Exception as e:
-        #TODO: enable logging
-        # print("application failed, updating storage table")
+        log.info("application failed, updating storage table")
         import traceback
         scheduling_target.mark_task_failure(aztk_cluster_id, task_definition.id, exit_code, traceback.format_exc())
 
     return exit_code
@@ -112,5 +106,5 @@
             os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml"))
     else:
         exit_code = receive_submit_request(os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml"))
-    # print("exit code", exit_code)
+    log.info("Exit code: %s", str(exit_code))
     sys.exit(exit_code)

From 6d56fec66ecf7f6440b97361aad58fac3114f3c3 Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Fri, 2 Nov 2018 16:07:01 -0700
Subject: [PATCH 07/28] add todo

---
 aztk/client/base/helpers/get_application_log.py | 1 +
 aztk/node_scripts/core/logger.py                | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py
index 96ba3c25..cd8ae596 100644
--- a/aztk/client/base/helpers/get_application_log.py
+++ b/aztk/client/base/helpers/get_application_log.py
@@ -48,6 +48,7 @@
         raise e
 
 
+# TODO: stream log from storage
 def get_log_from_storage(blob_client, container_name, application_name, task):
     """
     Args:
diff --git a/aztk/node_scripts/core/logger.py b/aztk/node_scripts/core/logger.py
index dab61457..d0fc4b90 100644
--- a/aztk/node_scripts/core/logger.py
+++
b/aztk/node_scripts/core/logger.py @@ -1,9 +1,10 @@ -import sys import logging +import sys log = logging.getLogger("aztk.node-agent") DEFAULT_FORMAT = "%(message)s" +VERBOSE_FORMAT = "[%(asctime)s] [%(filename)s:%(module)s:%(funcName)s:%(lineno)d] %(levelname)s - %(message)s" def setup_logging(): @@ -11,4 +12,4 @@ def setup_logging(): logging.root.removeHandler(handler) log.setLevel(logging.INFO) - logging.basicConfig(stream=sys.stdout, format=DEFAULT_FORMAT) + logging.basicConfig(stream=sys.stdout, format=VERBOSE_FORMAT) From 2118f7a00a52199cc411a8573c51bfd1f79cae5d Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 2 Nov 2018 16:12:21 -0700 Subject: [PATCH 08/28] yapf --- aztk/node_scripts/install/install.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/aztk/node_scripts/install/install.py b/aztk/node_scripts/install/install.py index e3b7f30b..0488a688 100644 --- a/aztk/node_scripts/install/install.py +++ b/aztk/node_scripts/install/install.py @@ -4,8 +4,7 @@ from aztk.models.plugins import PluginTarget from aztk.node_scripts import wait_until_master_selected from aztk.node_scripts.core import config, log -from aztk.node_scripts.install import (create_user, pick_master, plugins, - spark, spark_container) +from aztk.node_scripts.install import (create_user, pick_master, plugins, spark, spark_container) def read_cluster_config(): From 53a6f902d3f2031e74c1a1c99a438af6c90d6ff2 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 2 Nov 2018 16:28:55 -0700 Subject: [PATCH 09/28] add comments about planned implementation --- .../base/helpers/get_application_log.py | 45 ++++++------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py index cd8ae596..30582d00 100644 --- a/aztk/client/base/helpers/get_application_log.py +++ b/aztk/client/base/helpers/get_application_log.py @@ -49,6 +49,18 @@ def __get_output_file_properties(batch_client, cluster_id: str, application_name # TODO: stream log from storage +# TODO: find a way to flush memory or return a generator for blob content in Application +''' +def stream_log_from_storage(): # for the cli + last_read_byte = 0 + while task not completed: + blob = get_blob_to_text(start_range=last_read_byte) + print(blob.content, end='') + last_read_byte = last_read_byte + blob.properties.content_length + +''' + + def get_log_from_storage(blob_client, container_name, application_name, task): """ Args: @@ -57,7 +69,7 @@ def get_log_from_storage(blob_client, container_name, application_name, task): container_name (:obj:`str`): the name of the Azure Blob storage container to get data from application_name (:obj:`str`): the name of the application to get logs for task (:obj:`aztk.models.Task`): the aztk task for for this application - + """ try: block_blob_client = blob_client.create_block_blob_service() @@ -99,37 +111,6 @@ def get_log(base_operations, cluster_id: str, application_name: str, tail=False, if not __check_task_node_exist(base_operations.batch_client, cluster_id, task): return get_log_from_storage(base_operations.blob_client, cluster_id, application_name, task) - file = __get_output_file_properties(base_operations.batch_client, cluster_id, application_name) - target_bytes = file.content_length - - if target_bytes != current_bytes: - ocp_range = None - - if tail: - ocp_range = "bytes={0}-{1}".format(current_bytes, target_bytes - 1) - - stream = base_operations.batch_client.file.get_from_task( - job_id, task_id, 
output_file, batch_models.FileGetFromTaskOptions(ocp_range=ocp_range)) - content = helpers.read_stream_as_string(stream) - - return models.ApplicationLog( - name=application_name, - cluster_id=cluster_id, - application_state=task.state, - log=content, - total_bytes=target_bytes, - exit_code=task.exit_code, - ) - else: - return models.ApplicationLog( - name=application_name, - cluster_id=cluster_id, - application_state=task.state, - log="", - total_bytes=target_bytes, - exit_code=task.exit_code, - ) - def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): try: From ff9ddee2fff1c1814842da8eb9dbb8352fe3f3da Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 5 Nov 2018 13:56:59 -0800 Subject: [PATCH 10/28] create batch error context manager, start stream log download impl --- .../base/helpers/get_application_log.py | 142 ++++++++++-------- aztk/client/base/helpers/get_recent_job.py | 9 +- .../base/helpers/get_remote_login_settings.py | 10 +- aztk/client/base/helpers/get_task_state.py | 11 +- aztk/client/base/helpers/run.py | 9 +- aztk/client/cluster/helpers/copy.py | 10 +- .../client/base/helpers/list_applications.py | 10 +- aztk/spark/client/cluster/helpers/copy.py | 9 +- aztk/spark/client/cluster/helpers/create.py | 9 +- .../client/cluster/helpers/create_user.py | 8 +- aztk/spark/client/cluster/helpers/delete.py | 9 +- .../client/cluster/helpers/diagnostics.py | 8 +- aztk/spark/client/cluster/helpers/download.py | 9 +- aztk/spark/client/cluster/helpers/get.py | 9 +- .../cluster/helpers/get_application_state.py | 9 +- .../cluster/helpers/get_configuration.py | 9 +- .../helpers/get_remote_login_settings.py | 9 +- aztk/spark/client/cluster/helpers/list.py | 9 +- aztk/spark/client/cluster/helpers/node_run.py | 9 +- aztk/spark/client/cluster/helpers/run.py | 9 +- .../client/cluster/helpers/ssh_into_master.py | 9 +- aztk/spark/client/cluster/helpers/submit.py | 9 +- aztk/spark/client/cluster/helpers/wait.py | 5 +- aztk/spark/client/job/helpers/delete.py | 5 +- aztk/spark/client/job/helpers/get.py | 6 +- .../client/job/helpers/get_application.py | 5 +- .../client/job/helpers/get_application_log.py | 5 +- aztk/spark/client/job/helpers/list.py | 5 +- .../client/job/helpers/list_applications.py | 5 +- aztk/spark/client/job/helpers/stop.py | 9 +- aztk/spark/client/job/helpers/submit.py | 8 +- .../client/job/helpers/wait_until_complete.py | 8 +- aztk/utils/__init__.py | 1 + aztk/utils/batch_error_manager.py | 14 ++ 34 files changed, 161 insertions(+), 249 deletions(-) create mode 100644 aztk/utils/batch_error_manager.py diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py index 30582d00..51a2e46a 100644 --- a/aztk/client/base/helpers/get_application_log.py +++ b/aztk/client/base/helpers/get_application_log.py @@ -1,25 +1,19 @@ +import tempfile import time import azure import azure.batch.models as batch_models -from azure.batch.models import BatchErrorException from aztk import error, models from aztk.models import Task, TaskState -from aztk.utils import constants, helpers +from aztk.utils import batch_error_manager, constants -output_file = constants.TASK_WORKING_DIR + "/" + constants.SPARK_SUBMIT_LOGS_FILE - -def __check_task_node_exist(batch_client, cluster_id: str, task: Task) -> bool: - try: - batch_client.compute_node.get(cluster_id, task.node_id) - return True - except BatchErrorException: - return False +def convert_application_name_to_blob_path(application_name): + return 
application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE -def __wait_for_app_to_be_running(base_operations, cluster_id: str, application_name: str) -> Task: +def wait_for_batch_task(base_operations, cluster_id: str, application_name: str) -> Task: """ Wait for the batch task to leave the waiting state into running(or completed if it was fast enough) """ @@ -34,86 +28,106 @@ def __wait_for_app_to_be_running(base_operations, cluster_id: str, application_n return base_operations.get_batch_task(id=cluster_id, task_id=application_name) -def __get_output_file_properties(batch_client, cluster_id: str, application_name: str): - while True: - try: - file = helpers.get_file_properties(cluster_id, application_name, output_file, batch_client) - return file - except BatchErrorException as e: - if e.response.status_code == 404: - # TODO: log - time.sleep(5) - continue - else: - raise e - - -# TODO: stream log from storage -# TODO: find a way to flush memory or return a generator for blob content in Application -''' -def stream_log_from_storage(): # for the cli - last_read_byte = 0 - while task not completed: - blob = get_blob_to_text(start_range=last_read_byte) - print(blob.content, end='') - last_read_byte = last_read_byte + blob.properties.content_length +def wait_for_scheduling_target_task(base_operations, cluster_id, application_name): + # TODO: ensure get_task_state not None or throw + task = base_operations.get_task_from_table(cluster_id, application_name) + while task.state not in [TaskState.Completed, TaskState.Failed, Task.Running]: + time.sleep(3) + # TODO: enable logger + # log.debug("{} {}: application not yet complete".format(cluster_id, application_name)) + task = base_operations.get_task_from_table(cluster_id, application_name) + return task + + +def get_blob_from_storage(block_blob_client, container_name, application_name, stream, start_range): + try: + return block_blob_client.get_blob_to_stream( + container_name, + convert_application_name_to_blob_path(application_name), + stream, + start_range=start_range, + ) + except azure.common.AzureMissingResourceHttpError: + raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.") -''' +def get_log_from_storage(blob_client, container_name, application_name, task, current_bytes): + stream = tempfile.SpooledTemporaryFile(max_size=2 * 1024 * 1024) + blob = get_blob_from_storage(blob_client.create_block_blob_service, container_name, application_name, stream, + current_bytes) + return models.ApplicationLog( + name=application_name, + cluster_id=container_name, + application_state=task.state, + log=stream, + total_bytes=blob.properties.content_length, + exit_code=task.exit_code, + ) -def get_log_from_storage(blob_client, container_name, application_name, task): + +def stream_log_from_storage(base_operations, container_name, application_name, task): """ Args: - block_blob_client (:obj:`azure.storage.blob.CloudStorageAccount`): Client used to interact with the Azure Storage - Blob service. 
+ base_operations (:obj:`aztk.client.base.BaseOperations`): Base aztk client container_name (:obj:`str`): the name of the Azure Blob storage container to get data from application_name (:obj:`str`): the name of the application to get logs for task (:obj:`aztk.models.Task`): the aztk task for for this application - """ - try: - block_blob_client = blob_client.create_block_blob_service() - blob = block_blob_client.get_blob_to_text(container_name, - application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE) - except azure.common.AzureMissingResourceHttpError: - raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.") + stream = tempfile.SpooledTemporaryFile(max_size=2 * 1024 * 1024) + last_read_byte = 0 + + block_blob_client = base_operations.blob_client.create_block_blob_service() + blob = get_blob_from_storage( + block_blob_client, + container_name, + convert_application_name_to_blob_path(application_name), + stream, + start_range=last_read_byte, + ) + + while task.state not in [TaskState.Completed, TaskState.Failed]: + task = base_operations.get_task_from_table(task.id, application_name) #TODO: is this a race condiition? + last_read_byte = blob.properties.content_length + blob = get_blob_from_storage( + block_blob_client, + container_name, + convert_application_name_to_blob_path(application_name), + stream, + start_range=last_read_byte, + ) return models.ApplicationLog( name=application_name, cluster_id=container_name, application_state=task.state, - log=blob.content, + log=stream, total_bytes=blob.properties.content_length, exit_code=task.exit_code, ) -def wait_for_scheduling_target_task(base_operations, cluster_id, application_name): - application_state = base_operations.get_task_state(cluster_id, application_name) - while TaskState(application_state) not in [TaskState.Completed, TaskState.Failed]: - time.sleep(3) - # TODO: enable logger - # log.debug("{} {}: application not yet complete".format(cluster_id, application_name)) - application_state = base_operations.get_task_state(cluster_id, application_name) - return base_operations.get_task_from_table(cluster_id, application_name) +def get_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + cluster_configuration = base_operations.get_cluster_configuration(cluster_id) + + if cluster_configuration.scheduling_target is not models.SchedulingTarget.Any: + task = wait_for_scheduling_target_task(base_operations, cluster_id, application_name) + else: + task = wait_for_batch_task(base_operations, cluster_id, application_name) + return get_log_from_storage(base_operations.blob_client, cluster_id, application_name, task, current_bytes) -def get_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): - job_id = cluster_id - task_id = application_name + +def stream_log(base_operations, cluster_id: str, application_name: str): cluster_configuration = base_operations.get_cluster_configuration(cluster_id) if cluster_configuration.scheduling_target is not models.SchedulingTarget.Any: task = wait_for_scheduling_target_task(base_operations, cluster_id, application_name) - return get_log_from_storage(base_operations.blob_client, cluster_id, application_name, task) else: - task = __wait_for_app_to_be_running(base_operations, cluster_id, application_name) - if not __check_task_node_exist(base_operations.batch_client, cluster_id, task): - return get_log_from_storage(base_operations.blob_client, cluster_id, 
application_name, task) + task = wait_for_batch_task(base_operations, cluster_id, application_name) + + return stream_log_from_storage(base_operations, cluster_id, application_name, task) def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): - try: + with batch_error_manager(): return get_log(base_operations, cluster_id, application_name, tail, current_bytes) - except BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/client/base/helpers/get_recent_job.py b/aztk/client/base/helpers/get_recent_job.py index 9ac089ee..9dd4e945 100644 --- a/aztk/client/base/helpers/get_recent_job.py +++ b/aztk/client/base/helpers/get_recent_job.py @@ -1,14 +1,9 @@ -from azure.batch.models import BatchErrorException - -from aztk import error -from aztk.utils import helpers +from aztk.utils import batch_error_manager # Note: this only works with jobs, not clusters # cluster impl is planned to change to job schedule def get_recent_job(core_job_operations, id): - try: + with batch_error_manager(): job_schedule = core_job_operations.batch_client.job_schedule.get(id) return core_job_operations.batch_client.job.get(job_schedule.execution_info.recent_job.id) - except BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/client/base/helpers/get_remote_login_settings.py b/aztk/client/base/helpers/get_remote_login_settings.py index 95e13a25..f24e2004 100644 --- a/aztk/client/base/helpers/get_remote_login_settings.py +++ b/aztk/client/base/helpers/get_remote_login_settings.py @@ -1,7 +1,5 @@ -from azure.batch.models import BatchErrorException - -from aztk import error, models -from aztk.utils import helpers +from aztk import models +from aztk.utils import batch_error_manager def _get_remote_login_settings(base_client, pool_id: str, node_id: str): @@ -16,7 +14,5 @@ def _get_remote_login_settings(base_client, pool_id: str, node_id: str): def get_remote_login_settings(base_client, cluster_id: str, node_id: str): - try: + with batch_error_manager(): return _get_remote_login_settings(base_client, cluster_id, node_id) - except BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/client/base/helpers/get_task_state.py b/aztk/client/base/helpers/get_task_state.py index 98074a4e..9692587e 100644 --- a/aztk/client/base/helpers/get_task_state.py +++ b/aztk/client/base/helpers/get_task_state.py @@ -1,12 +1,9 @@ -from azure.batch.models import BatchErrorException - -from aztk import error -from aztk.models import SchedulingTarget, TaskState -from aztk.utils import helpers +from aztk.models import SchedulingTarget +from aztk.utils import batch_error_manager def get_task_state(core_cluster_operations, cluster_id: str, task_id: str): - try: + with batch_error_manager(): scheduling_target = core_cluster_operations.get_cluster_configuration(cluster_id).scheduling_target if scheduling_target is not SchedulingTarget.Any: task = core_cluster_operations.get_task_from_table(cluster_id, task_id) @@ -14,5 +11,3 @@ def get_task_state(core_cluster_operations, cluster_id: str, task_id: str): else: task = core_cluster_operations.get_batch_task(cluster_id, task_id) return task.state - except BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/client/base/helpers/run.py b/aztk/client/base/helpers/run.py index c82b9898..3157e26b 100644 --- a/aztk/client/base/helpers/run.py +++ 
b/aztk/client/base/helpers/run.py @@ -1,11 +1,8 @@ import asyncio -from azure.batch.models import BatchErrorException - import aztk.models as models -from aztk import error +from aztk.utils import batch_error_manager from aztk.utils import ssh as ssh_lib -from aztk.utils import helpers def cluster_run(base_operations, cluster_id, command, internal, container_name=None, timeout=None): @@ -15,10 +12,8 @@ def cluster_run(base_operations, cluster_id, command, internal, container_name=N cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] else: cluster_nodes = [(node, base_operations.get_remote_login_settings(pool.id, node.id)) for node in nodes] - try: + with batch_error_manager(): generated_username, ssh_key = base_operations.generate_user_on_cluster(pool.id, nodes) - except BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) try: output = asyncio.get_event_loop().run_until_complete( diff --git a/aztk/client/cluster/helpers/copy.py b/aztk/client/cluster/helpers/copy.py index f0935723..7b1dc595 100644 --- a/aztk/client/cluster/helpers/copy.py +++ b/aztk/client/cluster/helpers/copy.py @@ -2,10 +2,9 @@ from azure.batch.models import BatchErrorException -import aztk.models as models -from aztk import error +from aztk import models +from aztk.utils import batch_error_manager from aztk.utils import ssh as ssh_lib -from aztk.utils import helpers def cluster_copy( @@ -25,10 +24,9 @@ def cluster_copy( else: cluster_nodes = [(node, cluster_operations.get_remote_login_settings(pool.id, node.id)) for node in nodes] - try: + + with batch_error_manager(): generated_username, ssh_key = cluster_operations.generate_user_on_cluster(pool.id, nodes) - except BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) try: output = asyncio.get_event_loop().run_until_complete( diff --git a/aztk/spark/client/base/helpers/list_applications.py b/aztk/spark/client/base/helpers/list_applications.py index 8945a455..81e79cd8 100644 --- a/aztk/spark/client/base/helpers/list_applications.py +++ b/aztk/spark/client/base/helpers/list_applications.py @@ -1,9 +1,5 @@ -import azure.batch.models as batch_models -from azure.batch.models import BatchErrorException - -from aztk import error from aztk.spark import models -from aztk.utils import helpers +from aztk.utils import batch_error_manager def _list_applications(core_operations, id): @@ -17,7 +13,5 @@ def _list_applications(core_operations, id): def list_applications(core_operations, id): - try: + with batch_error_manager(): return models.Application(_list_applications(core_operations, id)) - except BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/copy.py b/aztk/spark/client/cluster/helpers/copy.py index 6f76a409..9d62d836 100644 --- a/aztk/spark/client/cluster/helpers/copy.py +++ b/aztk/spark/client/cluster/helpers/copy.py @@ -1,7 +1,4 @@ -from azure.batch.models import BatchErrorException - -from aztk import error -from aztk.utils import helpers +from aztk.utils import batch_error_manager def cluster_copy( @@ -13,7 +10,7 @@ def cluster_copy( internal: bool = False, timeout: int = None, ): - try: + with batch_error_manager(): container_name = None if host else "spark" return core_cluster_operations.copy( cluster_id, @@ -24,5 +21,3 @@ def cluster_copy( internal=internal, timeout=timeout, ) - except BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) diff 
diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py
index aabfc347..da204f0e 100644
--- a/aztk/spark/client/cluster/helpers/create.py
+++ b/aztk/spark/client/cluster/helpers/create.py
@@ -1,12 +1,10 @@
 import azure.batch.models as batch_models
-from azure.batch.models import BatchErrorException

-from aztk import error
 from aztk import models as base_models
 from aztk.internal.cluster_data import NodeData
 from aztk.spark import models
 from aztk.spark.utils import constants, util
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager

 POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity(
     auto_user=batch_models.AutoUserSpecification(
@@ -43,7 +41,7 @@ def create_cluster(core_cluster_operations,
     cluster_conf.validate()

     cluster_data = core_cluster_operations.get_cluster_data(cluster_conf.cluster_id)
-    try:
+    with batch_error_manager():
         zip_resource_files = None
         node_data = NodeData(cluster_conf).add_core().done()
         zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()
@@ -71,6 +69,3 @@ def create_cluster(core_cluster_operations,
         cluster = spark_cluster_operations.get(cluster.id)

         return cluster
-
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/create_user.py b/aztk/spark/client/cluster/helpers/create_user.py
index d95016a8..b1edf44c 100644
--- a/aztk/spark/client/cluster/helpers/create_user.py
+++ b/aztk/spark/client/cluster/helpers/create_user.py
@@ -1,7 +1,5 @@
-from azure.batch.models import BatchErrorException
-
 from aztk import error
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def create_user(
@@ -12,11 +10,9 @@ def create_user(
     password: str = None,
     ssh_key: str = None,
 ) -> str:
-    try:
+    with batch_error_manager():
         cluster = spark_cluster_operations.get(cluster_id)
         master_node_id = cluster.master_node_id
         if not master_node_id:
             raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.")
         core_cluster_operations.create_user_on_cluster(cluster.id, cluster.nodes, username, ssh_key, password)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/delete.py b/aztk/spark/client/cluster/helpers/delete.py
index 063907a2..be3a775c 100644
--- a/aztk/spark/client/cluster/helpers/delete.py
+++ b/aztk/spark/client/cluster/helpers/delete.py
@@ -1,11 +1,6 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def delete_cluster(core_cluster_operations, cluster_id: str, keep_logs: bool = False):
-    try:
+    with batch_error_manager():
         return core_cluster_operations.delete(cluster_id, keep_logs)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/diagnostics.py b/aztk/spark/client/cluster/helpers/diagnostics.py
index 3343f3c8..208641d6 100644
--- a/aztk/spark/client/cluster/helpers/diagnostics.py
+++ b/aztk/spark/client/cluster/helpers/diagnostics.py
@@ -1,9 +1,7 @@
 import os

-from azure.batch.models import BatchErrorException
-
 from aztk import error
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=False):
@@ -40,8 +38,6 @@ def _build_diagnostic_ssh_command(brief):


 def run_cluster_diagnostics(spark_cluster_operations, cluster_id, output_directory=None, brief=False):
-    try:
+    with batch_error_manager():
         output = _run(spark_cluster_operations, cluster_id, output_directory, brief)
         return output
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/download.py b/aztk/spark/client/cluster/helpers/download.py
index 261640c2..94a9ba8a 100644
--- a/aztk/spark/client/cluster/helpers/download.py
+++ b/aztk/spark/client/cluster/helpers/download.py
@@ -1,7 +1,4 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def cluster_download(
@@ -13,7 +10,7 @@ def cluster_download(
     internal: bool = False,
     timeout: int = None,
 ):
-    try:
+    with batch_error_manager():
         container_name = None if host else "spark"
         return core_cluster_operations.copy(
             cluster_id,
@@ -24,5 +21,3 @@ def cluster_download(
             internal=internal,
             timeout=timeout,
         )
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/get.py b/aztk/spark/client/cluster/helpers/get.py
index 11a7347e..30c4cd11 100644
--- a/aztk/spark/client/cluster/helpers/get.py
+++ b/aztk/spark/client/cluster/helpers/get.py
@@ -1,13 +1,8 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
 from aztk.spark import models
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def get_cluster(core_cluster_operations, cluster_id: str):
-    try:
+    with batch_error_manager():
         cluster = core_cluster_operations.get(cluster_id)
         return models.Cluster(cluster)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/get_application_state.py b/aztk/spark/client/cluster/helpers/get_application_state.py
index e2e06e2e..17d4db9c 100644
--- a/aztk/spark/client/cluster/helpers/get_application_state.py
+++ b/aztk/spark/client/cluster/helpers/get_application_state.py
@@ -1,12 +1,7 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
 from aztk.spark.models import ApplicationState
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def get_application_state(core_cluster_operations, cluster_id: str, app_name: str):
-    try:
+    with batch_error_manager():
         return ApplicationState(core_cluster_operations.get_task_state(cluster_id, app_name).value)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/get_configuration.py b/aztk/spark/client/cluster/helpers/get_configuration.py
index e0c19fb4..a722d188 100644
--- a/aztk/spark/client/cluster/helpers/get_configuration.py
+++ b/aztk/spark/client/cluster/helpers/get_configuration.py
@@ -1,11 +1,6 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def get_configuration(core_cluster_operations, cluster_id: str):
-    try:
+    with batch_error_manager():
         return core_cluster_operations.get_cluster_configuration(cluster_id)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/get_remote_login_settings.py b/aztk/spark/client/cluster/helpers/get_remote_login_settings.py
index d40ccf16..9a5c55ef 100644
--- a/aztk/spark/client/cluster/helpers/get_remote_login_settings.py
+++ b/aztk/spark/client/cluster/helpers/get_remote_login_settings.py
@@ -1,12 +1,7 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
 from aztk.spark import models
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def get_remote_login_settings(core_cluster_operations, id: str, node_id: str):
-    try:
+    with batch_error_manager():
         return models.RemoteLogin(core_cluster_operations.get_remote_login_settings(id, node_id))
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/list.py b/aztk/spark/client/cluster/helpers/list.py
index d893859f..9f2e5589 100644
--- a/aztk/spark/client/cluster/helpers/list.py
+++ b/aztk/spark/client/cluster/helpers/list.py
@@ -1,14 +1,9 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
 from aztk import models as base_models
 from aztk.spark import models
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def list_clusters(core_cluster_operations):
-    try:
+    with batch_error_manager():
         software_metadata_key = base_models.Software.spark
         return [models.Cluster(cluster) for cluster in core_cluster_operations.list(software_metadata_key)]
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/node_run.py b/aztk/spark/client/cluster/helpers/node_run.py
index f32e066a..4ac142b9 100644
--- a/aztk/spark/client/cluster/helpers/node_run.py
+++ b/aztk/spark/client/cluster/helpers/node_run.py
@@ -1,7 +1,4 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def node_run(
@@ -14,7 +11,7 @@ def node_run(
     timeout=None,
     block=False,
 ):
-    try:
+    with batch_error_manager():
         return core_cluster_operations.node_run(
             cluster_id,
             node_id,
@@ -23,5 +20,3 @@ def node_run(
             container_name="spark" if not host else None,
             timeout=timeout,
             block=block)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/run.py b/aztk/spark/client/cluster/helpers/run.py
index b5f36134..c1c7bf9d 100644
--- a/aztk/spark/client/cluster/helpers/run.py
+++ b/aztk/spark/client/cluster/helpers/run.py
@@ -1,7 +1,4 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def cluster_run(core_cluster_operations,
@@ -10,8 +7,6 @@ def cluster_run(core_cluster_operations,
                 host=False,
                 internal: bool = False,
                 timeout=None):
-    try:
+    with batch_error_manager():
         return core_cluster_operations.run(
             cluster_id, command, internal, container_name="spark" if not host else None, timeout=timeout)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/ssh_into_master.py b/aztk/spark/client/cluster/helpers/ssh_into_master.py
index e79f2c31..1417b078 100644
--- a/aztk/spark/client/cluster/helpers/ssh_into_master.py
+++ b/aztk/spark/client/cluster/helpers/ssh_into_master.py
@@ -1,7 +1,4 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def ssh_into_master(
@@ -14,9 +11,7 @@ def ssh_into_master(
     port_forward_list=None,
     internal=False,
 ):
-    try:
+    with batch_error_manager():
         master_node_id = spark_cluster_operations.get(cluster_id).master_node_id
         core_cluster_operations.ssh_into_node(cluster_id, master_node_id, username, ssh_key, password,
                                               port_forward_list, internal)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/submit.py b/aztk/spark/client/cluster/helpers/submit.py
index 7a4b97e2..4d969faa 100644
--- a/aztk/spark/client/cluster/helpers/submit.py
+++ b/aztk/spark/client/cluster/helpers/submit.py
@@ -1,11 +1,9 @@
 import azure.batch.models as batch_models
 import yaml
-from azure.batch.models import BatchErrorException

-from aztk import error
 from aztk.error import AztkError
 from aztk.spark import models
-from aztk.utils import constants, helpers
+from aztk.utils import batch_error_manager, constants, helpers


 def __get_node(core_cluster_operations, node_id: str, cluster_id: str) -> batch_models.ComputeNode:
@@ -119,8 +117,7 @@ def submit(
     wait: bool = False,
     internal: bool = False,
 ):
-    try:
+    with batch_error_manager():
+        submit_application(core_cluster_operations, spark_cluster_operations, cluster_id, application, remote, wait,
                            internal)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/cluster/helpers/wait.py b/aztk/spark/client/cluster/helpers/wait.py
index 95b13efa..a572177f 100644
--- a/aztk/spark/client/cluster/helpers/wait.py
+++ b/aztk/spark/client/cluster/helpers/wait.py
@@ -5,7 +5,6 @@


 def wait_for_application_to_complete(core_cluster_operations, id, application_name):
-    try:
+    from aztk.utils import batch_error_manager
+    with batch_error_manager():
         return core_cluster_operations.wait(id, application_name)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/job/helpers/delete.py b/aztk/spark/client/job/helpers/delete.py
index 25b77f92..89c29924 100644
--- a/aztk/spark/client/job/helpers/delete.py
+++ b/aztk/spark/client/job/helpers/delete.py
@@ -29,7 +29,6 @@ def _delete(core_job_operations, spark_job_operations, job_id, keep_logs: bool =

 @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
 def delete(core_job_operations, spark_job_operations, job_id: str, keep_logs: bool = False):
-    try:
+    from aztk.utils import batch_error_manager
+    with batch_error_manager():
         return _delete(core_job_operations, spark_job_operations, job_id, keep_logs)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/job/helpers/get.py b/aztk/spark/client/job/helpers/get.py
index 0b904410..490db706 100644
--- a/aztk/spark/client/job/helpers/get.py
+++ b/aztk/spark/client/job/helpers/get.py
@@ -21,8 +21,8 @@ def _get_job(core_job_operations, job_id):


 def get_job(core_job_operations, job_id):
-    try:
+    from aztk.utils import batch_error_manager
+    with batch_error_manager():
         job, tasks, pool, nodes = _get_job(core_job_operations, job_id)
         return models.Job(job, tasks, pool, nodes)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
+
diff --git a/aztk/spark/client/job/helpers/get_application.py b/aztk/spark/client/job/helpers/get_application.py
index cdcc6ee3..563df994 100644
--- a/aztk/spark/client/job/helpers/get_application.py
+++ b/aztk/spark/client/job/helpers/get_application.py
@@ -20,7 +20,6 @@ def _get_application(core_operations, job_id, application_name):


 def get_application(core_operations, job_id, application_name):
-    try:
+    from aztk.utils import batch_error_manager
+    with batch_error_manager():
         return models.Application(_get_application(core_operations, job_id, application_name))
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/job/helpers/get_application_log.py b/aztk/spark/client/job/helpers/get_application_log.py
index b4f23112..5db3479e 100644
--- a/aztk/spark/client/job/helpers/get_application_log.py
+++ b/aztk/spark/client/job/helpers/get_application_log.py
@@ -38,8 +38,7 @@ def _get_application_log(core_job_operations, spark_job_operations, job_id, appl


 def get_job_application_log(core_job_operations, spark_job_operations, job_id, application_name):
-    try:
+    from aztk.utils import batch_error_manager
+    with batch_error_manager():
         return models.ApplicationLog(
             _get_application_log(core_job_operations, spark_job_operations, job_id, application_name))
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/job/helpers/list.py b/aztk/spark/client/job/helpers/list.py
index 3551fdd3..4592b19d 100644
--- a/aztk/spark/client/job/helpers/list.py
+++ b/aztk/spark/client/job/helpers/list.py
@@ -10,7 +10,6 @@ def _list_jobs(core_job_operations):


 def list_jobs(core_job_operations):
-    try:
+    from aztk.utils import batch_error_manager
+    with batch_error_manager():
         return [models.Job(cloud_job_schedule) for cloud_job_schedule in _list_jobs(core_job_operations)]
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/job/helpers/list_applications.py b/aztk/spark/client/job/helpers/list_applications.py
index ed7a75ec..72578979 100644
--- a/aztk/spark/client/job/helpers/list_applications.py
+++ b/aztk/spark/client/job/helpers/list_applications.py
@@ -26,11 +26,10 @@ def _list_applications(core_job_operations, job_id):
 # currently, it returns a dictionary indicating whether
 # a task has been scheduled or not
 def list_applications(core_job_operations, job_id):
-    try:
+    from aztk.utils import batch_error_manager
+    with batch_error_manager():
         applications = _list_applications(core_job_operations, job_id)
         for item in applications:
             if applications[item]:
                 applications[item] = models.Application(applications[item])
         return applications
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/job/helpers/stop.py b/aztk/spark/client/job/helpers/stop.py
index 1d961915..efce8c44 100644
--- a/aztk/spark/client/job/helpers/stop.py
+++ b/aztk/spark/client/job/helpers/stop.py
@@ -1,7 +1,4 @@
-from azure.batch.models import BatchErrorException
-
-from aztk import error
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def _stop(core_job_operations, job_id):
@@ -13,7 +10,5 @@ def _stop(core_job_operations, job_id):


 def stop(self, job_id):
-    try:
+    with batch_error_manager():
         return _stop(self, job_id)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py
index 35a5c980..804c1bb5 100644
--- a/aztk/spark/client/job/helpers/submit.py
+++ b/aztk/spark/client/job/helpers/submit.py
@@ -1,13 +1,12 @@
 import azure.batch.models as batch_models
 import yaml
-from azure.batch.models import BatchErrorException

 from aztk import error
 from aztk import models as base_models
 from aztk.internal.cluster_data import NodeData
 from aztk.spark import models
 from aztk.spark.models import SchedulingTarget
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager, helpers
 from aztk.utils.command_builder import CommandBuilder


@@ -78,7 +77,7 @@ def submit_job(core_job_operations,
               spark_job_operations,
               job_configuration: models.JobConfiguration,
               wait: bool = False):
-    try:
+    with batch_error_manager():
         job_configuration = _apply_default_for_job_config(job_configuration)
         job_configuration.validate()
         cluster_data = core_job_operations.get_cluster_data(job_configuration.id)
@@ -126,6 +125,3 @@ def submit_job(core_job_operations,
             spark_job_operations.wait(id=job_configuration.id)

         return models.Job(job)
-
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/spark/client/job/helpers/wait_until_complete.py b/aztk/spark/client/job/helpers/wait_until_complete.py
index 2ef44aa9..e3036088 100644
--- a/aztk/spark/client/job/helpers/wait_until_complete.py
+++ b/aztk/spark/client/job/helpers/wait_until_complete.py
@@ -1,10 +1,8 @@
 import time

 import azure.batch.models as batch_models
-from azure.batch.models import BatchErrorException

-from aztk import error
-from aztk.utils import helpers
+from aztk.utils import batch_error_manager


 def _wait_until_job_finished(core_job_operations, job_id):
@@ -16,7 +14,5 @@ def _wait_until_job_finished(core_job_operations, job_id):


 def wait_until_job_finished(core_job_operations, job_id):
-    try:
+    with batch_error_manager():
         _wait_until_job_finished(core_job_operations, job_id)
-    except BatchErrorException as e:
-        raise error.AztkError(helpers.format_batch_exception(e))
diff --git a/aztk/utils/__init__.py b/aztk/utils/__init__.py
index 9b640a3d..8a225f98 100644
--- a/aztk/utils/__init__.py
+++ b/aztk/utils/__init__.py
@@ -1,4 +1,5 @@
 from . import (azure_api, command_builder, constants, file_utils, get_ssh_key, helpers, secure_utils)
+from .batch_error_manager import batch_error_manager
 from .deprecation import deprecate, deprecated
 from .retry import BackOffPolicy, retry
 from .try_func import try_func
diff --git a/aztk/utils/batch_error_manager.py b/aztk/utils/batch_error_manager.py
new file mode 100644
index 00000000..3ae0793f
--- /dev/null
+++ b/aztk/utils/batch_error_manager.py
@@ -0,0 +1,14 @@
+from contextlib import contextmanager
+
+from azure.batch.models import BatchErrorException
+
+from aztk import error
+from aztk.utils import constants, helpers
+
+
+@contextmanager
+def batch_error_manager():
+    try:
+        yield
+    except BatchErrorException as e:
+        raise error.AztkError(helpers.format_batch_exception(e))
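The batch_error_manager context manager above centralizes the BatchErrorException-to-AztkError translation that every helper in this patch previously duplicated as a try/except pair. A minimal, SDK-free sketch of the same pattern follows; the exception names here are stand-ins, not aztk or Azure types:

    from contextlib import contextmanager

    class UpstreamError(Exception):
        """Stand-in for BatchErrorException."""

    class FriendlyError(Exception):
        """Stand-in for error.AztkError."""

    @contextmanager
    def error_manager():
        # Re-raise any upstream failure as the library's public error type.
        try:
            yield
        except UpstreamError as e:
            raise FriendlyError("formatted: {}".format(e)) from e

    def fetch():
        raise UpstreamError("403: auth failed")

    # Callers wrap a block instead of repeating try/except at every call site.
    try:
        with error_manager():
            fetch()
    except FriendlyError as e:
        print(e)  # formatted: 403: auth failed

Because the manager simply re-yields, return values and other exceptions pass through unchanged, which is why the helpers can switch from "try:" to "with batch_error_manager():" without touching their bodies.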
From 9ad56c728b2eab897edbc18c7810aa29f0698979 Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Mon, 5 Nov 2018 13:57:53 -0800
Subject: [PATCH 11/28] fix type error

---
 aztk/client/base/helpers/get_application_log.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py
index 51a2e46a..4b3c826b 100644
--- a/aztk/client/base/helpers/get_application_log.py
+++ b/aztk/client/base/helpers/get_application_log.py
@@ -31,7 +31,7 @@ def wait_for_batch_task(base_operations, cluster_id: str, application_name: str)
 def wait_for_scheduling_target_task(base_operations, cluster_id, application_name):
     # TODO: ensure get_task_state not None or throw
     task = base_operations.get_task_from_table(cluster_id, application_name)
-    while task.state not in [TaskState.Completed, TaskState.Failed, Task.Running]:
+    while task.state not in [TaskState.Completed, TaskState.Failed, TaskState.Running]:
         time.sleep(3)
         # TODO: enable logger
         # log.debug("{} {}: application not yet complete".format(cluster_id, application_name))

From 1a06d64f7e6e46749cc85d3c9cd75ccd9060984c Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Mon, 5 Nov 2018 13:58:28 -0800
Subject: [PATCH 12/28] remove unused import

---
 aztk/spark/client/job/helpers/submit.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py
index 804c1bb5..084ad1c2 100644
--- a/aztk/spark/client/job/helpers/submit.py
+++ b/aztk/spark/client/job/helpers/submit.py
@@ -1,7 +1,6 @@
 import azure.batch.models as batch_models
 import yaml

-from aztk import error
 from aztk import models as base_models
 from aztk.internal.cluster_data import NodeData
 from aztk.spark import models

From 0aa8733690f24b32ce0828c8141a7b735e78ab12 Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Mon, 5 Nov 2018 16:34:23 -0800
Subject: [PATCH 13/28] update cli and fix size 0 log get

---
 .../client/base/helpers/get_application_log.py | 18 ++++++++++++++----
 .../endpoints/cluster/cluster_app_logs.py      |  4 ++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py
index 4b3c826b..79e6f328 100644
--- a/aztk/client/base/helpers/get_application_log.py
+++ b/aztk/client/base/helpers/get_application_log.py
@@ -41,20 +41,28 @@ def wait_for_scheduling_target_task(base_operations, cluster_id, application_nam
 def get_blob_from_storage(block_blob_client, container_name, application_name, stream, start_range):
     try:
-        return block_blob_client.get_blob_to_stream(
+        blob = block_blob_client.get_blob_to_stream(
             container_name,
             convert_application_name_to_blob_path(application_name),
             stream,
             start_range=start_range,
         )
+        stream.seek(0)
+        return blob
     except azure.common.AzureMissingResourceHttpError:
         raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.")
+    except azure.common.AzureHttpError as e:
+        if e.error_code == "InvalidRange":
+            # the blob has no data, should not throw here
+            raise error.AztkError("The application {} log has no data yet.".format(application_name))
+        raise


 def get_log_from_storage(blob_client, container_name, application_name, task, current_bytes):
-    stream = tempfile.SpooledTemporaryFile(max_size=2 * 1024 * 1024)
-    blob = get_blob_from_storage(blob_client.create_block_blob_service, container_name, application_name, stream,
+    stream = tempfile.TemporaryFile()
+    blob = get_blob_from_storage(blob_client.create_block_blob_service(), container_name, application_name, stream,
                                  current_bytes)
+    stream.seek(0)
     return models.ApplicationLog(
         name=application_name,
         cluster_id=container_name,
@@ -86,7 +94,7 @@ def stream_log_from_storage(base_operations, container_name, application_name, t
     )

     while task.state not in [TaskState.Completed, TaskState.Failed]:
-        task = base_operations.get_task_from_table(task.id, application_name)    #TODO: is this a race condiition?
+        task = base_operations.get_task_from_table(task.id, application_name)
         last_read_byte = blob.properties.content_length
         blob = get_blob_from_storage(
             block_blob_client,
@@ -96,6 +104,8 @@ def stream_log_from_storage(base_operations, container_name, application_name, t
             start_range=last_read_byte,
         )

+    stream.seek(0)
+
     return models.ApplicationLog(
         name=application_name,
         cluster_id=container_name,
diff --git a/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py b/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py
index 927a6719..8aacde85 100644
--- a/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py
+++ b/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py
@@ -3,7 +3,7 @@ import typing

 import aztk
-from aztk_cli import config, utils, log
+from aztk_cli import config, log, utils


 def setup_parser(parser: argparse.ArgumentParser):
@@ -32,4 +32,4 @@ def execute(args: typing.NamedTuple):
         with open(os.path.abspath(os.path.expanduser(args.output)), "w", encoding="UTF-8") as f:
             f.write(app_log.log)
     else:
-        log.print(app_log.log)
+        log.print(app_log.log.read().decode('utf-8'))

From aabebd1d10f17b0141e1214d90aad41f6fef3020 Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Mon, 5 Nov 2018 17:18:15 -0800
Subject: [PATCH 14/28] change default max_block_size

---
 aztk/node_scripts/scheduling/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aztk/node_scripts/scheduling/common.py b/aztk/node_scripts/scheduling/common.py
index 3d5ba941..c5154736 100644
--- a/aztk/node_scripts/scheduling/common.py
+++ b/aztk/node_scripts/scheduling/common.py
@@ -118,7 +118,7 @@ def stream_upload_to_storage(
         application_name (`str`): the name of the application to uploads logs for
     """
     from azure.storage.blob.appendblobservice import AppendBlobService
-    AppendBlobService.MAX_BLOCK_SIZE = 10 * 1024
+    AppendBlobService.MAX_BLOCK_SIZE = 1024 * 1024

     append_blob_client = blob_client.create_append_blob_service()
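The "size 0 log get" fix in patch 13 exists because a ranged read of a blob that has no content yet fails with HTTP 416 (error code "InvalidRange") rather than returning an empty stream. A self-contained sketch of the same guard, with a hypothetical download_range standing in for get_blob_to_stream:

    import io

    class HttpError(Exception):
        """Stand-in for azure.common.AzureHttpError; real ones carry error_code."""
        def __init__(self, error_code):
            super().__init__(error_code)
            self.error_code = error_code

    def download_range(blob_size, stream, start_range):
        # Hypothetical stand-in for a ranged blob download: reading at or past
        # the end of the blob raises InvalidRange (HTTP 416).
        if start_range >= blob_size:
            raise HttpError("InvalidRange")
        stream.write(b"x" * (blob_size - start_range))

    def read_log_chunk(blob_size, start_range):
        stream = io.BytesIO()
        try:
            download_range(blob_size, stream, start_range)
        except HttpError as e:
            if e.error_code == "InvalidRange":
                # Empty blob or nothing new yet: "no data", not a failure.
                return b""
            raise
        stream.seek(0)  # rewind so the caller reads from the beginning
        return stream.read()

    print(read_log_chunk(blob_size=0, start_range=0))   # b'' instead of an exception
    print(read_log_chunk(blob_size=10, start_range=4))  # b'xxxxxx'

The stream.seek(0) calls added throughout patch 13 serve the same purpose as the one in the sketch: get_blob_to_stream leaves the file position at the end of what it wrote, so the consumer would otherwise read nothing.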
From 4fc6eb0703a330575126310a8030b84580fcb7f8 Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Tue, 6 Nov 2018 15:04:12 -0800
Subject: [PATCH 15/28] refactor get_application_log, task abstraction

---
 aztk/client/base/base_operations.py            | 48 +++-------------
 .../base/helpers/get_application_log.py        | 57 ++++++++++++-------
 aztk/client/base/helpers/get_task.py           | 22 +++++++
 aztk/client/base/helpers/get_task_state.py     |  2 +-
 aztk/client/base/helpers/list_tasks.py         | 13 +++--
 .../scheduling/scheduling_target.py            |  2 +-
 .../client/base/helpers/list_applications.py   | 12 +---
 aztk/spark/client/base/operations.py           |  2 +-
 .../client/job/helpers/get_application.py      |  7 +--
 .../client/job/helpers/get_application_log.py  |  9 ++-
 aztk/utils/constants.py                        |  4 ++
 11 files changed, 87 insertions(+), 91 deletions(-)
 create mode 100644 aztk/client/base/helpers/get_task.py

diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py
index 5dca32ce..13525ad9 100644
--- a/aztk/client/base/base_operations.py
+++ b/aztk/client/base/base_operations.py
@@ -3,7 +3,8 @@
 from .helpers import (create_user_on_cluster, create_user_on_node, delete_user_on_cluster, delete_user_on_node,
                       generate_user_on_cluster, generate_user_on_node, get_application_log, get_recent_job,
-                      get_remote_login_settings, get_task_state, list_tasks, node_run, run, ssh_into_node, task_table)
+                      get_remote_login_settings, get_task, get_task_state, list_tasks, node_run, run, ssh_into_node,
+                      task_table)


 class BaseOperations:
@@ -234,28 +235,6 @@ def create_task_table(self, id: str):
         """
         return task_table.create_task_table(self.table_service, id)

-    def list_task_table_entries(self, id):
-        """list tasks in a storage table
-
-        Args:
-            id (:obj:`str`): the id of the cluster
-
-        Returns:
-            :obj:`[aztk.models.Task]`: a list of models representing all entries in the Task table
-        """
-        return task_table.list_task_table_entries(self.table_service, id)
-
-    def get_task_from_table(self, id, task_id):
-        """Create a storage table to track tasks
-
-        Args:
-            id (:obj:`str`): the id of the cluster
-
-        Returns:
-            :obj:`[aztk.models.Task]`: the task with id task_id from the cluster's storage table
-        """
-        return task_table.get_task_from_table(self.table_service, id, task_id)
-
     def insert_task_into_task_table(self, id, task):
         """Insert a task into the table
@@ -322,25 +301,16 @@ def get_task_state(self, id: str, task_name: str):
         """
         return get_task_state.get_task_state(self, id, task_name)

-    def list_batch_tasks(self, id: str):
-        """Get the status of a submitted task
-
-        Args:
-            id (:obj:`str`): the name of the cluster the task was submitted to
-
-        Returns:
-            :obj:`[aztk.models.Task]`: list of aztk tasks
-        """
-        return task_table.list_batch_tasks(self.batch_client, id)
+    def get_task(self, id: str, task_id: str):
+        """Get a task submitted to a cluster

-    def get_batch_task(self, id: str, task_id: str):
-        """Get the status of a submitted task
-
-        Args:
-            id (:obj:`str`): the name of the cluster the task was submitted to
-            task_id (:obj:`str`): the name of the task to get
-
-        Returns:
-            :obj:`aztk.models.Task`: aztk Task representing the Batch Task
-        """
-        return task_table.get_batch_task(self.batch_client, id, task_id)
+        Args:
+            id (:obj:`str`): the id of the cluster
+
+        Returns:
+            :obj:`[aztk.models.Task]`: the submitted task with id task_id
+        """
+        return get_task.get_task(self, id, task_id)
+
+    def update_task(self, id: str, task_id: str):
+        pass
diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py
index 79e6f328..9bbadf89 100644
--- a/aztk/client/base/helpers/get_application_log.py
+++ b/aztk/client/base/helpers/get_application_log.py
@@ -30,29 +30,49 @@ def wait_for_batch_task(base_operations, cluster_id: str, application_name: str)
 def wait_for_scheduling_target_task(base_operations, cluster_id, application_name):
     # TODO: ensure get_task_state not None or throw
-    task = base_operations.get_task_from_table(cluster_id, application_name)
+    task = base_operations.get_task(cluster_id, application_name)
     while task.state not in [TaskState.Completed, TaskState.Failed, TaskState.Running]:
         time.sleep(3)
         # TODO: enable logger
         # log.debug("{} {}: application not yet complete".format(cluster_id, application_name))
-        task = base_operations.get_task_from_table(cluster_id, application_name)
+        task = base_operations.get_task(cluster_id, application_name)
     return task


+def wait_for_task(base_operations, cluster_id: str, application_name: str, cluster_configuration):
+    if cluster_configuration.scheduling_target is not models.SchedulingTarget.Any:
+        task = wait_for_scheduling_target_task(base_operations, cluster_id, application_name)
+    else:
+        task = wait_for_batch_task(base_operations, cluster_id, application_name)
+    return task
+
+
-def get_blob_from_storage(block_blob_client, container_name, application_name, stream, start_range):
+def get_blob_from_storage(block_blob_client, container_name, application_name, stream, start_range, end_range=None):
+    print(block_blob_client, container_name, application_name, stream, start_range, end_range)
+    previous = 0
+
+    def download_callback(current, total):
+        nonlocal previous
+        stream.seek(previous)
+        print("({}/{})".format(previous, current))
+        # print(stream.read().decode('utf-8'))  # SDK SHOULDN'T PRINT
+        previous = current
+
     try:
         blob = block_blob_client.get_blob_to_stream(
             container_name,
             convert_application_name_to_blob_path(application_name),
             stream,
+            progress_callback=download_callback,
             start_range=start_range,
-        )
+            end_range=end_range)
         stream.seek(0)
         return blob
     except azure.common.AzureMissingResourceHttpError:
         raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.")
     except azure.common.AzureHttpError as e:
-        if e.error_code == "InvalidRange":
+        if e.error_code in ["InvalidRange"]:
             # the blob has no data, should not throw here
             raise error.AztkError("The application {} log has no data yet.".format(application_name))
         raise
@@ -62,7 +82,6 @@ def get_log_from_storage(blob_client, container_name, application_name, task, cu
     stream = tempfile.TemporaryFile()
     blob = get_blob_from_storage(blob_client.create_block_blob_service(), container_name, application_name, stream,
                                  current_bytes)
-    stream.seek(0)
     return models.ApplicationLog(
         name=application_name,
         cluster_id=container_name,
@@ -81,25 +100,27 @@ def stream_log_from_storage(base_operations, container_name, application_name, t
         application_name (:obj:`str`): the name of the application to get logs for
         task (:obj:`aztk.models.Task`): the aztk task for this application
     """
-    stream = tempfile.SpooledTemporaryFile(max_size=2 * 1024 * 1024)
+    stream = tempfile.TemporaryFile()
     last_read_byte = 0

     block_blob_client = base_operations.blob_client.create_block_blob_service()
     blob = get_blob_from_storage(
         block_blob_client,
         container_name,
-        convert_application_name_to_blob_path(application_name),
+        application_name,
         stream,
         start_range=last_read_byte,
+        end_range=last_read_byte + constants.STREAMING_DOWNLOAD_CHUNK_SIZE,
     )

     while task.state not in [TaskState.Completed, TaskState.Failed]:
-        task = base_operations.get_task_from_table(task.id, application_name)
+        print(container_name, task.id)
+        task = base_operations.get_task(container_name, task.id)
         last_read_byte = blob.properties.content_length
         blob = get_blob_from_storage(
             block_blob_client,
             container_name,
-            convert_application_name_to_blob_path(application_name),
+            application_name,
             stream,
             start_range=last_read_byte,
         )

+    stream.seek(0)
+
     return models.ApplicationLog(
         name=application_name,
         cluster_id=container_name,
@@ -118,26 +139,18 @@ def stream_log_from_storage(base_operations, container_name, application_name, t

 def get_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
     cluster_configuration = base_operations.get_cluster_configuration(cluster_id)
-
-    if cluster_configuration.scheduling_target is not models.SchedulingTarget.Any:
-        task = wait_for_scheduling_target_task(base_operations, cluster_id, application_name)
-    else:
-        task = wait_for_batch_task(base_operations, cluster_id, application_name)
+    task = wait_for_task(base_operations, cluster_id, application_name, cluster_configuration)

     return get_log_from_storage(base_operations.blob_client, cluster_id, application_name, task, current_bytes)


 def stream_log(base_operations, cluster_id: str, application_name: str):
     cluster_configuration = base_operations.get_cluster_configuration(cluster_id)
-
-    if cluster_configuration.scheduling_target is not models.SchedulingTarget.Any:
-        task = wait_for_scheduling_target_task(base_operations, cluster_id, application_name)
-    else:
-        task = wait_for_batch_task(base_operations, cluster_id, application_name)
-
+    task = wait_for_task(base_operations, cluster_id, application_name, cluster_configuration)
     return stream_log_from_storage(base_operations, cluster_id, application_name, task)


 def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
     with batch_error_manager():
-        return get_log(base_operations, cluster_id, application_name, tail, current_bytes)
+        # return get_log(base_operations, cluster_id, application_name, tail, current_bytes)
+        return stream_log(base_operations, cluster_id, application_name)
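stream_log_from_storage above is a tail -f style loop: read a chunk, remember how far the blob read got (blob.properties.content_length), re-poll the task, and fetch again from the last offset until the task reaches a terminal state. A condensed, SDK-free sketch of that loop with generic names (get_state and read_from are stand-ins for the task lookup and the ranged blob download):

    import time

    TERMINAL = {"completed", "failed"}

    def tail_log(get_state, read_from, poll_interval=3):
        """Poll read_from(offset) -> bytes until get_state() is terminal."""
        offset = 0
        chunks = []
        while True:
            data = read_from(offset)
            if data:
                offset += len(data)  # same role as blob.properties.content_length
                chunks.append(data)
            if get_state() in TERMINAL:
                break
            time.sleep(poll_interval)
        # One final read, in case output landed between the last poll and completion.
        chunks.append(read_from(offset))
        return b"".join(chunks)

    log = b"line 1\nline 2\n"
    state = iter(["running", "completed"])
    print(tail_log(lambda: next(state), lambda off: log[off:off + 8], poll_interval=0))
    # b'line 1\nline 2\n'

Note the final read after the loop: without it, anything written between the last fetch and the terminal state check would be dropped, which is the race the aztk code still has to close.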
diff --git a/aztk/client/base/helpers/get_task.py b/aztk/client/base/helpers/get_task.py
new file mode 100644
index 00000000..3f64894c
--- /dev/null
+++ b/aztk/client/base/helpers/get_task.py
@@ -0,0 +1,22 @@
+from aztk.models import SchedulingTarget
+
+from .get_recent_job import get_recent_job
+from .task_table import get_batch_task, get_task_from_table
+
+
+def get_task(core_base_operations, id, task_id):
+    """Get a single task from a job or cluster
+
+    This will work for both Batch scheduling and scheduling_target
+
+    Args:
+        id: cluster or job id
+    Returns:
+        aztk.models.Task
+
+    """
+    scheduling_target = core_base_operations.get_cluster_configuration(id).scheduling_target
+    if scheduling_target is not SchedulingTarget.Any:
+        return get_task_from_table(core_base_operations.table_service, id, task_id)
+    else:
+        return get_batch_task(core_base_operations.batch_client, id, task_id)
diff --git a/aztk/client/base/helpers/get_task_state.py b/aztk/client/base/helpers/get_task_state.py
index 9692587e..c0069a23 100644
--- a/aztk/client/base/helpers/get_task_state.py
+++ b/aztk/client/base/helpers/get_task_state.py
@@ -6,7 +6,7 @@ def get_task_state(core_cluster_operations, cluster_id: str, task_id: str):
     with batch_error_manager():
         scheduling_target = core_cluster_operations.get_cluster_configuration(cluster_id).scheduling_target
         if scheduling_target is not SchedulingTarget.Any:
-            task = core_cluster_operations.get_task_from_table(cluster_id, task_id)
+            task = core_cluster_operations.get_task(cluster_id, task_id)
             return task.state
         else:
             task = core_cluster_operations.get_batch_task(cluster_id, task_id)
diff --git a/aztk/client/base/helpers/list_tasks.py b/aztk/client/base/helpers/list_tasks.py
index 969c820b..8dec0583 100644
--- a/aztk/client/base/helpers/list_tasks.py
+++ b/aztk/client/base/helpers/list_tasks.py
@@ -1,7 +1,9 @@
+from azure.batch.models import BatchErrorException
+
 from aztk.models import SchedulingTarget

 from .get_recent_job import get_recent_job
-from .task_table import list_task_table_entries
+from .task_table import list_batch_tasks, list_task_table_entries


 def list_tasks(core_base_operations, id):
@@ -19,8 +21,9 @@ def list_tasks(core_base_operations, id):
     if scheduling_target is not SchedulingTarget.Any:
         return list_task_table_entries(core_base_operations.table_service, id)
     else:
-        # note: this currently only works for job_schedules
-        # cluster impl is planned to move to job schedules
-        recent_run_job = get_recent_job(core_base_operations, id)
-        tasks = core_base_operations.list_batch_tasks(id=recent_run_job.id)
+        try:
+            recent_run_job = get_recent_job(core_base_operations, id)
+            tasks = list_batch_tasks(batch_client=core_base_operations.batch_client, id=recent_run_job.id)
+        except BatchErrorException:
+            tasks = list_batch_tasks(batch_client=core_base_operations.batch_client, id=id)
         return tasks
diff --git a/aztk/node_scripts/scheduling/scheduling_target.py b/aztk/node_scripts/scheduling/scheduling_target.py
index 602561e4..24f49420 100644
--- a/aztk/node_scripts/scheduling/scheduling_target.py
+++ b/aztk/node_scripts/scheduling/scheduling_target.py
@@ -75,7 +75,7 @@ def insert_task_into_task_table(cluster_id, task_definition):


 def get_task(cluster_id, task_id):
-    return config.spark_client.cluster._core_cluster_operations.get_task_from_table(cluster_id, task_id)
+    return config.spark_client.cluster._core_cluster_operations.get_task(cluster_id, task_id)


 def mark_task_complete(cluster_id, task_id, exit_code):
diff --git a/aztk/spark/client/base/helpers/list_applications.py b/aztk/spark/client/base/helpers/list_applications.py
index 81e79cd8..c5dafd5c 100644
--- a/aztk/spark/client/base/helpers/list_applications.py
+++ b/aztk/spark/client/base/helpers/list_applications.py
@@ -2,16 +2,6 @@
 from aztk.utils import batch_error_manager


-def _list_applications(core_operations, id):
-    # info about the app
-    scheduling_target = core_operations.get_cluster_configuration(id).scheduling_target
-    if scheduling_target is not models.SchedulingTarget.Any:
-        return models.Application(core_operations.list_applications(id))
-
-    recent_run_job = core_operations.get_recent_job(id)
-    return core_operations.list_batch_tasks(id=recent_run_job.id)
-
-
 def list_applications(core_operations, id):
     with batch_error_manager():
-        return models.Application(_list_applications(core_operations, id))
+        return models.Application(core_operations.list_tasks(id))
diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py
index d2d428b1..0cfc460e 100644
--- a/aztk/spark/client/base/operations.py
+++ b/aztk/spark/client/base/operations.py
@@ -80,6 +80,6 @@ def list_applications(self, id):
             application_name (:obj:`str`): the name of the application to get

         Returns:
-            :obj:`aztk.spark.models.Application`: object representing that state and output of an application
+            :obj:`List[aztk.spark.models.Application]`: A list of the applications submitted in the job
         """
         return list_applications.list_applications(self, id)
diff --git a/aztk/spark/client/job/helpers/get_application.py b/aztk/spark/client/job/helpers/get_application.py
index 563df994..481635d0 100644
--- a/aztk/spark/client/job/helpers/get_application.py
+++ b/aztk/spark/client/job/helpers/get_application.py
@@ -7,13 +7,8 @@


 def _get_application(core_operations, job_id, application_name):
-    # info about the app
-    recent_run_job = core_operations.get_recent_job(job_id)
-    scheduling_target = core_operations.get_cluster_configuration(job_id).scheduling_target
-    if scheduling_target is not models.SchedulingTarget.Any:
-        return core_operations.get_task_from_table(job_id, application_name)
     try:
-        return core_operations.get_batch_task(id=recent_run_job.id, task_id=application_name)
+        return core_operations.get_task(id=job_id, task_id=application_name)
     except batch_models.BatchErrorException:
         raise error.AztkError(
             "The Spark application {0} is still being provisioned or does not exist.".format(application_name))
diff --git a/aztk/spark/client/job/helpers/get_application_log.py b/aztk/spark/client/job/helpers/get_application_log.py
index 5db3479e..f5d1ac2f 100644
--- a/aztk/spark/client/job/helpers/get_application_log.py
+++ b/aztk/spark/client/job/helpers/get_application_log.py
@@ -1,9 +1,8 @@
 import azure.batch.models as batch_models
-from azure.batch.models import BatchErrorException

 from aztk import error
+from aztk.models import TaskState
 from aztk.spark import models
-from aztk.utils import helpers


 def _get_application_log(core_job_operations, spark_job_operations, job_id, application_name):
@@ -28,9 +27,9 @@ def _get_application_log(core_job_operations, spark_job_operations, job_id, appl
             raise error.AztkError("The application {0} does not exist".format(application_name))
     else:
         if task.state in (
-                batch_models.TaskState.active,
-                batch_models.TaskState.running,
-                batch_models.TaskState.preparing,
+                TaskState.active,
+                TaskState.running,
+                TaskState.preparing,
         ):
             raise error.AztkError("The application {0} has not yet finished executing.".format(application_name))
diff --git a/aztk/utils/constants.py b/aztk/utils/constants.py
index f4096615..67bcc55d 100644
--- a/aztk/utils/constants.py
+++ b/aztk/utils/constants.py
@@ -65,3 +65,7 @@
 TASK_WORKING_DIR = "wd"
 SPARK_SUBMIT_LOGS_FILE = "output.log"
+"""
+    log streaming
+"""
+STREAMING_DOWNLOAD_CHUNK_SIZE = 1048576    # 1024 * 1024
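Patch 15's get_task abstraction hides where task state lives: an Azure Storage table when a scheduling target is set, the Batch service otherwise. The same dispatch pattern in a self-contained sketch, with plain dictionaries standing in for the table service and the Batch client:

    from enum import Enum

    class SchedulingTarget(Enum):
        Any = "any"
        Master = "master"

    # Stand-ins for the table-backed and Batch-backed task records.
    table_store = {("cluster1", "app1"): "running"}
    batch_store = {("cluster1", "app1"): "completed"}

    def get_task(scheduling_target, cluster_id, task_id):
        # Mirrors the branch in get_task.py: table when a scheduling
        # target is configured, Batch otherwise.
        if scheduling_target is not SchedulingTarget.Any:
            return table_store[(cluster_id, task_id)]
        return batch_store[(cluster_id, task_id)]

    print(get_task(SchedulingTarget.Master, "cluster1", "app1"))  # running
    print(get_task(SchedulingTarget.Any, "cluster1", "app1"))     # completed

Callers such as get_task_state and wait_for_scheduling_target_task no longer need to know which backend holds the record, which is what lets the log-streaming code above poll a single get_task call.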
+ """ + + return create_batch_resources.create_batch_resources( + self.batch_client, + id, + start_task, + job_manager_task, + vm_size, + vm_image_model, + on_all_tasks_complete, + mixed_mode, + software_metadata_key, + size_dedicated, + size_low_priority, + subnet_id, + ) + def create_user_on_node(self, id, node_id, username, ssh_key=None, password=None): """Create a user on a node @@ -227,6 +263,16 @@ def get_application_log(self, id: str, application_name: str, tail=False, curren """ return get_application_log.get_application_log(self, id, application_name, tail, current_bytes) + def get_recent_job(self, id): + """Get the most recently run job in an Azure Batch job schedule + + Args: + id (:obj:`str`): the id of the job schedule + Returns: + :obj:`[azure.batch.models.Job]`: the most recently run job on the job schedule + """ + return get_recent_job.get_recent_job(self, id) + def create_task_table(self, id: str): """Create an Azure Table Storage to track tasks @@ -279,16 +325,6 @@ def list_tasks(self, id): """ return list_tasks.list_tasks(self, id) - def get_recent_job(self, id): - """Get the most recently run job in an Azure Batch job schedule - - Args: - id (:obj:`str`): the id of the job schedule - Returns: - :obj:`[azure.batch.models.Job]`: the most recently run job on the job schedule - """ - return get_recent_job.get_recent_job(self, id) - def get_task_state(self, id: str, task_name: str): """Get the status of a submitted task @@ -311,6 +347,3 @@ def get_task(self, id: str, task_id: str): :obj:`[aztk.models.Task]`: the submitted task with id task_id """ get_task.get_task(self, id, task_id) - - def update_task(self, id: str, task_id: str): - pass diff --git a/aztk/client/base/helpers/create_batch_resources.py b/aztk/client/base/helpers/create_batch_resources.py new file mode 100644 index 00000000..8f9bb1f5 --- /dev/null +++ b/aztk/client/base/helpers/create_batch_resources.py @@ -0,0 +1,63 @@ +from datetime import timedelta + +import azure.batch.models as batch_models + +from aztk.utils import constants, helpers + + +def create_batch_resources( + batch_client, + id, + start_task, + job_manager_task, + vm_size, + vm_image_model, + on_all_tasks_complete, + mixed_mode, + software_metadata_key, + size_dedicated, + size_low_priority, + subnet_id, +): + autoscale_formula = "$TargetDedicatedNodes = {0}; " "$TargetLowPriorityNodes = {1}".format( + size_dedicated, size_low_priority) + + sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku( + vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, batch_client) + + network_conf = None + if subnet_id is not None: + network_conf = batch_models.NetworkConfiguration(subnet_id=subnet_id) + + auto_pool_specification = batch_models.AutoPoolSpecification( + pool_lifetime_option=batch_models.PoolLifetimeOption.job, + auto_pool_id_prefix=id, + keep_alive=False, + pool=batch_models.PoolSpecification( + display_name=id, + virtual_machine_configuration=batch_models.VirtualMachineConfiguration( + image_reference=image_ref_to_use, node_agent_sku_id=sku_to_use), + vm_size=vm_size, + enable_auto_scale=True, + auto_scale_formula=autoscale_formula, + auto_scale_evaluation_interval=timedelta(minutes=5), + start_task=start_task, + enable_inter_node_communication=not mixed_mode, + network_configuration=network_conf, + max_tasks_per_node=4, + metadata=[ + batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), + batch_models.MetadataItem( + 
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA), + ], + ), + ) + + job = batch_models.JobAddParameter( + id=id, + pool_info=batch_models.PoolInformation(auto_pool_specification=auto_pool_specification), + job_manager_task=job_manager_task, + on_all_tasks_complete=on_all_tasks_complete, + ) + + return batch_client.job.add(job) diff --git a/aztk/client/cluster/helpers/create.py b/aztk/client/cluster/helpers/create.py index 91ef524d..7c5a3e16 100644 --- a/aztk/client/cluster/helpers/create.py +++ b/aztk/client/cluster/helpers/create.py @@ -1,9 +1,7 @@ -from datetime import timedelta - import azure.batch.models as batch_models from aztk import models -from aztk.utils import constants, helpers +from aztk.utils import helpers def create_pool_and_job_and_table( @@ -11,7 +9,7 @@ def create_pool_and_job_and_table( cluster_conf: models.ClusterConfiguration, software_metadata_key: str, start_task, - VmImageModel, + vm_image_model, ): """ Create a pool and job @@ -25,51 +23,22 @@ def create_pool_and_job_and_table( # save cluster configuration in storage core_cluster_operations.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf) - # reuse pool_id as job_id - pool_id = cluster_conf.cluster_id - job_id = cluster_conf.cluster_id - - # Get a verified node agent sku - sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku( - VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, core_cluster_operations.batch_client) - - network_conf = None - if cluster_conf.subnet_id is not None: - network_conf = batch_models.NetworkConfiguration(subnet_id=cluster_conf.subnet_id) - auto_scale_formula = "$TargetDedicatedNodes={0}; $TargetLowPriorityNodes={1}".format( - cluster_conf.size, cluster_conf.size_low_priority) - - # Configure the pool - pool = batch_models.PoolAddParameter( - id=pool_id, - virtual_machine_configuration=batch_models.VirtualMachineConfiguration( - image_reference=image_ref_to_use, node_agent_sku_id=sku_to_use), - vm_size=cluster_conf.vm_size, - enable_auto_scale=True, - auto_scale_formula=auto_scale_formula, - auto_scale_evaluation_interval=timedelta(minutes=5), + core_cluster_operations.create_batch_resources( + id=cluster_conf.cluster_id, start_task=start_task, - enable_inter_node_communication=True if not cluster_conf.subnet_id else False, - max_tasks_per_node=4, - network_configuration=network_conf, - metadata=[ - batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), - batch_models.MetadataItem( - name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA), - ], + job_manager_task=None, + vm_size=cluster_conf.vm_size, + vm_image_model=vm_image_model, + on_all_tasks_complete=batch_models.OnAllTasksComplete.no_action, + mixed_mode=cluster_conf.mixed_mode, + software_metadata_key=software_metadata_key, + size_dedicated=cluster_conf.size, + size_low_priority=cluster_conf.size_low_priority, + subnet_id=cluster_conf.subnet_id, ) - # Create the pool + create user for the pool - helpers.create_pool_if_not_exist(pool, core_cluster_operations.batch_client) - - # Create job - job = batch_models.JobAddParameter(id=job_id, pool_info=batch_models.PoolInformation(pool_id=pool_id)) - - # Add job to batch - core_cluster_operations.batch_client.job.add(job) - # create storage task table if cluster_conf.scheduling_target != models.SchedulingTarget.Any: core_cluster_operations.create_task_table(cluster_conf.cluster_id) - return 
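With patch 16, clusters and jobs converge on one Batch shape: a job carrying an auto-pool specification, sized by an autoscale formula string. The formula is plain text evaluated by the Batch service; a small sketch of how the two node counts render into it, mirroring the implicit string concatenation in create_batch_resources.py:

    def render_autoscale_formula(size_dedicated, size_low_priority):
        # Adjacent string literals concatenate, so this is one format string.
        return ("$TargetDedicatedNodes = {0}; "
                "$TargetLowPriorityNodes = {1}").format(size_dedicated, size_low_priority)

    print(render_autoscale_formula(2, 10))
    # $TargetDedicatedNodes = 2; $TargetLowPriorityNodes = 10

The same helper then serves both callers: clusters pass on_all_tasks_complete=no_action with no job manager task, while jobs pass terminate_job with a job manager task, as the next two diffs show.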
diff --git a/aztk/client/cluster/helpers/create.py b/aztk/client/cluster/helpers/create.py
index 91ef524d..7c5a3e16 100644
--- a/aztk/client/cluster/helpers/create.py
+++ b/aztk/client/cluster/helpers/create.py
@@ -1,9 +1,7 @@
-from datetime import timedelta
-
 import azure.batch.models as batch_models

 from aztk import models
-from aztk.utils import constants, helpers
+from aztk.utils import helpers


 def create_pool_and_job_and_table(
@@ -11,7 +9,7 @@ def create_pool_and_job_and_table(
         cluster_conf: models.ClusterConfiguration,
         software_metadata_key: str,
         start_task,
-        VmImageModel,
+        vm_image_model,
 ):
     """
     Create a pool and job
@@ -25,51 +23,22 @@ def create_pool_and_job_and_table(
     # save cluster configuration in storage
     core_cluster_operations.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf)

-    # reuse pool_id as job_id
-    pool_id = cluster_conf.cluster_id
-    job_id = cluster_conf.cluster_id
-
-    # Get a verified node agent sku
-    sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
-        VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, core_cluster_operations.batch_client)
-
-    network_conf = None
-    if cluster_conf.subnet_id is not None:
-        network_conf = batch_models.NetworkConfiguration(subnet_id=cluster_conf.subnet_id)
-    auto_scale_formula = "$TargetDedicatedNodes={0}; $TargetLowPriorityNodes={1}".format(
-        cluster_conf.size, cluster_conf.size_low_priority)
-
-    # Configure the pool
-    pool = batch_models.PoolAddParameter(
-        id=pool_id,
-        virtual_machine_configuration=batch_models.VirtualMachineConfiguration(
-            image_reference=image_ref_to_use, node_agent_sku_id=sku_to_use),
-        vm_size=cluster_conf.vm_size,
-        enable_auto_scale=True,
-        auto_scale_formula=auto_scale_formula,
-        auto_scale_evaluation_interval=timedelta(minutes=5),
+    core_cluster_operations.create_batch_resources(
+        id=cluster_conf.cluster_id,
         start_task=start_task,
-        enable_inter_node_communication=True if not cluster_conf.subnet_id else False,
-        max_tasks_per_node=4,
-        network_configuration=network_conf,
-        metadata=[
-            batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
-            batch_models.MetadataItem(
-                name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA),
-        ],
+        job_manager_task=None,
+        vm_size=cluster_conf.vm_size,
+        vm_image_model=vm_image_model,
+        on_all_tasks_complete=batch_models.OnAllTasksComplete.no_action,
+        mixed_mode=cluster_conf.mixed_mode,
+        software_metadata_key=software_metadata_key,
+        size_dedicated=cluster_conf.size,
+        size_low_priority=cluster_conf.size_low_priority,
+        subnet_id=cluster_conf.subnet_id,
     )

-    # Create the pool + create user for the pool
-    helpers.create_pool_if_not_exist(pool, core_cluster_operations.batch_client)
-
-    # Create job
-    job = batch_models.JobAddParameter(id=job_id, pool_info=batch_models.PoolInformation(pool_id=pool_id))
-
-    # Add job to batch
-    core_cluster_operations.batch_client.job.add(job)
-
     # create storage task table
     if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
         core_cluster_operations.create_task_table(cluster_conf.cluster_id)

-    return helpers.get_cluster(cluster_conf.cluster_id, core_cluster_operations.batch_client)
+    return core_cluster_operations.get(cluster_conf.cluster_id)
diff --git a/aztk/client/cluster/helpers/get.py b/aztk/client/cluster/helpers/get.py
index d8674afd..e46d4911 100644
--- a/aztk/client/cluster/helpers/get.py
+++ b/aztk/client/cluster/helpers/get.py
@@ -1,6 +1,16 @@
-# TODO: return Cluster instead of (pool, nodes)
-from aztk import models
+import azure.batch.models as batch_models

+from aztk import error, models
+
+
+def convert_job_id_to_pool_id(batch_client, cluster_id):
+    jobs = batch_client.pool.list(pool_list_options=batch_models.PoolListOptions(filter="id eq {}".format(cluster_id)))
+    job = next(jobs)
+    assert str.split('_', job.id)[0] == cluster_id
+    print(job.pool_info.auto_pool_specification.pool.__dict__)
+    if job.pool_info:
+        return job.pool_info.pool_id
+    raise error.AztkError("No cluster with id {} does not exist.".format(cluster_id))


 def get_pool_details(core_cluster_operations, cluster_id: str):
     """
         :param cluster_id: Id of the cluster
         :return pool: CloudPool, nodes: ComputeNodePaged
     """
-    pool = core_cluster_operations.batch_client.pool.get(cluster_id)
-    nodes = core_cluster_operations.batch_client.compute_node.list(pool_id=cluster_id)
+    pool_id = convert_job_id_to_pool_id(core_cluster_operations.batch_client, cluster_id)
+    pool = core_cluster_operations.batch_client.pool.get(pool_id)
+    nodes = core_cluster_operations.batch_client.compute_node.list(pool_id=pool_id)
     return models.Cluster(pool, nodes)
diff --git a/aztk/client/job/helpers/submit.py b/aztk/client/job/helpers/submit.py
index 65a9d174..8c18dacf 100644
--- a/aztk/client/job/helpers/submit.py
+++ b/aztk/client/job/helpers/submit.py
@@ -29,58 +29,20 @@ def submit_job(
     core_job_operations.get_cluster_data(job_configuration.id).save_cluster_config(
         job_configuration.to_cluster_config())

-    # get a verified node agent sku
-    sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
-        vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, core_job_operations.batch_client)
-
-    # set up subnet if necessary
-    network_conf = None
-    if job_configuration.subnet_id:
-        network_conf = batch_models.NetworkConfiguration(subnet_id=job_configuration.subnet_id)
-
-    # set up a schedule for a recurring job
-    auto_pool_specification = batch_models.AutoPoolSpecification(
-        pool_lifetime_option=batch_models.PoolLifetimeOption.job_schedule,
-        auto_pool_id_prefix=job_configuration.id,
-        keep_alive=False,
-        pool=batch_models.PoolSpecification(
-            display_name=job_configuration.id,
-            virtual_machine_configuration=batch_models.VirtualMachineConfiguration(
-                image_reference=image_ref_to_use, node_agent_sku_id=sku_to_use),
-            vm_size=job_configuration.vm_size,
-            enable_auto_scale=True,
-            auto_scale_formula=autoscale_formula,
-            auto_scale_evaluation_interval=timedelta(minutes=5),
-            start_task=start_task,
-            enable_inter_node_communication=not job_configuration.mixed_mode(),
-            network_configuration=network_conf,
-            max_tasks_per_node=4,
-            metadata=[
-                batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
-                batch_models.MetadataItem(
-                    name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA),
-            ],
-        ),
-    )
-
-    # define job specification
-    job_spec = batch_models.JobSpecification(
-        pool_info=batch_models.PoolInformation(auto_pool_specification=auto_pool_specification),
-        display_name=job_configuration.id,
-        on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job,
+    core_job_operations.create_batch_resources(
+        id=job_configuration.cluster_id,
+        start_task=start_task,
         job_manager_task=job_manager_task,
-        metadata=[batch_models.MetadataItem(name="applications", value=application_metadata)],
+        vm_size=job_configuration.vm_size,
+        vm_image_model=vm_image_model,
+        on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job,
+        mixed_mode=job_configuration.mixed_mode,
+        software_metadata_key=software_metadata_key,
+        size_dedicated=job_configuration.max_dedicated_nodes,
+        size_low_priority=job_configuration.max_low_pri_nodes,
+        subnet_id=job_configuration.subnet_id,
     )

-    # define schedule
-    schedule = batch_models.Schedule(
-        do_not_run_until=None, do_not_run_after=None, start_window=None, recurrence_interval=None)
-
-    # create job schedule and add task
-    setup = batch_models.JobScheduleAddParameter(id=job_configuration.id, schedule=schedule, job_specification=job_spec)
-
-    core_job_operations.batch_client.job_schedule.add(setup)
-
     if job_configuration.scheduling_target != models.SchedulingTarget.Any:
         core_job_operations.create_task_table(job_configuration.id)
diff --git a/aztk/utils/helpers.py b/aztk/utils/helpers.py
index 0c213ffa..184d08ac 100644
--- a/aztk/utils/helpers.py
+++ b/aztk/utils/helpers.py
@@ -24,13 +24,6 @@ def is_gpu_enabled(vm_size: str):
     return bool(re.search("nv|nc", vm_size, flags=re.IGNORECASE))


-def get_cluster(cluster_id, batch_client):
-    pool = batch_client.pool.get(cluster_id)
-    nodes = batch_client.compute_node.list(pool_id=cluster_id)
-
-    return aztk.models.Cluster(pool, nodes)
-
-
 def wait_for_tasks_to_complete(job_id, batch_client):
     """
     Waits for all the tasks in a particular job to complete.
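Once a cluster is backed by an auto-pool job, the pool id is no longer the cluster id and has to be resolved through the job. Patch 16's first attempt above lists pools and carries leftover debug output; patch 17 below settles on the job's execution info. A hedged sketch of that final lookup, assuming the azure-batch client API of this era (job.get and CloudJob.execution_info.pool_id):

    def resolve_pool_id(batch_client, cluster_id):
        # batch_client: an azure.batch BatchServiceClient. The cluster's job
        # records the auto pool it was scheduled on; execution_info.pool_id
        # is populated once Batch has allocated the pool.
        job = batch_client.job.get(cluster_id)
        if job.execution_info and job.execution_info.pool_id:
            return job.execution_info.pool_id
        raise RuntimeError("No pool allocated yet for cluster {}".format(cluster_id))

The guard matters because execution_info.pool_id is empty in the window between job creation and pool allocation, so callers need a defined failure mode rather than an AttributeError.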
From 9a96c067d196ad3f5e7518caf8d2bc698b09a7ea Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 7 Nov 2018 12:53:38 -0800 Subject: [PATCH 17/28] start refactor of batch functions --- aztk/client/base/base_operations.py | 16 ++++------------ .../base/helpers/create_batch_resources.py | 2 ++ aztk/client/base/helpers/get_recent_job.py | 9 --------- aztk/client/base/helpers/get_task.py | 1 - aztk/client/base/helpers/list_tasks.py | 11 +++-------- aztk/client/cluster/helpers/create.py | 1 + aztk/client/cluster/helpers/get.py | 11 +++-------- aztk/client/job/helpers/submit.py | 5 +++-- aztk/spark/client/job/helpers/delete.py | 10 +++++----- aztk/spark/client/job/helpers/get.py | 17 +++++------------ .../client/job/helpers/get_application_log.py | 6 +----- aztk/spark/client/job/helpers/list.py | 7 ++++--- .../client/job/helpers/list_applications.py | 8 ++------ aztk/spark/client/job/helpers/stop.py | 6 ++---- .../client/job/helpers/stop_application.py | 4 ++-- .../client/job/helpers/wait_until_complete.py | 6 +++--- aztk/spark/models/models.py | 12 ++++++------ 17 files changed, 46 insertions(+), 86 deletions(-) delete mode 100644 aztk/client/base/helpers/get_recent_job.py diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index fe831b92..0693c5d5 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -3,8 +3,8 @@ from .helpers import (create_batch_resources, create_user_on_cluster, create_user_on_node, delete_user_on_cluster, delete_user_on_node, generate_user_on_cluster, generate_user_on_node, get_application_log, - get_recent_job, get_remote_login_settings, get_task, get_task_state, list_tasks, node_run, run, - ssh_into_node, task_table) + get_remote_login_settings, get_task, get_task_state, list_tasks, node_run, run, ssh_into_node, + task_table) class BaseOperations: @@ -89,6 +89,7 @@ def create_batch_resources( size_dedicated, size_low_priority, subnet_id, + job_metadata, ): """Create the underlying batch resources for a cluster or a job Args: @@ -110,6 +111,7 @@ def create_batch_resources( size_dedicated, size_low_priority, subnet_id, + job_metadata, ) def create_user_on_node(self, id, node_id, username, ssh_key=None, password=None): @@ -263,16 +265,6 @@ def get_application_log(self, id: str, application_name: str, tail=False, curren """ return get_application_log.get_application_log(self, id, application_name, tail, current_bytes) - def get_recent_job(self, id): - """Get the most recently run job in an Azure Batch job schedule - - Args: - id (:obj:`str`): the id of the job schedule - Returns: - :obj:`[azure.batch.models.Job]`: the most recently run job on the job schedule - """ - return get_recent_job.get_recent_job(self, id) - def create_task_table(self, id: str): """Create an Azure Table Storage to track tasks diff --git a/aztk/client/base/helpers/create_batch_resources.py b/aztk/client/base/helpers/create_batch_resources.py index 8f9bb1f5..77395a71 100644 --- a/aztk/client/base/helpers/create_batch_resources.py +++ b/aztk/client/base/helpers/create_batch_resources.py @@ -18,6 +18,7 @@ def create_batch_resources( size_dedicated, size_low_priority, subnet_id, + job_metadata, ): autoscale_formula = "$TargetDedicatedNodes = {0}; " "$TargetLowPriorityNodes = {1}".format( size_dedicated, size_low_priority) @@ -58,6 +59,7 @@ def create_batch_resources( pool_info=batch_models.PoolInformation(auto_pool_specification=auto_pool_specification), job_manager_task=job_manager_task, 
on_all_tasks_complete=on_all_tasks_complete, + metadata=job_metadata, ) return batch_client.job.add(job) diff --git a/aztk/client/base/helpers/get_recent_job.py b/aztk/client/base/helpers/get_recent_job.py deleted file mode 100644 index 9dd4e945..00000000 --- a/aztk/client/base/helpers/get_recent_job.py +++ /dev/null @@ -1,9 +0,0 @@ -from aztk.utils import batch_error_manager - - -# Note: this only works with jobs, not clusters -# cluster impl is planned to change to job schedule -def get_recent_job(core_job_operations, id): - with batch_error_manager(): - job_schedule = core_job_operations.batch_client.job_schedule.get(id) - return core_job_operations.batch_client.job.get(job_schedule.execution_info.recent_job.id) diff --git a/aztk/client/base/helpers/get_task.py b/aztk/client/base/helpers/get_task.py index 3f64894c..a14a701f 100644 --- a/aztk/client/base/helpers/get_task.py +++ b/aztk/client/base/helpers/get_task.py @@ -1,6 +1,5 @@ from aztk.models import SchedulingTarget -from .get_recent_job import get_recent_job from .task_table import get_batch_task, get_task_from_table diff --git a/aztk/client/base/helpers/list_tasks.py b/aztk/client/base/helpers/list_tasks.py index 8dec0583..1e0f8554 100644 --- a/aztk/client/base/helpers/list_tasks.py +++ b/aztk/client/base/helpers/list_tasks.py @@ -2,14 +2,13 @@ from aztk.models import SchedulingTarget -from .get_recent_job import get_recent_job from .task_table import list_batch_tasks, list_task_table_entries def list_tasks(core_base_operations, id): """List all tasks on a job or cluster - This will work for both Batch scheduling and scheduling_target + This works for both Batch scheduling and scheduling_target Args: id: cluster or job id @@ -21,9 +20,5 @@ def list_tasks(core_base_operations, id): if scheduling_target is not SchedulingTarget.Any: return list_task_table_entries(core_base_operations.table_service, id) else: - try: - recent_run_job = get_recent_job(core_base_operations, id) - tasks = list_batch_tasks(batch_client=core_base_operations.batch_client, id=recent_run_job.id) - except BatchErrorException: - tasks = list_batch_tasks(batch_client=core_base_operations.batch_client, id=id) - return tasks + job = core_base_operations.job.get(id) + return list_batch_tasks(batch_client=core_base_operations.batch_client, id=id) diff --git a/aztk/client/cluster/helpers/create.py b/aztk/client/cluster/helpers/create.py index 7c5a3e16..058816cc 100644 --- a/aztk/client/cluster/helpers/create.py +++ b/aztk/client/cluster/helpers/create.py @@ -35,6 +35,7 @@ def create_pool_and_job_and_table( size_dedicated=cluster_conf.size, size_low_priority=cluster_conf.size_low_priority, subnet_id=cluster_conf.subnet_id, + job_metadata=None, ) # create storage task table diff --git a/aztk/client/cluster/helpers/get.py b/aztk/client/cluster/helpers/get.py index e46d4911..22cb26ad 100644 --- a/aztk/client/cluster/helpers/get.py +++ b/aztk/client/cluster/helpers/get.py @@ -1,15 +1,10 @@ -import azure.batch.models as batch_models - from aztk import error, models def convert_job_id_to_pool_id(batch_client, cluster_id): - jobs = batch_client.pool.list(pool_list_options=batch_models.PoolListOptions(filter="id eq {}".format(cluster_id))) - job = next(jobs) - assert str.split('_', job.id)[0] == cluster_id - print(job.pool_info.auto_pool_specification.pool.__dict__) - if job.pool_info: - return job.pool_info.pool_id + job = batch_client.job.get(cluster_id) + if job.execution_info and job.execution_info.pool_id: + return job.execution_info.pool_id raise error.AztkError("No 
cluster with id {} exists.".format(cluster_id)) def get_pool_details(core_cluster_operations, cluster_id: str): diff --git a/aztk/client/job/helpers/submit.py b/aztk/client/job/helpers/submit.py index 8c18dacf..5f5a88e4 100644 --- a/aztk/client/job/helpers/submit.py +++ b/aztk/client/job/helpers/submit.py @@ -30,7 +30,7 @@ def submit_job( job_configuration.to_cluster_config()) core_job_operations.create_batch_resources( - id=job_configuration.cluster_id, + id=job_configuration.id, start_task=start_task, job_manager_task=job_manager_task, vm_size=job_configuration.vm_size, @@ -41,9 +41,10 @@ size_dedicated=job_configuration.max_dedicated_nodes, size_low_priority=job_configuration.max_low_pri_nodes, subnet_id=job_configuration.subnet_id, + job_metadata=[batch_models.MetadataItem(name="applications", value=application_metadata)], ) if job_configuration.scheduling_target != models.SchedulingTarget.Any: core_job_operations.create_task_table(job_configuration.id) - return core_job_operations.batch_client.job_schedule.get(job_schedule_id=job_configuration.id) + return core_job_operations.batch_client.job.get(job_id=job_configuration.id) diff --git a/aztk/spark/client/job/helpers/delete.py b/aztk/spark/client/job/helpers/delete.py index 89c29924..84b88982 100644 --- a/aztk/spark/client/job/helpers/delete.py +++ b/aztk/spark/client/job/helpers/delete.py @@ -6,12 +6,12 @@ def _delete(core_job_operations, spark_job_operations, job_id, keep_logs: bool = False): - deleted_job_schedule = False + deleted_job = False - # delete job_schedule + # delete batch job try: - core_job_operations.batch_client.job_schedule.delete(job_id) - deleted_job_schedule = True + core_job_operations.batch_client.job.delete(job_id) + deleted_job = True except BatchErrorException: pass @@ -24,7 +24,7 @@ def _delete(core_job_operations, spark_job_operations, job_id, keep_logs: bool = if table_exists: core_job_operations.delete_task_table(job_id) - return deleted_job_schedule + return deleted_job @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) diff --git a/aztk/spark/client/job/helpers/get.py b/aztk/spark/client/job/helpers/get.py index 490db706..7a90965c 100644 --- a/aztk/spark/client/job/helpers/get.py +++ b/aztk/spark/client/job/helpers/get.py @@ -1,28 +1,21 @@ -from azure.batch.models import BatchErrorException - from aztk import error from aztk.spark import models -from aztk.utils import helpers +from aztk.utils import batch_error_manager def _get_job(core_job_operations, job_id): - job = core_job_operations.batch_client.job_schedule.get(job_id) + job = core_job_operations.batch_client.job.get(job_id) tasks = [app for app in core_job_operations.list_tasks(id=job_id) if app.id != job_id] - recent_run_job = core_job_operations.get_recent_job(job_id) - pool_prefix = recent_run_job.pool_info.auto_pool_specification.auto_pool_id_prefix pool = nodes = None - for cloud_pool in core_job_operations.batch_client.pool.list(): - if pool_prefix in cloud_pool.id: - pool = cloud_pool - break + with batch_error_manager(): + pool = core_job_operations.batch_client.pool.get(job.execution_info.pool_id) + if pool: nodes = core_job_operations.batch_client.compute_node.list(pool_id=pool.id) return job, tasks, pool, nodes def get_job(core_job_operations, job_id): - from aztk.utils import batch_error_manager with batch_error_manager(): job, tasks, pool, nodes = _get_job(core_job_operations, job_id) return models.Job(job, tasks, pool, nodes) - diff --git 
a/aztk/spark/client/job/helpers/get_application_log.py b/aztk/spark/client/job/helpers/get_application_log.py index f5d1ac2f..538749e2 100644 --- a/aztk/spark/client/job/helpers/get_application_log.py +++ b/aztk/spark/client/job/helpers/get_application_log.py @@ -10,12 +10,8 @@ def _get_application_log(core_job_operations, spark_job_operations, job_id, appl if scheduling_target is not models.SchedulingTarget.Any: return core_job_operations.get_application_log(job_id, application_name) - # TODO: change where the logs are uploaded so they aren't overwritten on scheduled runs - # current: job_id, application_name/output.log - # new: job_id, recent_run_job.id/application_name/output.log - recent_run_job = core_job_operations.get_recent_job(job_id) try: - task = core_job_operations.get_batch_task(id=recent_run_job.id, task_id=application_name) + task = core_job_operations.get_batch_task(id=job_id, task_id=application_name) except batch_models.BatchErrorException as e: # task may not exist since it may not yet be scheduled # see if the task is written to metadata of pool diff --git a/aztk/spark/client/job/helpers/list.py b/aztk/spark/client/job/helpers/list.py index 4592b19d..34eb63f5 100644 --- a/aztk/spark/client/job/helpers/list.py +++ b/aztk/spark/client/job/helpers/list.py @@ -5,11 +5,12 @@ from aztk.utils import helpers -def _list_jobs(core_job_operations): - return [cloud_job_schedule for cloud_job_schedule in core_job_operations.batch_client.job_schedule.list()] +def filter_aztk_jobs(jobs): + #TODO: filter by metadata + return jobs def list_jobs(core_job_operations): from aztk.utils import batch_error_manager with batch_error_manager(): - return [models.Job(cloud_job_schedule) for cloud_job_schedule in _list_jobs(core_job_operations)] + return [models.Job(job) for job in filter_aztk_jobs(core_job_operations.batch_client.job.list())] diff --git a/aztk/spark/client/job/helpers/list_applications.py b/aztk/spark/client/job/helpers/list_applications.py index 72578979..aaab7833 100644 --- a/aztk/spark/client/job/helpers/list_applications.py +++ b/aztk/spark/client/job/helpers/list_applications.py @@ -1,15 +1,11 @@ -from azure.batch.models import BatchErrorException - -from aztk import error from aztk.spark import models -from aztk.utils import helpers def _list_applications(core_job_operations, job_id): - recent_run_job = core_job_operations.get_recent_job(job_id) + job = core_job_operations.batch_client.job.get(job_id) # get application names from Batch job metadata applications = {} - for metadata_item in recent_run_job.metadata: + for metadata_item in job.metadata: if metadata_item.name == "applications": for app_name in metadata_item.value.split("\n"): applications[app_name] = None diff --git a/aztk/spark/client/job/helpers/stop.py b/aztk/spark/client/job/helpers/stop.py index efce8c44..3d701473 100644 --- a/aztk/spark/client/job/helpers/stop.py +++ b/aztk/spark/client/job/helpers/stop.py @@ -3,10 +3,8 @@ def _stop(core_job_operations, job_id): # terminate currently running job and tasks - recent_run_job = core_job_operations.get_recent_job(job_id) - core_job_operations.batch_client.job.terminate(recent_run_job.id) - # terminate job_schedule - core_job_operations.batch_client.job_schedule.terminate(job_id) + job = core_job_operations.batch_client.job.get(job_id) + core_job_operations.batch_client.job.terminate(job.id) def stop(self, job_id): diff --git a/aztk/spark/client/job/helpers/stop_application.py b/aztk/spark/client/job/helpers/stop_application.py index e770316e..9abebf10 100644 --- 
a/aztk/spark/client/job/helpers/stop_application.py +++ b/aztk/spark/client/job/helpers/stop_application.py @@ -2,11 +2,11 @@ def stop_app(core_job_operations, job_id, application_name): - recent_run_job = core_job_operations.get_recent_job(job_id) + job = core_job_operations.batch_client.job.get(job_id) # stop batch task try: - core_job_operations.batch_client.task.terminate(job_id=recent_run_job.id, task_id=application_name) + core_job_operations.batch_client.task.terminate(job_id=job.id, task_id=application_name) return True except BatchErrorException: return False diff --git a/aztk/spark/client/job/helpers/wait_until_complete.py b/aztk/spark/client/job/helpers/wait_until_complete.py index e3036088..a5a79d49 100644 --- a/aztk/spark/client/job/helpers/wait_until_complete.py +++ b/aztk/spark/client/job/helpers/wait_until_complete.py @@ -6,11 +6,11 @@ def _wait_until_job_finished(core_job_operations, job_id): - job_state = core_job_operations.batch_client.job_schedule.get(job_id).state + job_state = core_job_operations.batch_client.job.get(job_id).state - while job_state not in [batch_models.JobScheduleState.completed, batch_models.JobScheduleState.terminating]: + while job_state not in [batch_models.JobState.completed, batch_models.JobState.terminating]: time.sleep(3) - job_state = core_job_operations.batch_client.job_schedule.get(job_id).state + job_state = core_job_operations.batch_client.job.get(job_id).state def wait_until_job_finished(core_job_operations, job_id): diff --git a/aztk/spark/models/models.py b/aztk/spark/models/models.py index 746fec10..0db7c9c0 100644 --- a/aztk/spark/models/models.py +++ b/aztk/spark/models/models.py @@ -261,16 +261,16 @@ class JobState(Enum): class Job: def __init__( self, - cloud_job_schedule: batch_models.CloudJobSchedule, + cloud_job: batch_models.CloudJob, tasks: List[aztk.models.Task] = None, pool: batch_models.CloudPool = None, nodes: batch_models.ComputeNodePaged = None, ): - self.id = cloud_job_schedule.id - self.last_modified = cloud_job_schedule.last_modified - self.state = JobState(cloud_job_schedule.state.name) - self.state_transition_time = cloud_job_schedule.state_transition_time - self.creation_time = cloud_job_schedule.creation_time + self.id = cloud_job.id + self.last_modified = cloud_job.last_modified + self.state = JobState(cloud_job.state.name) + self.state_transition_time = cloud_job.state_transition_time + self.creation_time = cloud_job.creation_time self.applications = [Application(task) for task in (tasks or [])] if pool: self.cluster = Cluster(aztk.models.Cluster(pool, nodes)) From 53065f8295d3593136384e2391f53a08f795764a Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 7 Nov 2018 18:25:14 -0800 Subject: [PATCH 18/28] align cluster and job functions --- aztk/client/base/base_operations.py | 22 +++++++--- .../base/helpers/create_batch_resources.py | 11 +++-- .../base/helpers/delete_batch_resources.py | 29 ++++++++++++ .../base/helpers/get_application_log.py | 31 +++---------- aztk/client/base/helpers/get_task_state.py | 7 +-- aztk/client/base/helpers/list_tasks.py | 1 - aztk/client/base/helpers/node_run.py | 9 ++-- aztk/client/base/helpers/task_table.py | 4 +- aztk/client/cluster/helpers/create.py | 5 ++- aztk/client/cluster/helpers/delete.py | 43 ------------------ aztk/client/cluster/operations.py | 16 +------ aztk/client/job/helpers/submit.py | 3 +- aztk/models/task_state.py | 1 + aztk/node_scripts/core/logger.py | 3 ++ aztk/node_scripts/core/utils.py | 17 +++++++ aztk/node_scripts/install/install.py | 24 +++++----- 
aztk/node_scripts/install/pick_master.py | 34 +++++--------- aztk/node_scripts/install/spark.py | 44 ++++++------------- aztk/node_scripts/main.py | 1 - .../node_scripts/scheduling/job_submission.py | 33 ++++++-------- .../wait_until_master_selected.py | 5 +-- aztk/spark/client/cluster/helpers/delete.py | 2 +- aztk/spark/client/cluster/helpers/submit.py | 2 +- aztk/spark/client/job/helpers/delete.py | 34 ++------------ .../client/job/helpers/list_applications.py | 2 +- aztk/spark/client/job/operations.py | 2 +- aztk_cli/entrypoint.py | 3 ++ aztk_cli/utils.py | 2 +- 28 files changed, 152 insertions(+), 238 deletions(-) create mode 100644 aztk/client/base/helpers/delete_batch_resources.py delete mode 100644 aztk/client/cluster/helpers/delete.py create mode 100644 aztk/node_scripts/core/utils.py diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index 0693c5d5..f500f1df 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -1,10 +1,10 @@ from aztk import models from aztk.internal import cluster_data -from .helpers import (create_batch_resources, create_user_on_cluster, create_user_on_node, delete_user_on_cluster, - delete_user_on_node, generate_user_on_cluster, generate_user_on_node, get_application_log, - get_remote_login_settings, get_task, get_task_state, list_tasks, node_run, run, ssh_into_node, - task_table) +from .helpers import (create_batch_resources, create_user_on_cluster, create_user_on_node, delete_batch_resources, + delete_user_on_cluster, delete_user_on_node, generate_user_on_cluster, generate_user_on_node, + get_application_log, get_remote_login_settings, get_task, get_task_state, list_tasks, node_run, + run, ssh_into_node, task_table) class BaseOperations: @@ -86,6 +86,7 @@ def create_batch_resources( on_all_tasks_complete, mixed_mode, software_metadata_key, + mode_metadata_key, size_dedicated, size_low_priority, subnet_id, @@ -108,12 +109,23 @@ def create_batch_resources( on_all_tasks_complete, mixed_mode, software_metadata_key, + mode_metadata_key, size_dedicated, size_low_priority, subnet_id, job_metadata, ) + def delete_batch_resources(self, id, keep_logs): + """Delete the underlying batch resources for a cluster or a job + Args: + ... + Returns: + ... 
+ """ + + return delete_batch_resources.delete_batch_resources(self, id, keep_logs) + def create_user_on_node(self, id, node_id, username, ssh_key=None, password=None): """Create a user on a node @@ -338,4 +350,4 @@ def get_task(self, id: str, task_id: str): Returns: :obj:`[aztk.models.Task]`: the submitted task with id task_id """ - get_task.get_task(self, id, task_id) + return get_task.get_task(self, id, task_id) diff --git a/aztk/client/base/helpers/create_batch_resources.py b/aztk/client/base/helpers/create_batch_resources.py index 77395a71..13382111 100644 --- a/aztk/client/base/helpers/create_batch_resources.py +++ b/aztk/client/base/helpers/create_batch_resources.py @@ -15,6 +15,7 @@ def create_batch_resources( on_all_tasks_complete, mixed_mode, software_metadata_key, + mode_metadata_key, size_dedicated, size_low_priority, subnet_id, @@ -49,9 +50,8 @@ def create_batch_resources( metadata=[ batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), batch_models.MetadataItem( - name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA), - ], - ), + name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA) + ]), ) job = batch_models.JobAddParameter( @@ -59,7 +59,10 @@ def create_batch_resources( pool_info=batch_models.PoolInformation(auto_pool_specification=auto_pool_specification), job_manager_task=job_manager_task, on_all_tasks_complete=on_all_tasks_complete, - metadata=job_metadata, + metadata=[ + batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), + batch_models.MetadataItem(name=constants.AZTK_MODE_METADATA_KEY, value=mode_metadata_key) + ] + job_metadata, ) return batch_client.job.add(job) diff --git a/aztk/client/base/helpers/delete_batch_resources.py b/aztk/client/base/helpers/delete_batch_resources.py new file mode 100644 index 00000000..b3c017d3 --- /dev/null +++ b/aztk/client/base/helpers/delete_batch_resources.py @@ -0,0 +1,29 @@ +from azure.batch.models import BatchErrorException +from msrest.exceptions import ClientRequestError + +from aztk.utils import BackOffPolicy, helpers, retry + + +@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) +def delete_batch_resources(core_base_operations, job_id, keep_logs: bool = False): + success = False + + # delete batch job, autopool + try: + core_base_operations.batch_client.job.delete(job_id) + success = True + except BatchErrorException: + pass + + # delete storage container + if not keep_logs: + cluster_data = core_base_operations.get_cluster_data(job_id) + cluster_data.delete_container(job_id) + success = True + + table_exists = core_base_operations.table_service.exists(job_id) + if table_exists: + core_base_operations.delete_task_table(job_id) + success = True + + return success diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py index 9bbadf89..3174f027 100644 --- a/aztk/client/base/helpers/get_application_log.py +++ b/aztk/client/base/helpers/get_application_log.py @@ -13,25 +13,11 @@ def convert_application_name_to_blob_path(application_name): return application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE -def wait_for_batch_task(base_operations, cluster_id: str, application_name: str) -> Task: - """ - Wait for the batch task to leave the waiting state into running(or completed if it was fast enough) - """ - - while True: - task_state = base_operations.get_task_state(cluster_id, 
application_name) - - if task_state in [batch_models.TaskState.active, batch_models.TaskState.preparing]: - # TODO: log - time.sleep(5) - else: - return base_operations.get_batch_task(id=cluster_id, task_id=application_name) - - -def wait_for_scheduling_target_task(base_operations, cluster_id, application_name): +def wait_for_task(base_operations, cluster_id, application_name): # TODO: ensure get_task_state not None or throw task = base_operations.get_task(cluster_id, application_name) while task.state not in [TaskState.Completed, TaskState.Failed, TaskState.Running]: + print(task.state) time.sleep(3) # TODO: enable logger # log.debug("{} {}: application not yet complete".format(cluster_id, application_name)) @@ -39,14 +25,6 @@ def wait_for_scheduling_target_task(base_operations, cluster_id, application_nam return task -def wait_for_task(base_operations, cluster_id: str, application_name: str, cluster_configuration): - if cluster_configuration.scheduling_target is not models.SchedulingTarget.Any: - task = wait_for_scheduling_target_task(base_operations, cluster_id, application_name) - else: - task = wait_for_batch_task(base_operations, cluster_id, application_name) - return task - - def get_blob_from_storage(block_blob_client, container_name, application_name, stream, start_range, end_range=None): print(block_blob_client, container_name, application_name, stream, start_range, end_range) previous = 0 @@ -139,14 +117,15 @@ def stream_log_from_storage(base_operations, container_name, application_name, t def get_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): cluster_configuration = base_operations.get_cluster_configuration(cluster_id) - task = wait_for_task(base_operations, cluster_id, application_name, cluster_configuration) + task = wait_for_task(base_operations, cluster_id, application_name) return get_log_from_storage(base_operations.blob_client, cluster_id, application_name, task, current_bytes) def stream_log(base_operations, cluster_id: str, application_name: str): + print("running stream_log") cluster_configuration = base_operations.get_cluster_configuration(cluster_id) - task = wait_for_task(base_operations, cluster_id, application_name, cluster_configuration) + task = wait_for_task(base_operations, cluster_id, application_name) return stream_log_from_storage(base_operations, cluster_id, application_name, task) diff --git a/aztk/client/base/helpers/get_task_state.py b/aztk/client/base/helpers/get_task_state.py index c0069a23..1e574fdf 100644 --- a/aztk/client/base/helpers/get_task_state.py +++ b/aztk/client/base/helpers/get_task_state.py @@ -4,10 +4,5 @@ def get_task_state(core_cluster_operations, cluster_id: str, task_id: str): with batch_error_manager(): - scheduling_target = core_cluster_operations.get_cluster_configuration(cluster_id).scheduling_target - if scheduling_target is not SchedulingTarget.Any: - task = core_cluster_operations.get_task(cluster_id, task_id) - return task.state - else: - task = core_cluster_operations.get_batch_task(cluster_id, task_id) + task = core_cluster_operations.get_task(cluster_id, task_id) return task.state diff --git a/aztk/client/base/helpers/list_tasks.py b/aztk/client/base/helpers/list_tasks.py index 1e0f8554..89fce7c1 100644 --- a/aztk/client/base/helpers/list_tasks.py +++ b/aztk/client/base/helpers/list_tasks.py @@ -20,5 +20,4 @@ def list_tasks(core_base_operations, id): if scheduling_target is not SchedulingTarget.Any: return list_task_table_entries(core_base_operations.table_service, id) 
else: - job = core_base_operations.job.get(id) return list_batch_tasks(batch_client=core_base_operations.batch_client, id=id) diff --git a/aztk/client/base/helpers/node_run.py b/aztk/client/base/helpers/node_run.py index 02320123..b8c92bfa 100644 --- a/aztk/client/base/helpers/node_run.py +++ b/aztk/client/base/helpers/node_run.py @@ -4,17 +4,16 @@ def node_run(base_client, cluster_id, node_id, command, internal, container_name=None, timeout=None, block=True): cluster = base_client.get(cluster_id) - pool, nodes = cluster.pool, list(cluster.nodes) try: - node = next(node for node in nodes if node.id == node_id) + node = next(node for node in cluster.nodes if node.id == node_id) except StopIteration: raise error.AztkError("Node with id {} not found".format(node_id)) if internal: node_rls = models.RemoteLogin(ip_address=node.ip_address, port="22") else: - node_rls = base_client.get_remote_login_settings(pool.id, node.id) + node_rls = base_client.get_remote_login_settings(cluster.pool.id, node.id) try: - generated_username, ssh_key = base_client.generate_user_on_node(pool.id, node.id) + generated_username, ssh_key = base_client.generate_user_on_node(cluster.pool.id, node.id) output = ssh_lib.node_exec_command( node.id, command, @@ -27,4 +26,4 @@ def node_run(base_client, cluster_id, node_id, command, internal, container_name block=block) return output finally: - base_client.delete_user_on_node(cluster_id, node.id, generated_username) + base_client.delete_user_on_node(cluster.pool.id, node.id, generated_username) diff --git a/aztk/client/base/helpers/task_table.py b/aztk/client/base/helpers/task_table.py index 5d92b80e..0425ba1c 100644 --- a/aztk/client/base/helpers/task_table.py +++ b/aztk/client/base/helpers/task_table.py @@ -42,7 +42,7 @@ def __convert_batch_task_to_aztk_task(batch_task): task.id = batch_task.id if batch_task.node_info: task.node_id = batch_task.node_info.node_id - task.state = batch_task.state + task.state = TaskState(batch_task.state.value) task.state_transition_time = batch_task.state_transition_time task.command_line = batch_task.command_line task.exit_code = batch_task.execution_info.exit_code @@ -90,7 +90,7 @@ def list_task_table_entries(table_service, id): def get_task_from_table(table_service, id, task_id): entity = table_service.get_entity(helpers.convert_id_to_table_id(id), id, task_id) # TODO: enable logger - # print("Running get_task_from_table: {}".format(entity)) + print("Running get_task_from_table: {}".format(entity)) return __convert_entity_to_task(entity) diff --git a/aztk/client/cluster/helpers/create.py b/aztk/client/cluster/helpers/create.py index 058816cc..598f632f 100644 --- a/aztk/client/cluster/helpers/create.py +++ b/aztk/client/cluster/helpers/create.py @@ -1,7 +1,7 @@ import azure.batch.models as batch_models from aztk import models -from aztk.utils import helpers +from aztk.utils import constants def create_pool_and_job_and_table( @@ -32,10 +32,11 @@ def create_pool_and_job_and_table( on_all_tasks_complete=batch_models.OnAllTasksComplete.no_action, mixed_mode=cluster_conf.mixed_mode, software_metadata_key=software_metadata_key, + mode_metadata_key=constants.AZTK_CLUSTER_MODE_METADATA, size_dedicated=cluster_conf.size, size_low_priority=cluster_conf.size_low_priority, subnet_id=cluster_conf.subnet_id, - job_metadata=None, + job_metadata=[], ) # create storage task table diff --git a/aztk/client/cluster/helpers/delete.py b/aztk/client/cluster/helpers/delete.py deleted file mode 100644 index 1cbc13a7..00000000 --- 
a/aztk/client/cluster/helpers/delete.py +++ /dev/null @@ -1,43 +0,0 @@ -from azure.batch.models import BatchErrorException -from msrest.exceptions import ClientRequestError - -from aztk.utils import BackOffPolicy, retry - - -def delete_pool_and_job_and_table(core_cluster_operations, pool_id: str, keep_logs: bool = False): - """ - Delete a pool and it's associated job - :param cluster_id: the pool to add the user to - :return bool: deleted the pool if exists and job if exists - """ - # job id is equal to pool id - job_exists = True - - try: - core_cluster_operations.batch_client.job.get(pool_id) - except BatchErrorException: - job_exists = False - - pool_exists = core_cluster_operations.batch_client.pool.exists(pool_id) - - table_exists = core_cluster_operations.table_service.exists(pool_id) - - if job_exists: - delete_object(core_cluster_operations.batch_client.job.delete, pool_id) - - if pool_exists: - delete_object(core_cluster_operations.batch_client.pool.delete, pool_id) - - if table_exists: - delete_object(core_cluster_operations.delete_task_table, pool_id) - - if not keep_logs: - cluster_data = core_cluster_operations.get_cluster_data(pool_id) - cluster_data.delete_container(pool_id) - - return job_exists or pool_exists or table_exists - - -@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) -def delete_object(function, *args, **kwargs): - return function(*args, **kwargs) diff --git a/aztk/client/cluster/operations.py b/aztk/client/cluster/operations.py index efac874e..5a96e041 100644 --- a/aztk/client/cluster/operations.py +++ b/aztk/client/cluster/operations.py @@ -1,7 +1,7 @@ from aztk.client.base import BaseOperations from aztk.models import ClusterConfiguration -from .helpers import copy, create, delete, get, list, wait_for_task_to_complete +from .helpers import copy, create, get, list, wait_for_task_to_complete class CoreClusterOperations(BaseOperations): @@ -58,20 +58,6 @@ def copy(self, id, source_path, destination_path=None, container_name=None, inte """ return copy.cluster_copy(self, id, source_path, destination_path, container_name, internal, get, timeout) - def delete(self, id: str, keep_logs: bool = False): - """Copy files to or from every node in a cluster. - - Args: - id (:obj:`str`): the id of the cluster to delete - keep_logs (:obj:`bool`): If True, the logs related to this cluster in Azure Storage are not deleted. - Defaults to False. - - Returns: - :obj:`List[aztk.models.NodeOutput]`: - A list of NodeOutput objects representing the output of the copy command. - """ - return delete.delete_pool_and_job_and_table(self, id, keep_logs) - def list(self, software_metadata_key): """List clusters running the specified software. 
diff --git a/aztk/client/job/helpers/submit.py b/aztk/client/job/helpers/submit.py index 5f5a88e4..c1ec63df 100644 --- a/aztk/client/job/helpers/submit.py +++ b/aztk/client/job/helpers/submit.py @@ -35,9 +35,10 @@ def submit_job( job_manager_task=job_manager_task, vm_size=job_configuration.vm_size, vm_image_model=vm_image_model, - on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job, + on_all_tasks_complete=batch_models.OnAllTasksComplete.no_action, mixed_mode=job_configuration.mixed_mode, software_metadata_key=software_metadata_key, + mode_metadata_key=constants.AZTK_JOB_MODE_METADATA, size_dedicated=job_configuration.max_dedicated_nodes, size_low_priority=job_configuration.max_low_pri_nodes, subnet_id=job_configuration.subnet_id, diff --git a/aztk/models/task_state.py b/aztk/models/task_state.py index a76b881b..6dc9a8b0 100644 --- a/aztk/models/task_state.py +++ b/aztk/models/task_state.py @@ -2,6 +2,7 @@ class TaskState(Enum): + Active = "active" Running = "running" Completed = "completed" Failed = "failed" diff --git a/aztk/node_scripts/core/logger.py b/aztk/node_scripts/core/logger.py index d0fc4b90..04003cf1 100644 --- a/aztk/node_scripts/core/logger.py +++ b/aztk/node_scripts/core/logger.py @@ -13,3 +13,6 @@ def setup_logging(): log.setLevel(logging.INFO) logging.basicConfig(stream=sys.stdout, format=VERBOSE_FORMAT) + + +setup_logging() diff --git a/aztk/node_scripts/core/utils.py b/aztk/node_scripts/core/utils.py new file mode 100644 index 00000000..512b381c --- /dev/null +++ b/aztk/node_scripts/core/utils.py @@ -0,0 +1,17 @@ +import azure.batch.models as batchmodels + +from aztk.node_scripts.core import config + + +def get_pool() -> batchmodels.CloudPool: + return config.batch_client.pool.get(config.pool_id) + + +def get_node(node_id: str) -> batchmodels.ComputeNode: + return config.batch_client.compute_node.get(config.pool_id, node_id) + + +def get_master_node_id(cluster_id) -> str: + cluster = config.spark_client.cluster.get(cluster_id) + if cluster.master_node_id: + return cluster.master_node_id diff --git a/aztk/node_scripts/install/install.py b/aztk/node_scripts/install/install.py index 0488a688..017889ea 100644 --- a/aztk/node_scripts/install/install.py +++ b/aztk/node_scripts/install/install.py @@ -20,27 +20,20 @@ def setup_host(docker_repo: str, docker_run_options: str): :param docker_repo: location of the Docker image to use :param docker_run_options: additional command-line options to pass to docker run """ - client = config.batch_client - - create_user.create_user(batch_client=client) + create_user.create_user(batch_client=config.batch_client) if os.environ["AZ_BATCH_NODE_IS_DEDICATED"] == "true" or os.environ["AZTK_MIXED_MODE"] == "false": - is_master = pick_master.find_master(client) + is_master = pick_master.find_master(config.batch_client) else: is_master = False wait_until_master_selected.main() is_worker = not is_master or os.environ.get("AZTK_WORKER_ON_MASTER") == "true" - master_node_id = pick_master.get_master_node_id(config.batch_client.pool.get(config.pool_id)) - master_node = config.batch_client.compute_node.get(config.pool_id, master_node_id) - if is_master: - os.environ["AZTK_IS_MASTER"] = "true" - else: - os.environ["AZTK_IS_MASTER"] = "false" - if is_worker: - os.environ["AZTK_IS_WORKER"] = "true" - else: - os.environ["AZTK_IS_WORKER"] = "false" + cluster = config.spark_client.cluster.get(id=config.cluster_id) + master_node = config.batch_client.compute_node.get(config.pool_id, cluster.master_node_id) + + os.environ["AZTK_IS_MASTER"] = 
"true" if is_master else "false" + os.environ["AZTK_IS_WORKER"] = "true" if is_worker else "true" os.environ["AZTK_MASTER_IP"] = master_node.ip_address @@ -78,4 +71,7 @@ def setup_spark_container(): plugins.setup_plugins(target=PluginTarget.SparkContainer, is_master=is_master, is_worker=is_worker) + # TODO: this is a good candidate for a lock. + # this function holds lock until completion, + # poller wait to aquire lock open("/tmp/setup_complete", "a").close() diff --git a/aztk/node_scripts/install/pick_master.py b/aztk/node_scripts/install/pick_master.py index 5046191e..6364041d 100644 --- a/aztk/node_scripts/install/pick_master.py +++ b/aztk/node_scripts/install/pick_master.py @@ -6,6 +6,7 @@ from azure.batch.models import BatchErrorException from msrest.exceptions import ClientRequestError +import aztk.models from aztk.node_scripts.core import config, log MASTER_NODE_METADATA_KEY = "_spark_master_node" @@ -15,29 +16,15 @@ class CannotAllocateMasterError(Exception): pass -def get_master_node_id(pool: batchmodels.CloudPool): - """ - :returns: the id of the node that is the assigned master of this pool - """ - if pool.metadata is None: - return None - - for metadata in pool.metadata: - if metadata.name == MASTER_NODE_METADATA_KEY: - return metadata.value - - return None - - -def try_assign_self_as_master(client: batch.BatchServiceClient, pool: batchmodels.CloudPool): - current_metadata = pool.metadata or [] +def try_assign_self_as_master(client: batch.BatchServiceClient, cluster: aztk.models.Cluster): + current_metadata = cluster.pool.metadata or [] new_metadata = current_metadata + [{"name": MASTER_NODE_METADATA_KEY, "value": config.node_id}] try: client.pool.patch( config.pool_id, batchmodels.PoolPatchParameter(metadata=new_metadata), - batchmodels.PoolPatchOptions(if_match=pool.e_tag), + batchmodels.PoolPatchOptions(if_match=cluster.pool.e_tag), ) return True except (BatchErrorException, ClientRequestError): @@ -56,19 +43,18 @@ def find_master(client: batch.BatchServiceClient) -> bool: # return False for i in range(0, 5): - pool = client.pool.get(config.pool_id) - master = get_master_node_id(pool) + cluster = config.spark_client.cluster.get(config.cluster_id) - if master: - if master == config.node_id: - log.info("Node is already the master '{0}'".format(master)) + if cluster.master_node_id: + if cluster.master_node_id == config.node_id: + log.info("Node is already the master '{0}'".format(cluster.master_node_id)) return True else: - log.info("Pool already has a master '{0}'. This node will be a worker".format(master)) + log.info("Pool already has a master '{0}'. This node will be a worker".format(cluster.master_node_id)) return False else: log.info("Pool has no master. Trying to assign itself! ({0}/5)".format(i + 1)) - result = try_assign_self_as_master(client, pool) + result = try_assign_self_as_master(client, cluster) if result: log.info("Assignment was successful! 
Node {0} is the new master.".format(config.node_id)) diff --git a/aztk/node_scripts/install/spark.py b/aztk/node_scripts/install/spark.py index a1ad2ff0..104b37ea 100644 --- a/aztk/node_scripts/install/spark.py +++ b/aztk/node_scripts/install/spark.py @@ -10,7 +10,7 @@ import azure.batch.models as batchmodels -from aztk.node_scripts.core import config +from aztk.node_scripts.core import config, log, utils from aztk.node_scripts.install import pick_master batch_client = config.batch_client @@ -31,29 +31,12 @@ def setup_as_worker(): start_spark_worker() -def get_pool() -> batchmodels.CloudPool: - return batch_client.pool.get(config.pool_id) - - -def get_node(node_id: str) -> batchmodels.ComputeNode: - return batch_client.compute_node.get(config.pool_id, node_id) - - -def list_nodes() -> List[batchmodels.ComputeNode]: - """ - List all the nodes in the pool. - """ - # TODO use continuation token & verify against current/target dedicated of - # pool - return batch_client.compute_node.list(config.pool_id) - - def setup_connection(): """ This setup spark config with which nodes are slaves and which are master """ - master_node_id = pick_master.get_master_node_id(batch_client.pool.get(config.pool_id)) - master_node = get_node(master_node_id) + master_node_id = utils.get_master_node_id(config.cluster_id) + master_node = utils.get_node(master_node_id) master_config_file = os.path.join(spark_conf_folder, "master") master_file = open(master_config_file, "w", encoding="UTF-8") @@ -66,13 +49,13 @@ def setup_connection(): def wait_for_master(): print("Waiting for master to be ready.") - master_node_id = pick_master.get_master_node_id(batch_client.pool.get(config.pool_id)) + master_node_id = utils.get_master_node_id(config.cluster_id) if master_node_id == config.node_id: return while True: - master_node = get_node(master_node_id) + master_node = utils.get_node(master_node_id) if master_node.state in [batchmodels.ComputeNodeState.idle, batchmodels.ComputeNodeState.running]: break @@ -82,7 +65,7 @@ def wait_for_master(): def start_spark_master(): - master_ip = get_node(config.node_id).ip_address + master_ip = utils.get_node(config.node_id).ip_address exe = os.path.join(spark_home, "sbin", "start-master.sh") cmd = [exe, "-h", master_ip, "--webui-port", str(config.spark_web_ui_port)] print("Starting master with '{0}'".format(" ".join(cmd))) @@ -97,8 +80,7 @@ def start_spark_master(): def start_spark_worker(): wait_for_master() exe = os.path.join(spark_home, "sbin", "start-slave.sh") - master_node_id = pick_master.get_master_node_id(batch_client.pool.get(config.pool_id)) - master_node = get_node(master_node_id) + master_node = utils.get_node(utils.get_master_node_id(config.cluster_id)) cmd = [exe, "spark://{0}:7077".format(master_node.ip_address), "--webui-port", str(config.spark_worker_ui_port)] print("Connecting to master with '{0}'".format(" ".join(cmd))) @@ -169,9 +151,9 @@ def copy_jars(): dest = os.path.join(spark_default_path_dest, jar) print("copy {} to {}".format(src, dest)) copyfile(src, dest) - except Exception as e: - print("Failed to copy jar files with error:") - print(e) + except Exception: + import traceback + print("Failed to copy jar files with error: {}".format(traceback.format_exc())) def parse_configuration_file(path_to_file: str): @@ -183,9 +165,9 @@ def parse_configuration_file(path_to_file: str): split = line.split() properties[split[0]] = split[1] return properties - except Exception as e: - print("Failed to parse configuration file:", path_to_file, "with error:") - print(e) + except 
Exception: + import traceback + log.info("Failed to parse configuration file: {} with error: {}".format(path_to_file, traceback.format_exc())) def start_history_server(): diff --git a/aztk/node_scripts/main.py b/aztk/node_scripts/main.py index 91914201..16b808b1 100644 --- a/aztk/node_scripts/main.py +++ b/aztk/node_scripts/main.py @@ -20,5 +20,4 @@ def run(): if __name__ == "__main__": - logger.setup_logging() run() diff --git a/aztk/node_scripts/scheduling/job_submission.py b/aztk/node_scripts/scheduling/job_submission.py index c6e24c09..ddcb804d 100644 --- a/aztk/node_scripts/scheduling/job_submission.py +++ b/aztk/node_scripts/scheduling/job_submission.py @@ -5,8 +5,7 @@ import azure.batch.models as batch_models import yaml -from aztk.node_scripts.core import config -from aztk.node_scripts.install.pick_master import get_master_node_id +from aztk.node_scripts.core import config, log, utils from aztk.node_scripts.scheduling import common, scheduling_target from aztk.spark.models import ApplicationState from aztk.utils import constants @@ -24,14 +23,13 @@ def read_downloaded_tasks(): try: tasks.append(yaml.load(stream)) except yaml.YAMLError as exc: - print(exc) + log.info(exc) return tasks def affinitize_task_to_master(batch_client, cluster_id, task): - pool = batch_client.pool.get(config.pool_id) - master_node_id = get_master_node_id(pool) - master_node = batch_client.compute_node.get(pool_id=cluster_id, node_id=master_node_id) + cluster = config.spark_client.cluster.get(id=cluster_id) + master_node = batch_client.compute_node.get(pool_id=cluster_id, node_id=cluster.master_node_id) task.affinity_info = batch_models.AffinityInformation(affinity_id=master_node.affinity_id) return task @@ -40,13 +38,9 @@ def schedule_tasks(tasks): """ Handle the request to submit a task """ - batch_client = config.batch_client for task in tasks: - # affinitize task to master - task = affinitize_task_to_master(batch_client, os.environ["AZ_BATCH_POOL_ID"], task) - # schedule the task - batch_client.task.add(job_id=os.environ["AZ_BATCH_JOB_ID"], task=task) + task = affinitize_task_to_master(config.batch_client, config.cluster_id, task) + config.batch_client.task.add(job_id=config.job_id, task=task) def select_scheduling_target_node(spark_cluster_operations, cluster_id, scheduling_target): @@ -61,7 +55,6 @@ def schedule_with_target(scheduling_target, task_sas_urls): for task_sas_url in task_sas_urls: task_definition = common.download_task_definition(task_sas_url) task_working_dir = "/mnt/aztk/startup/tasks/workitems/{}".format(task_definition.id) - aztk_cluster_id = os.environ.get("AZTK_CLUSTER_ID") task_cmd = ( r"source ~/.bashrc; " r"mkdir -p {0};" r"export AZ_BATCH_TASK_WORKING_DIR={0};" r"export STORAGE_LOGS_CONTAINER={1};" r"cd $AZ_BATCH_TASK_WORKING_DIR; " r'$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python $AZTK_WORKING_DIR/aztk/node_scripts/scheduling/submit.py "{2}" >> {3} 2>&1'.
- format(task_working_dir, aztk_cluster_id, task_sas_url, constants.SPARK_SUBMIT_LOGS_FILE)) - node_id = select_scheduling_target_node(config.spark_client.cluster, config.pool_id, scheduling_target) + format(task_working_dir, config.cluster_id, task_sas_url, constants.SPARK_SUBMIT_LOGS_FILE)) + node_id = select_scheduling_target_node(config.spark_client.cluster, config.cluster_id, scheduling_target) node_run_output = config.spark_client.cluster.node_run( - config.pool_id, node_id, task_cmd, timeout=120, block=False, internal=True) + config.cluster_id, node_id, task_cmd, timeout=120, block=False, internal=True) # block job_manager_task until scheduling_target task completion - wait_until_tasks_complete(aztk_cluster_id) + wait_until_tasks_complete(config.cluster_id) def wait_until_tasks_complete(id): @@ -83,6 +76,8 @@ applications = config.spark_client.job.list_applications(id=id) for application_id in applications: if not applications[application_id]: + log.info("not all applications completed: %s", application_id) + print("not all applications completed: {}".format(application_id)) time.sleep(3) config.spark_client.job.list_applications(id=id) break @@ -99,10 +94,10 @@ scheduling_target = None if scheduling_target: - print("scheduling with target") + log.info("scheduling with target") task_sas_urls = [task_sas_url for task_sas_url in sys.argv[2:]] schedule_with_target(scheduling_target, task_sas_urls) else: - print("scheduling with batch") + log.info("scheduling with batch") tasks = read_downloaded_tasks() schedule_tasks(tasks) diff --git a/aztk/node_scripts/wait_until_master_selected.py b/aztk/node_scripts/wait_until_master_selected.py index 2ecd6864..ba06b54c 100644 --- a/aztk/node_scripts/wait_until_master_selected.py +++ b/aztk/node_scripts/wait_until_master_selected.py @@ -6,12 +6,11 @@ def main(): while master is None: try: - from aztk.node_scripts.core import config - from aztk.node_scripts.install.pick_master import get_master_node_id + from aztk.node_scripts.core import config, utils batch_client = config.batch_client pool = batch_client.pool.get(config.pool_id) - master = get_master_node_id(pool) + master = utils.get_master_node_id(config.cluster_id) time.sleep(1) except Exception as e: diff --git a/aztk/spark/client/cluster/helpers/delete.py b/aztk/spark/client/cluster/helpers/delete.py index be3a775c..436e28f3 100644 --- a/aztk/spark/client/cluster/helpers/delete.py +++ b/aztk/spark/client/cluster/helpers/delete.py @@ -3,4 +3,4 @@ def delete_cluster(core_cluster_operations, cluster_id: str, keep_logs: bool = False): with batch_error_manager(): - return core_cluster_operations.delete(cluster_id, keep_logs) + return core_cluster_operations.delete_batch_resources(cluster_id, keep_logs) diff --git a/aztk/spark/client/cluster/helpers/submit.py b/aztk/spark/client/cluster/helpers/submit.py index 4d969faa..10e6c91b 100644 --- a/aztk/spark/client/cluster/helpers/submit.py +++ b/aztk/spark/client/cluster/helpers/submit.py @@ -15,7 +15,7 @@ def affinitize_task_to_master(core_cluster_operations, spark_cluster_operations, if cluster.master_node_id is None: raise AztkError("Master has not yet been selected.
Please wait until the cluster is finished provisioning.") master_node = core_cluster_operations.batch_client.compute_node.get( - pool_id=cluster_id, node_id=cluster.master_node_id) + pool_id=cluster.pool.id, node_id=cluster.master_node_id) task.affinity_info = batch_models.AffinityInformation(affinity_id=master_node.affinity_id) return task diff --git a/aztk/spark/client/job/helpers/delete.py b/aztk/spark/client/job/helpers/delete.py index 84b88982..1131920b 100644 --- a/aztk/spark/client/job/helpers/delete.py +++ b/aztk/spark/client/job/helpers/delete.py @@ -1,34 +1,6 @@ -from azure.batch.models import BatchErrorException -from msrest.exceptions import ClientRequestError +from aztk.utils import batch_error_manager -from aztk import error -from aztk.utils import BackOffPolicy, helpers, retry - -def _delete(core_job_operations, spark_job_operations, job_id, keep_logs: bool = False): - deleted_job = False - - # delete batch job - try: - core_job_operations.batch_client.job.delete(job_id) - deleted_job = True - except BatchErrorException: - pass - - # delete storage container - if keep_logs: - cluster_data = core_job_operations.get_cluster_data(job_id) - cluster_data.delete_container(job_id) - - table_exists = core_job_operations.table_service.exists(job_id) - if table_exists: - core_job_operations.delete_task_table(job_id) - - return deleted_job - - -@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) -def delete(core_job_operations, spark_job_operations, job_id: str, keep_logs: bool = False): - from aztk.utils import batch_error_manager +def delete(core_job_operations, job_id: str, keep_logs: bool = False): with batch_error_manager(): - return _delete(core_job_operations, spark_job_operations, job_id, keep_logs) + return core_job_operations.delete_batch_resources(job_id, keep_logs) diff --git a/aztk/spark/client/job/helpers/list_applications.py b/aztk/spark/client/job/helpers/list_applications.py index aaab7833..4b579e9c 100644 --- a/aztk/spark/client/job/helpers/list_applications.py +++ b/aztk/spark/client/job/helpers/list_applications.py @@ -1,4 +1,5 @@ from aztk.spark import models +from aztk.utils import batch_error_manager def _list_applications(core_job_operations, job_id): @@ -22,7 +23,6 @@ def _list_applications(core_job_operations, job_id): # currently, it returns a dictionary indicating whether # a task has been scheduled or not def list_applications(core_job_operations, job_id): - from aztk.utils import batch_error_manager with batch_error_manager(): applications = _list_applications(core_job_operations, job_id) for item in applications: diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index 861229f6..4b12126f 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -35,7 +35,7 @@ def delete(self, id, keep_logs: bool = False): Returns: :obj:`bool`: True if the deletion process was successful. """ - return delete.delete(self._core_job_operations, self, id, keep_logs) + return delete.delete(self._core_job_operations, id, keep_logs) def get(self, id): """Get details about the state of a job. diff --git a/aztk_cli/entrypoint.py b/aztk_cli/entrypoint.py index c1588587..ec24ccf8 100644 --- a/aztk_cli/entrypoint.py +++ b/aztk_cli/entrypoint.py @@ -5,6 +5,7 @@ pip install -e . 
""" import argparse +import traceback import warnings from typing import NamedTuple @@ -44,8 +45,10 @@ def main(): run_software(args) except BatchErrorException as e: utils.print_batch_exception(e) + log.debug(traceback.format_exc()) except aztk.error.AztkError as e: log.error(str(e)) + log.debug(traceback.format_exc()) def setup_common_args(parser: argparse.ArgumentParser): diff --git a/aztk_cli/utils.py b/aztk_cli/utils.py index e3d6521b..40d16682 100644 --- a/aztk_cli/utils.py +++ b/aztk_cli/utils.py @@ -237,7 +237,7 @@ def print_batch_exception(batch_exception): for mesg in batch_exception.error.values: log.error("%s:\t%s", mesg.key, mesg.value) log.error("-------------------------------------------") - + def print_jobs(jobs: List[models.Job]): print_format = "{:<34}| {:<10}| {:<20}" From cc2e072def798052ecd27bc1f88722381d45adaa Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Thu, 8 Nov 2018 10:58:30 -0800 Subject: [PATCH 19/28] remove some debug code, add Active application state --- aztk/client/job/helpers/submit.py | 2 +- aztk/node_scripts/scheduling/job_submission.py | 2 -- aztk/node_scripts/scheduling/submit.py | 4 ++-- aztk/spark/models/models.py | 1 + 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/aztk/client/job/helpers/submit.py b/aztk/client/job/helpers/submit.py index c1ec63df..71ba29db 100644 --- a/aztk/client/job/helpers/submit.py +++ b/aztk/client/job/helpers/submit.py @@ -35,7 +35,7 @@ def submit_job( job_manager_task=job_manager_task, vm_size=job_configuration.vm_size, vm_image_model=vm_image_model, - on_all_tasks_complete=batch_models.OnAllTasksComplete.no_action, + on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job, mixed_mode=job_configuration.mixed_mode, software_metadata_key=software_metadata_key, mode_metadata_key=constants.AZTK_JOB_MODE_METADATA, diff --git a/aztk/node_scripts/scheduling/job_submission.py b/aztk/node_scripts/scheduling/job_submission.py index ddcb804d..4c604c64 100644 --- a/aztk/node_scripts/scheduling/job_submission.py +++ b/aztk/node_scripts/scheduling/job_submission.py @@ -76,8 +76,6 @@ def wait_until_tasks_complete(id): applications = config.spark_client.job.list_applications(id=id) for application_id in applications: if not applications[application_id]: - log.info("not all applications completed: %s", application_id) - print("not all applications completed: {}".format(application_id)) time.sleep(3) config.spark_client.job.list_applications(id=id) break diff --git a/aztk/node_scripts/scheduling/submit.py b/aztk/node_scripts/scheduling/submit.py index 865a6239..740a9c45 100644 --- a/aztk/node_scripts/scheduling/submit.py +++ b/aztk/node_scripts/scheduling/submit.py @@ -82,10 +82,10 @@ def ssh_submit(task_sas_url): task = scheduling_target.insert_task_into_task_table(aztk_cluster_id, task_definition) # run task and upload log exit_code = common.run_command(config.spark_client, cmd.to_str(), application.name) - log("completed application, updating storage table") + log.info("completed application, updating storage table") scheduling_target.mark_task_complete(aztk_cluster_id, task.id, exit_code) except Exception as e: - log("application failed, updating storage table") + log.info("application failed, updating storage table") import traceback scheduling_target.mark_task_failure(aztk_cluster_id, task_definition.id, exit_code, traceback.format_exc()) diff --git a/aztk/spark/models/models.py b/aztk/spark/models/models.py index 0db7c9c0..410be81c 100644 --- a/aztk/spark/models/models.py +++ b/aztk/spark/models/models.py 
@@ -154,6 +154,7 @@ def __init__( class ApplicationState(Enum): + Active = "active" #TODO Running = "running" Completed = "completed" Failed = "failed" From 87c4deafa18fee8bec3c0486433587265a93e3bf Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Thu, 8 Nov 2018 17:05:23 -0800 Subject: [PATCH 20/28] fix calls to use cluster_id instead of pool_id --- aztk/client/base/base_operations.py | 15 ++++++- aztk/client/base/helpers/get_node.py | 7 ++++ .../base/helpers/get_remote_login_settings.py | 11 ++--- aztk/client/base/helpers/get_task.py | 2 +- aztk/client/base/helpers/node_run.py | 2 +- aztk/client/base/helpers/run.py | 2 +- aztk/client/cluster/helpers/copy.py | 2 +- aztk/client/cluster/helpers/get.py | 2 +- aztk/client/cluster/helpers/list.py | 17 +++++--- aztk/models/cluster.py | 4 +- aztk/spark/client/cluster/helpers/create.py | 4 +- .../client/job/helpers/get_application_log.py | 2 +- aztk/spark/client/job/helpers/submit.py | 5 +-- aztk/spark/models/models.py | 4 +- aztk/spark/utils/util.py | 41 ------------------- aztk/spark/utils/wait_for_master.py | 36 ++++++++++++++++ aztk/utils/batch_error_manager.py | 8 +++- aztk/utils/helpers.py | 14 ------- .../spark/sdk/clean_up_cluster.py | 19 ++------- .../spark/sdk/cluster/test_cluster.py | 7 ++-- .../integration_tests/spark/sdk/get_client.py | 2 +- 21 files changed, 101 insertions(+), 105 deletions(-) create mode 100644 aztk/client/base/helpers/get_node.py delete mode 100644 aztk/spark/utils/util.py create mode 100644 aztk/spark/utils/wait_for_master.py diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index f500f1df..393dd999 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -3,8 +3,8 @@ from .helpers import (create_batch_resources, create_user_on_cluster, create_user_on_node, delete_batch_resources, delete_user_on_cluster, delete_user_on_node, generate_user_on_cluster, generate_user_on_node, - get_application_log, get_remote_login_settings, get_task, get_task_state, list_tasks, node_run, - run, ssh_into_node, task_table) + get_application_log, get_node, get_remote_login_settings, get_task, get_task_state, list_tasks, + node_run, run, ssh_into_node, task_table) class BaseOperations: @@ -351,3 +351,14 @@ def get_task(self, id: str, task_id: str): :obj:`[aztk.models.Task]`: the submitted task with id task_id """ return get_task.get_task(self, id, task_id) + + def get_node(self, id: str, node_id: str): + """Get a node in a cluster + Args: + id (:obj:`str`): the id of the cluster + node_id (:obj:`str`): the id of the node + + Returns: + :obj:`[azure.batch.models.ComputeNode]`: the requested node + """ + return get_node.get_node(self, id, node_id) diff --git a/aztk/client/base/helpers/get_node.py b/aztk/client/base/helpers/get_node.py new file mode 100644 index 00000000..a95692d5 --- /dev/null +++ b/aztk/client/base/helpers/get_node.py @@ -0,0 +1,7 @@ +from aztk.utils import batch_error_manager + + +def get_node(core_base_operations, cluster_id, node_id): + with batch_error_manager(): + cluster = core_base_operations.get(cluster_id) + return core_base_operations.batch_client.compute_node.get(cluster.pool.id, node_id) diff --git a/aztk/client/base/helpers/get_remote_login_settings.py b/aztk/client/base/helpers/get_remote_login_settings.py index f24e2004..21baae93 100644 --- a/aztk/client/base/helpers/get_remote_login_settings.py +++ b/aztk/client/base/helpers/get_remote_login_settings.py @@ -2,17 +2,18 @@ from aztk.utils import batch_error_manager -def 
_get_remote_login_settings(base_client, pool_id: str, node_id: str): +def _get_remote_login_settings(core_base_operations, cluster_id: str, node_id: str): """ Get the remote_login_settings for node - :param pool_id + :param cluster_id :param node_id :returns aztk.models.RemoteLogin """ - result = base_client.batch_client.compute_node.get_remote_login_settings(pool_id, node_id) + cluster = core_base_operations.get(cluster_id) + result = core_base_operations.batch_client.compute_node.get_remote_login_settings(cluster.pool.id, node_id) return models.RemoteLogin(ip_address=result.remote_login_ip_address, port=str(result.remote_login_port)) -def get_remote_login_settings(base_client, cluster_id: str, node_id: str): +def get_remote_login_settings(core_base_operations, cluster_id: str, node_id: str): with batch_error_manager(): - return _get_remote_login_settings(base_client, cluster_id, node_id) + return _get_remote_login_settings(core_base_operations, cluster_id, node_id) diff --git a/aztk/client/base/helpers/get_task.py b/aztk/client/base/helpers/get_task.py index a14a701f..bb1efbd8 100644 --- a/aztk/client/base/helpers/get_task.py +++ b/aztk/client/base/helpers/get_task.py @@ -4,7 +4,7 @@ def get_task(core_base_operations, id, task_id): - """List all tasks on a job or cluster + """Get a task submitted to a job or cluster This will work for both Batch scheduling and scheduling_target diff --git a/aztk/client/base/helpers/node_run.py b/aztk/client/base/helpers/node_run.py index b8c92bfa..e9bb1216 100644 --- a/aztk/client/base/helpers/node_run.py +++ b/aztk/client/base/helpers/node_run.py @@ -11,7 +11,7 @@ def node_run(base_client, cluster_id, node_id, command, internal, container_name if internal: node_rls = models.RemoteLogin(ip_address=node.ip_address, port="22") else: - node_rls = base_client.get_remote_login_settings(cluster.pool.id, node.id) + node_rls = base_client.get_remote_login_settings(cluster_id, node.id) try: generated_username, ssh_key = base_client.generate_user_on_node(cluster.pool.id, node.id) output = ssh_lib.node_exec_command( diff --git a/aztk/client/base/helpers/run.py b/aztk/client/base/helpers/run.py index 3157e26b..f234b70a 100644 --- a/aztk/client/base/helpers/run.py +++ b/aztk/client/base/helpers/run.py @@ -11,7 +11,7 @@ def cluster_run(base_operations, cluster_id, command, internal, container_name=N if internal: cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] else: - cluster_nodes = [(node, base_operations.get_remote_login_settings(pool.id, node.id)) for node in nodes] + cluster_nodes = [(node, base_operations.get_remote_login_settings(cluster_id, node.id)) for node in nodes] with batch_error_manager(): generated_username, ssh_key = base_operations.generate_user_on_cluster(pool.id, nodes) diff --git a/aztk/client/cluster/helpers/copy.py b/aztk/client/cluster/helpers/copy.py index 7b1dc595..31f3e47c 100644 --- a/aztk/client/cluster/helpers/copy.py +++ b/aztk/client/cluster/helpers/copy.py @@ -22,7 +22,7 @@ def cluster_copy( if internal: cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] else: - cluster_nodes = [(node, cluster_operations.get_remote_login_settings(pool.id, node.id)) for node in nodes] + cluster_nodes = [(node, cluster_operations.get_remote_login_settings(cluster_id, node.id)) for node in nodes] with batch_error_manager(): diff --git a/aztk/client/cluster/helpers/get.py b/aztk/client/cluster/helpers/get.py index 22cb26ad..d8762921 100644 --- 
a/aztk/client/cluster/helpers/get.py +++ b/aztk/client/cluster/helpers/get.py @@ -16,4 +16,4 @@ def get_pool_details(core_cluster_operations, cluster_id: str): pool_id = convert_job_id_to_pool_id(core_cluster_operations.batch_client, cluster_id) pool = core_cluster_operations.batch_client.pool.get(pool_id) nodes = core_cluster_operations.batch_client.compute_node.list(pool_id=pool_id) - return models.Cluster(pool, nodes) + return models.Cluster(cluster_id, pool, nodes) diff --git a/aztk/client/cluster/helpers/list.py b/aztk/client/cluster/helpers/list.py index b439d93e..b1e06731 100644 --- a/aztk/client/cluster/helpers/list.py +++ b/aztk/client/cluster/helpers/list.py @@ -2,17 +2,22 @@ from aztk.utils import constants -def list_clusters(cluster_client, software_metadata_key): +def job_get_pool(core_cluster_operations, job): + if job.execution_info and job.execution_info.pool_id: + return core_cluster_operations.batch_client.pool.get(job.execution_info.pool_id) + +def list_clusters(core_cluster_operations, software_metadata_key): """ List all the cluster on your account. """ - pools = cluster_client.batch_client.pool.list() + jobs = core_cluster_operations.batch_client.job.list() software_metadata = (constants.AZTK_SOFTWARE_METADATA_KEY, software_metadata_key) cluster_metadata = (constants.AZTK_MODE_METADATA_KEY, constants.AZTK_CLUSTER_MODE_METADATA) aztk_clusters = [] - for pool in [pool for pool in pools if pool.metadata]: - pool_metadata = [(metadata.name, metadata.value) for metadata in pool.metadata] - if all([metadata in pool_metadata for metadata in [software_metadata, cluster_metadata]]): - aztk_clusters.append(models.Cluster(pool)) + for job in jobs: + if job.metadata: + job_metadata = [(metadata.name, metadata.value) for metadata in job.metadata] + if all([metadata in job_metadata for metadata in [software_metadata, cluster_metadata]]): + aztk_clusters.append(models.Cluster(job.id, job_get_pool(core_cluster_operations, job))) return aztk_clusters diff --git a/aztk/models/cluster.py b/aztk/models/cluster.py index b4699508..120a7147 100644 --- a/aztk/models/cluster.py +++ b/aztk/models/cluster.py @@ -4,8 +4,8 @@ class Cluster: - def __init__(self, pool: batch_models.CloudPool, nodes: batch_models.ComputeNodePaged = None): - self.id = pool.id + def __init__(self, id, pool: batch_models.CloudPool, nodes: batch_models.ComputeNodePaged = None): + self.id = id self.pool = pool self.nodes = nodes self.vm_size = pool.vm_size diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py index da204f0e..605fb3ae 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -3,7 +3,7 @@ from aztk import models as base_models from aztk.internal.cluster_data import NodeData from aztk.spark import models -from aztk.spark.utils import constants, util +from aztk.spark.utils import constants, wait_for_master from aztk.utils import batch_error_manager POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity( @@ -65,7 +65,7 @@ def create_cluster(core_cluster_operations, # Wait for the master to be ready if wait: - util.wait_for_master_to_be_ready(core_cluster_operations, spark_cluster_operations, cluster.id) + wait_for_master.wait_for_master(core_cluster_operations, spark_cluster_operations, cluster.id) cluster = spark_cluster_operations.get(cluster.id) return cluster diff --git a/aztk/spark/client/job/helpers/get_application_log.py b/aztk/spark/client/job/helpers/get_application_log.py index 538749e2..faba7cd9 
100644 --- a/aztk/spark/client/job/helpers/get_application_log.py +++ b/aztk/spark/client/job/helpers/get_application_log.py @@ -11,7 +11,7 @@ def _get_application_log(core_job_operations, spark_job_operations, job_id, appl return core_job_operations.get_application_log(job_id, application_name) try: - task = core_job_operations.get_batch_task(id=job_id, task_id=application_name) + task = core_job_operations.get_task(id=job_id, task_id=application_name) except batch_models.BatchErrorException as e: # task may not exist since it may not yet be scheduled # see if the task is written to metadata of pool diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py index 084ad1c2..781181cb 100644 --- a/aztk/spark/client/job/helpers/submit.py +++ b/aztk/spark/client/job/helpers/submit.py @@ -1,6 +1,7 @@ import azure.batch.models as batch_models import yaml +import aztk.spark.utils.constants from aztk import models as base_models from aztk.internal.cluster_data import NodeData from aztk.spark import models @@ -105,8 +106,6 @@ def submit_job(core_job_operations, software_metadata_key = base_models.Software.spark - vm_image = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04") - autoscale_formula = "$TargetDedicatedNodes = {0}; " "$TargetLowPriorityNodes = {1}".format( job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes) @@ -116,7 +115,7 @@ def submit_job(core_job_operations, job_manager_task=job_manager_task, autoscale_formula=autoscale_formula, software_metadata_key=software_metadata_key, - vm_image_model=vm_image, + vm_image_model=aztk.spark.utils.constants.SPARK_VM_IMAGE, application_metadata="\n".join(application.name for application in (job_configuration.applications or [])), ) diff --git a/aztk/spark/models/models.py b/aztk/spark/models/models.py index 410be81c..91bf4646 100644 --- a/aztk/spark/models/models.py +++ b/aztk/spark/models/models.py @@ -19,7 +19,7 @@ def __init__(self, version: str, environment: str = None, environment_version: s class Cluster(aztk.models.Cluster): def __init__(self, cluster: aztk.models.Cluster): - super().__init__(cluster.pool, cluster.nodes) + super().__init__(cluster.id, cluster.pool, cluster.nodes) self.master_node_id = self.__get_master_node_id() self.gpu_enabled = helpers.is_gpu_enabled(cluster.pool.vm_size) @@ -274,7 +274,7 @@ def __init__( self.creation_time = cloud_job.creation_time self.applications = [Application(task) for task in (tasks or [])] if pool: - self.cluster = Cluster(aztk.models.Cluster(pool, nodes)) + self.cluster = Cluster(aztk.models.Cluster(cloud_job.id, pool, nodes)) else: self.cluster = None diff --git a/aztk/spark/utils/util.py b/aztk/spark/utils/util.py deleted file mode 100644 index ec03be92..00000000 --- a/aztk/spark/utils/util.py +++ /dev/null @@ -1,41 +0,0 @@ -from __future__ import print_function - -import datetime -import time - -import azure.batch.models as batch_models - -from aztk.utils import constants - - -class MasterInvalidStateError(Exception): - pass - - -def wait_for_master_to_be_ready(core_operations, spark_operations, cluster_id: str): - - master_node_id = None - start_time = datetime.datetime.now() - while True: - if not master_node_id: - master_node_id = spark_operations.get(cluster_id).master_node_id - if not master_node_id: - time.sleep(5) - continue - - master_node = core_operations.batch_client.compute_node.get(cluster_id, master_node_id) - - if master_node.state in [batch_models.ComputeNodeState.idle, 
batch_models.ComputeNodeState.running]: - break - elif master_node.state is batch_models.ComputeNodeState.start_task_failed: - raise MasterInvalidStateError("Start task failed on master") - elif master_node.state in [batch_models.ComputeNodeState.unknown, batch_models.ComputeNodeState.unusable]: - raise MasterInvalidStateError("Master is in an invalid state") - else: - now = datetime.datetime.now() - - delta = now - start_time - if delta.total_seconds() > constants.WAIT_FOR_MASTER_TIMEOUT: - raise MasterInvalidStateError("Master didn't become ready before timeout.") - - time.sleep(10) diff --git a/aztk/spark/utils/wait_for_master.py b/aztk/spark/utils/wait_for_master.py new file mode 100644 index 00000000..5c4cd5bb --- /dev/null +++ b/aztk/spark/utils/wait_for_master.py @@ -0,0 +1,36 @@ +from __future__ import print_function + +import datetime +import time + +import azure.batch.models as batch_models + +from aztk.utils import constants + + +class MasterInvalidStateError(Exception): + pass + + +def wait_for_master(core_operations, spark_operations, cluster_id: str): + cluster = None + master_node = None + start_time = datetime.datetime.now() + while True: + delta = datetime.datetime.now() - start_time + if delta.total_seconds() > constants.WAIT_FOR_MASTER_TIMEOUT: + raise MasterInvalidStateError("Master didn't become ready before timeout.") + + cluster = spark_operations.get(cluster_id) + + if cluster.master_node_id: + master_node = core_operations.get_node(cluster_id, cluster.master_node_id) + if master_node: + if master_node.state in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: + break + if master_node.state is batch_models.ComputeNodeState.start_task_failed: + raise MasterInvalidStateError("Start task failed on master") + elif master_node.state in [batch_models.ComputeNodeState.unknown, batch_models.ComputeNodeState.unusable]: + raise MasterInvalidStateError("Master is in an invalid state") + + time.sleep(5) diff --git a/aztk/utils/batch_error_manager.py b/aztk/utils/batch_error_manager.py index 3ae0793f..d2732d0d 100644 --- a/aztk/utils/batch_error_manager.py +++ b/aztk/utils/batch_error_manager.py @@ -1,14 +1,18 @@ +import traceback from contextlib import contextmanager from azure.batch.models import BatchErrorException from aztk import error -from aztk.utils import constants, helpers +from aztk.utils import helpers @contextmanager -def batch_error_manager(): +def batch_error_manager(verbose=False): try: yield except BatchErrorException as e: + if verbose: + # TODO: change to log.debug + print(traceback.format_exc()) raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/utils/helpers.py b/aztk/utils/helpers.py index 184d08ac..56ae31de 100644 --- a/aztk/utils/helpers.py +++ b/aztk/utils/helpers.py @@ -259,20 +259,6 @@ def wrap_commands_in_shell(commands): return "/bin/bash -c 'set -e -o pipefail; {};'".format(";".join(commands)) -def get_connection_info(pool_id, node_id, batch_client): - """ - Get connection info of specified node in pool - :param batch_client: The batch client to use. 
- :type batch_client: `batchserviceclient.BatchServiceClient` - :param str pool_id: The pool id to look up - :param str node_id: The node id to look up - """ - rls = batch_client.compute_node.get_remote_login_settings(pool_id, node_id) - remote_ip = rls.remote_login_ip_address - ssh_port = str(rls.remote_login_port) - return (remote_ip, ssh_port) - - def get_cluster_total_target_nodes(pool): """ Get the total number of target nodes (dedicated + low pri) for the pool diff --git a/tests/integration_tests/spark/sdk/clean_up_cluster.py b/tests/integration_tests/spark/sdk/clean_up_cluster.py index 275028e0..5f48f49a 100644 --- a/tests/integration_tests/spark/sdk/clean_up_cluster.py +++ b/tests/integration_tests/spark/sdk/clean_up_cluster.py @@ -1,5 +1,5 @@ import azure.batch.models as batch_models -from azure.batch.models import BatchErrorException +from azure.batch.models import BatchErrorException, ComputeNodeState from aztk.error import AztkError @@ -8,19 +8,8 @@ def clean_up_cluster(spark_client, id): try: cluster = spark_client.cluster.get(id) nodes = [node for node in cluster.nodes] - if not any([ - node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed] - for node in nodes - ]): + dont_delete_states = [ComputeNodeState.unusable, ComputeNodeState.start_task_failed] + if not any([node.state in dont_delete_states for node in nodes]): spark_client.cluster.delete(id=id) except (BatchErrorException, AztkError) as e: - # pass in the event that the cluster does not exist - print(str(e)) - acceptable_failures = [ - "The specified job has been marked for deletion and is being garbage collected.", - "The specified pool has been marked for deletion and is being reclaimed." - ] - if any(item in str(e) for item in acceptable_failures): - pass - else: - raise e + pass diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster.py b/tests/integration_tests/spark/sdk/cluster/test_cluster.py index bc21cdba..f9b47a68 100644 --- a/tests/integration_tests/spark/sdk/cluster/test_cluster.py +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster.py @@ -73,7 +73,7 @@ def test_list_clusters(): def test_get_remote_login_settings(): - test_id = "get-remote-login-" + test_id = "remote-login-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -187,7 +187,7 @@ def test_create_user_ssh_key(): def test_get_application_state_complete(): - test_id = "app-status-complete-" + test_id = "app-status-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -214,7 +214,6 @@ def test_get_application_state_complete(): max_retry_count=None) try: spark_client.cluster.create(cluster_configuration, wait=True) - spark_client.cluster.submit( id=cluster_configuration.cluster_id, application=application_configuration, wait=True) state = spark_client.cluster.get_application_state( @@ -249,7 +248,7 @@ def test_delete_cluster(): def test_spark_processes_up(): - test_id = "spark-processes-up-" + test_id = "spark-up-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, diff --git a/tests/integration_tests/spark/sdk/get_client.py b/tests/integration_tests/spark/sdk/get_client.py index 1559e638..a39451c3 100644 --- a/tests/integration_tests/spark/sdk/get_client.py +++ b/tests/integration_tests/spark/sdk/get_client.py @@ -42,6 +42,6 @@ def get_spark_client(): def get_test_suffix(prefix: str): # base cluster 
name
     dt = datetime.now()
-    current_time = dt.microsecond
+    current_time = str(dt.microsecond)[:4]
     base_cluster_id = "{0}-{1}".format(prefix, current_time)
     return base_cluster_id

From b9c1cb26598c2176ba2b84f4ec22f9692ff71bc0 Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Thu, 8 Nov 2018 17:49:10 -0800
Subject: [PATCH 21/28] remove unused code, pass correct id, add base job get stub

---
 .../base/helpers/get_application_log.py       |  4 -
 aztk/client/job/helpers/get.py                |  1 +
 .../node_scripts/scheduling/job_submission.py |  8 +-
 aztk/utils/helpers.py                         | 88 -------------------
 4 files changed, 5 insertions(+), 96 deletions(-)
 create mode 100644 aztk/client/job/helpers/get.py

diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py
index 3174f027..a3b13682 100644
--- a/aztk/client/base/helpers/get_application_log.py
+++ b/aztk/client/base/helpers/get_application_log.py
@@ -47,7 +47,6 @@ def download_callback(current, total):
         stream.seek(0)
         return blob
     except azure.common.AzureMissingResourceHttpError:
-        raise
         raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.")
     except azure.common.AzureHttpError as e:
         if e.error_code in ["InvalidRange"]:
@@ -116,15 +115,12 @@ def stream_log_from_storage(base_operations, container_name, application_name, t
 
 
 def get_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
-    cluster_configuration = base_operations.get_cluster_configuration(cluster_id)
     task = wait_for_task(base_operations, cluster_id, application_name)
 
     return get_log_from_storage(base_operations.blob_client, cluster_id, application_name, task, current_bytes)
 
 
 def stream_log(base_operations, cluster_id: str, application_name: str):
-    print("running stream_log")
-    cluster_configuration = base_operations.get_cluster_configuration(cluster_id)
     task = wait_for_task(base_operations, cluster_id, application_name)
     return stream_log_from_storage(base_operations, cluster_id, application_name, task)
 
diff --git a/aztk/client/job/helpers/get.py b/aztk/client/job/helpers/get.py
new file mode 100644
index 00000000..ebe9f1c4
--- /dev/null
+++ b/aztk/client/job/helpers/get.py
@@ -0,0 +1 @@
+# TODO: pass
\ No newline at end of file
diff --git a/aztk/node_scripts/scheduling/job_submission.py b/aztk/node_scripts/scheduling/job_submission.py
index 4c604c64..3556618b 100644
--- a/aztk/node_scripts/scheduling/job_submission.py
+++ b/aztk/node_scripts/scheduling/job_submission.py
@@ -27,9 +27,9 @@ def read_downloaded_tasks():
     return tasks
 
 
-def affinitize_task_to_master(batch_client, cluster_id, task):
-    cluster = config.spark_client.cluster.get_cluster(id=cluster_id)
-    master_node = batch_client.compute_node.get(pool_id=cluster_id, node_id=cluster.master_node_id)
+def affinitize_task_to_master(cluster_id, task):
+    cluster = config.spark_client.cluster.get(id=cluster_id)
+    master_node = config.batch_client.compute_node.get(id=cluster.pool.id, node_id=cluster.master_node_id)
     task.affinity_info = batch_models.AffinityInformation(affinity_id=master_node.affinity_id)
     return task
 
@@ -39,7 +39,7 @@ def schedule_tasks(tasks):
     Handle the request to submit a task
     """
     for task in tasks:
-        task = affinitize_task_to_master(config.batch_client, config.cluster_id, task)
+        task = affinitize_task_to_master(config.cluster_id, task)
         config.batch_client.task.add(job_id=config.job_id, task=task)
 
 
diff --git a/aztk/utils/helpers.py b/aztk/utils/helpers.py
index 56ae31de..961dd190 100644
---
a/aztk/utils/helpers.py +++ b/aztk/utils/helpers.py @@ -123,25 +123,6 @@ def upload_file_to_container(container_name, return batch_models.ResourceFile(file_path=node_path, blob_source=sas_url) -def create_pool_if_not_exist(pool, batch_client): - """ - Creates the specified pool if it doesn't already exist - :param batch_client: The batch client to use. - :type batch_client: `batchserviceclient.BatchServiceClient` - :param pool: The pool to create. - :type pool: `batchserviceclient.models.PoolAddParameter` - """ - try: - batch_client.pool.add(pool) - except batch_models.BatchErrorException as e: - if e.error.code == "PoolExists": - raise error.AztkError( - "A cluster with the same id already exists. Use a different id or delete the existing cluster") - else: - raise - return True - - def wait_for_all_nodes_state(pool, node_state, batch_client): """ Waits for all nodes in pool to reach any specified state in set @@ -193,61 +174,6 @@ def select_latest_verified_vm_image_with_node_agent_sku(publisher, offer, sku_st return (sku_to_use.id, image_ref_to_use) -def create_sas_token(container_name, blob_name, permission, blob_client, expiry=None, timeout=None): - """ - Create a blob sas token - :param blob_client: The storage block blob client to use. - :type blob_client: `azure.storage.blob.BlockBlobService` - :param str container_name: The name of the container to upload the blob to. - :param str blob_name: The name of the blob to upload the local file to. - :param expiry: The SAS expiry time. - :type expiry: `datetime.datetime` - :param int timeout: timeout in minutes from now for expiry, - will only be used if expiry is not specified - :return: A SAS token - :rtype: str - """ - if expiry is None: - if timeout is None: - timeout = 30 - expiry = datetime.datetime.utcnow() + datetime.timedelta(minutes=timeout) - return blob_client.generate_blob_shared_access_signature( - container_name, blob_name, permission=permission, expiry=expiry) - - -def upload_blob_and_create_sas(container_name, blob_name, file_name, expiry, blob_client, timeout=None): - """ - Uploads a file from local disk to Azure Storage and creates a SAS for it. - :param blob_client: The storage block blob client to use. - :type blob_client: `azure.storage.blob.BlockBlobService` - :param str container_name: The name of the container to upload the blob to. - :param str blob_name: The name of the blob to upload the local file to. - :param str file_name: The name of the local file to upload. - :param expiry: The SAS expiry time. - :type expiry: `datetime.datetime` - :param int timeout: timeout in minutes from now for expiry, - will only be used if expiry is not specified - :return: A SAS URL to the blob with the specified expiry time. 
-    :rtype: str
-    """
-    blob_client.create_container(container_name, fail_on_exist=False)
-
-    blob_client.create_blob_from_path(container_name, blob_name, file_name)
-
-    sas_token = create_sas_token(
-        container_name,
-        blob_name,
-        permission=blob.BlobPermissions.READ,
-        blob_client=None,
-        expiry=expiry,
-        timeout=timeout,
-    )
-
-    sas_url = blob_client.make_blob_url(container_name, blob_name, sas_token=sas_token)
-
-    return sas_url
-
-
 def wrap_commands_in_shell(commands):
     """
     Wrap commands in a shell
@@ -259,20 +185,6 @@ def wrap_commands_in_shell(commands):
     return "/bin/bash -c 'set -e -o pipefail; {};'".format(";".join(commands))
 
 
-def get_cluster_total_target_nodes(pool):
-    """
-    Get the total number of target nodes (dedicated + low pri) for the pool
-    """
-    return pool.target_dedicated_nodes + pool.target_low_priority_nodes
-
-
-def get_cluster_total_current_nodes(pool):
-    """
-    Get the total number of current nodes (dedicated + low pri) in the pool
-    """
-    return pool.current_dedicated_nodes + pool.current_low_priority_nodes
-
-
 def normalize_path(path: str) -> str:
     """
     Convert a path in a path that will work well with blob storage and unix

From 57a15474beeef0b51f5450d79f94d5b9c18b7ba8 Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Fri, 9 Nov 2018 12:56:35 -0800
Subject: [PATCH 22/28] fix some issues with job submission

---
 aztk/node_scripts/install/install.py           | 2 +-
 aztk/node_scripts/scheduling/job_submission.py | 5 +++--
 aztk/spark/client/job/helpers/submit.py        | 2 +-
 docs/14-azure-files.md                         | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/aztk/node_scripts/install/install.py b/aztk/node_scripts/install/install.py
index 017889ea..ca262934 100644
--- a/aztk/node_scripts/install/install.py
+++ b/aztk/node_scripts/install/install.py
@@ -33,7 +33,7 @@ def setup_host(docker_repo: str, docker_run_options: str):
         master_node = config.batch_client.compute_node.get(config.pool_id, cluster.master_node_id)
 
     os.environ["AZTK_IS_MASTER"] = "true" if is_master else "false"
-    os.environ["AZTK_IS_WORKER"] = "true" if is_worker else "true"
+    os.environ["AZTK_IS_WORKER"] = "true" if is_worker else "false"
 
     os.environ["AZTK_MASTER_IP"] = master_node.ip_address
 
diff --git a/aztk/node_scripts/scheduling/job_submission.py b/aztk/node_scripts/scheduling/job_submission.py
index 3556618b..d528a7ac 100644
--- a/aztk/node_scripts/scheduling/job_submission.py
+++ b/aztk/node_scripts/scheduling/job_submission.py
@@ -29,7 +29,8 @@ def read_downloaded_tasks():
 
 def affinitize_task_to_master(cluster_id, task):
     cluster = config.spark_client.cluster.get(id=cluster_id)
-    master_node = config.batch_client.compute_node.get(id=cluster.pool.id, node_id=cluster.master_node_id)
+    master_node = config.batch_client.compute_node.get(
+        pool_id=cluster.pool.id, node_id=cluster.master_node_id)
     task.affinity_info = batch_models.AffinityInformation(affinity_id=master_node.affinity_id)
     return task
 
@@ -40,7 +41,7 @@ def schedule_tasks(tasks):
     """
     for task in tasks:
         task = affinitize_task_to_master(config.cluster_id, task)
-        config.batch_client.task.add(job_id=config.job_id, task=task)
+        config.batch_client.task.add(job_id=config.cluster_id, task=task)
 
 
diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py
index 781181cb..6a00843f 100644
--- a/aztk/spark/client/job/helpers/submit.py
+++ b/aztk/spark/client/job/helpers/submit.py
@@ -53,7 +53,7 @@ def
generate_job_manager_task(core_job_operations, job, application_tasks):
         command_line=helpers.wrap_commands_in_shell([task_cmd]),
         resource_files=resource_files,
         kill_job_on_completion=False,
-        allow_low_priority_node=True,
+        allow_low_priority_node=True,    # TODO: false unless job only has low priority
         user_identity=batch_models.UserIdentity(
             auto_user=batch_models.AutoUserSpecification(
                 scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)),
 
diff --git a/docs/14-azure-files.md b/docs/14-azure-files.md
index cd081f07..3ffd539b 100644
--- a/docs/14-azure-files.md
+++ b/docs/14-azure-files.md
@@ -5,7 +5,7 @@ The ability to load a file share on the cluster is really useful when you want t
 
 Mounting an Azure Files share in the cluster only required updating the cluster.yaml file at `.aztk/cluster.yaml`. For example, the following configuration will load two files shares into the cluster, one with my notebooks and one will a small data set that I have previously uploaded to Azure Files.
 
 ```yaml
-azure_files:
+file_shares:
   - storage_account_name: STORAGE_ACCOUNT_NAME
     storage_account_key: STORAGE_ACCOUNT_KEY
     # Name of the file share in Azure Files

From 123f5b4eaf5e2bc3f8b66f6c5a18e8714097db0d Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Wed, 21 Nov 2018 16:37:29 -0800
Subject: [PATCH 23/28] add file service, rename blob_client

---
 aztk/client/base/base_operations.py           | 15 +++--
 .../base/helpers/get_application_log.py       | 18 +++---
 aztk/client/client.py                         | 12 +++-
 aztk/internal/cluster_data/blob_data.py       |  8 +--
 aztk/internal/cluster_data/cluster_data.py    | 22 +++----
 aztk/node_scripts/core/config.py              |  7 ++-
 aztk/node_scripts/install/install.py          |  2 +-
 aztk/node_scripts/scheduling/common.py        | 60 ++++++-------------
 aztk/node_scripts/scheduling/submit.py        |  2 +-
 .../base/helpers/generate_application_task.py | 10 ++--
 aztk/spark/client/cluster/helpers/submit.py   |  6 +-
 aztk/spark/client/job/helpers/submit.py       |  2 +-
 aztk/utils/azure_api.py                       | 10 ++--
 aztk/utils/helpers.py                         | 41 ++++++-------
 14 files changed, 102 insertions(+), 113 deletions(-)

diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py
index 393dd999..fbffcc3d 100644
--- a/aztk/client/base/base_operations.py
+++ b/aztk/client/base/base_operations.py
@@ -13,15 +13,22 @@ class BaseOperations:
     Attributes:
         batch_client (:obj:`azure.batch.batch_service_client.BatchServiceClient`): Client used to interact with the
             Azure Batch service.
-        blob_client (:obj:`azure.storage.blob.CloudStorageAccount`): Client used to interact with the Azure Storage
-            Blob service.
+        cloud_storage_account (:obj:`azure.storage.common.CloudStorageAccount`): Azure Storage account used to create the storage service clients.
+        block_blob_service (:obj:`azure.storage.blob.BlockBlobService`): Client used to interact with the
+            Azure Storage Blob service.
+        file_service (:obj:`azure.storage.file.FileService`): Client used to interact with the Azure Storage
+            File service.
+        table_service (:obj:`azure.cosmosdb.table.TableService`): Client used to interact with the Azure Storage
+            Table service.
         secrets_configuration (:obj:`aztk.models.SecretsConfiguration`):
             Model that holds AZTK secrets used to authenticate with Azure and the clusters.
""" def __init__(self, context): self.batch_client = context["batch_client"] - self.blob_client = context["blob_client"] + self.cloud_storage_account = context["cloud_storage_account"] + self.block_blob_service = context["block_blob_service"] + self.file_service = context["file_service"] self.table_service = context["table_service"] self.secrets_configuration = context["secrets_configuration"] @@ -54,7 +61,7 @@ def get_cluster_data(self, id: str) -> cluster_data.ClusterData: Returns: :obj:`aztk.models.ClusterData`: Object used to manage the data and storage functions for a cluster """ - return cluster_data.ClusterData(self.blob_client, id) + return cluster_data.ClusterData(self.block_blob_service, id) def ssh_into_node(self, id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): """Open an ssh tunnel to a node diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py index a3b13682..8bf27fb5 100644 --- a/aztk/client/base/helpers/get_application_log.py +++ b/aztk/client/base/helpers/get_application_log.py @@ -25,8 +25,8 @@ def wait_for_task(base_operations, cluster_id, application_name): return task -def get_blob_from_storage(block_blob_client, container_name, application_name, stream, start_range, end_range=None): - print(block_blob_client, container_name, application_name, stream, start_range, end_range) +def get_blob_from_storage(block_blob_service, container_name, application_name, stream, start_range, end_range=None): + print(block_blob_service, container_name, application_name, stream, start_range, end_range) previous = 0 def download_callback(current, total): @@ -37,7 +37,7 @@ def download_callback(current, total): previous = current try: - blob = block_blob_client.get_blob_to_stream( + blob = block_blob_service.get_blob_to_stream( container_name, convert_application_name_to_blob_path(application_name), stream, @@ -55,10 +55,9 @@ def download_callback(current, total): raise -def get_log_from_storage(blob_client, container_name, application_name, task, current_bytes): +def get_log_from_storage(block_blob_service, container_name, application_name, task, current_bytes): stream = tempfile.TemporaryFile() - blob = get_blob_from_storage(blob_client.create_block_blob_service(), container_name, application_name, stream, - current_bytes) + blob = get_blob_from_storage(block_blob_service, container_name, application_name, stream, current_bytes) return models.ApplicationLog( name=application_name, cluster_id=container_name, @@ -80,9 +79,8 @@ def stream_log_from_storage(base_operations, container_name, application_name, t stream = tempfile.TemporaryFile() last_read_byte = 0 - block_blob_client = base_operations.blob_client.create_block_blob_service() blob = get_blob_from_storage( - block_blob_client, + base_operations.block_blob_service, container_name, application_name, stream, @@ -95,7 +93,7 @@ def stream_log_from_storage(base_operations, container_name, application_name, t task = base_operations.get_task(container_name, task.id) last_read_byte = blob.properties.content_length blob = get_blob_from_storage( - block_blob_client, + base_operations.block_blob_service, container_name, application_name, stream, @@ -117,7 +115,7 @@ def stream_log_from_storage(base_operations, container_name, application_name, t def get_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): task = wait_for_task(base_operations, cluster_id, application_name) - return 
get_log_from_storage(base_operations.blob_client, cluster_id, application_name, task, current_bytes) + return get_log_from_storage(base_operations.block_blob_service, cluster_id, application_name, task, current_bytes) def stream_log(base_operations, cluster_id: str, application_name: str): diff --git a/aztk/client/client.py b/aztk/client/client.py index 49b8740c..57832be6 100644 --- a/aztk/client/client.py +++ b/aztk/client/client.py @@ -13,7 +13,9 @@ class CoreClient: def __init__(self): self.secrets_configuration = None self.batch_client = None - self.blob_client = None + self.cloud_storage_account = None + self.file_service = None + self.block_blob_service = None self.table_service = None def _get_context(self, secrets_configuration: models.SecretsConfiguration): @@ -21,11 +23,15 @@ def _get_context(self, secrets_configuration: models.SecretsConfiguration): azure_api.validate_secrets(secrets_configuration) self.batch_client = azure_api.make_batch_client(secrets_configuration) - self.blob_client = azure_api.make_blob_client(secrets_configuration) + self.cloud_storage_account = azure_api.make_cloud_storage_account(secrets_configuration) + self.file_service = self.cloud_storage_account.create_file_service() + self.block_blob_service = self.cloud_storage_account.create_block_blob_service() self.table_service = azure_api.make_table_service(secrets_configuration) context = { "batch_client": self.batch_client, - "blob_client": self.blob_client, + "cloud_storage_account": self.cloud_storage_account, + "file_service": self.file_service, + "block_blob_service": self.block_blob_service, "table_service": self.table_service, "secrets_configuration": self.secrets_configuration, } diff --git a/aztk/internal/cluster_data/blob_data.py b/aztk/internal/cluster_data/blob_data.py index 699d8eed..42a1fa53 100644 --- a/aztk/internal/cluster_data/blob_data.py +++ b/aztk/internal/cluster_data/blob_data.py @@ -9,20 +9,20 @@ class BlobData: Object mapping to a blob entry. 
Can generate resource files for batch """ - def __init__(self, blob_client: BlockBlobService, container: str, blob: str): + def __init__(self, block_blob_service: BlockBlobService, container: str, blob: str): self.container = container self.blob = blob self.dest = blob - self.block_blob_client = blob_client + self.block_blob_service = block_blob_service def to_resource_file(self, dest: str = None) -> batch_models.ResourceFile: - sas_token = self.block_blob_client.generate_blob_shared_access_signature( + sas_token = self.block_blob_service.generate_blob_shared_access_signature( self.container, self.blob, permission=BlobPermissions.READ, expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365), ) - sas_url = self.block_blob_client.make_blob_url(self.container, self.blob, sas_token=sas_token) + sas_url = self.block_blob_service.make_blob_url(self.container, self.blob, sas_token=sas_token) return batch_models.ResourceFile(file_path=dest or self.dest, blob_source=sas_url) diff --git a/aztk/internal/cluster_data/cluster_data.py b/aztk/internal/cluster_data/cluster_data.py index aae5a8de..05f670af 100644 --- a/aztk/internal/cluster_data/cluster_data.py +++ b/aztk/internal/cluster_data/cluster_data.py @@ -3,7 +3,7 @@ import azure.common import yaml -from azure.storage.common import CloudStorageAccount +from azure.storage.blob import BlockBlobService from msrest.exceptions import ClientRequestError from aztk import error @@ -24,8 +24,8 @@ class ClusterData: APPLICATIONS_DIR = "applications" CLUSTER_CONFIG_FILE = "config.yaml" - def __init__(self, blob_client: CloudStorageAccount, cluster_id: str): - self.block_blob_client = blob_client.create_block_blob_service() + def __init__(self, block_blob_service: BlockBlobService, cluster_id: str): + self.block_blob_service = block_blob_service self.cluster_id = cluster_id self._ensure_container() @@ -34,13 +34,13 @@ def save_cluster_config(self, cluster_config): blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE content = yaml.dump(cluster_config) container_name = cluster_config.cluster_id - self.block_blob_client.create_blob_from_text(container_name, blob_path, content) + self.block_blob_service.create_blob_from_text(container_name, blob_path, content) @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def read_cluster_config(self): blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE try: - result = self.block_blob_client.get_blob_to_text(self.cluster_id, blob_path) + result = self.block_blob_service.get_blob_to_text(self.cluster_id, blob_path) return yaml.load(result.content) except azure.common.AzureMissingResourceHttpError: raise error.AztkError("Cluster {} doesn't have cluster configuration in storage".format(self.cluster_id)) @@ -49,13 +49,13 @@ def read_cluster_config(self): @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def upload_file(self, blob_path: str, local_path: str) -> BlobData: - self.block_blob_client.create_blob_from_path(self.cluster_id, blob_path, local_path) - return BlobData(self.block_blob_client, self.cluster_id, blob_path) + self.block_blob_service.create_blob_from_path(self.cluster_id, blob_path, local_path) + return BlobData(self.block_blob_service, self.cluster_id, blob_path) @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def upload_bytes(self, blob_path: str, bytes_io: io.BytesIO) -> BlobData: - 
self.block_blob_client.create_blob_from_bytes(self.cluster_id, blob_path, bytes_io.getvalue()) - return BlobData(self.block_blob_client, self.cluster_id, blob_path) + self.block_blob_service.create_blob_from_bytes(self.cluster_id, blob_path, bytes_io.getvalue()) + return BlobData(self.block_blob_service, self.cluster_id, blob_path) def upload_cluster_file(self, blob_path: str, local_path: str) -> BlobData: blob_data = self.upload_bytes(self.CLUSTER_DIR + "/" + blob_path, local_path) @@ -72,8 +72,8 @@ def upload_node_data(self, node_data: NodeData) -> BlobData: @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def _ensure_container(self): - self.block_blob_client.create_container(self.cluster_id, fail_on_exist=False) + self.block_blob_service.create_container(self.cluster_id, fail_on_exist=False) @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def delete_container(self, container_name: str): - self.block_blob_client.delete_container(container_name) + self.block_blob_service.delete_container(container_name) diff --git a/aztk/node_scripts/core/config.py b/aztk/node_scripts/core/config.py index 7766cb70..eb68a29a 100644 --- a/aztk/node_scripts/core/config.py +++ b/aztk/node_scripts/core/config.py @@ -63,10 +63,13 @@ def get_spark_client(): spark_client = get_spark_client() -# note: the batch_client and blob_client in _core_cluster_operations +# note: the batch_client and cloud_storage_account in _core_cluster_operations # is the same as in _core_job_operations batch_client = spark_client.cluster._core_cluster_operations.batch_client -blob_client = spark_client.cluster._core_cluster_operations.blob_client +cloud_storage_account = spark_client.cluster._core_cluster_operations.cloud_storage_account +block_blob_service = spark_client.cluster._core_cluster_operations.block_blob_service +file_service = spark_client.cluster._core_cluster_operations.file_service +table_service = spark_client.cluster._core_cluster_operations.table_service log.info("Pool id is %s", pool_id) log.info("Node id is %s", node_id) diff --git a/aztk/node_scripts/install/install.py b/aztk/node_scripts/install/install.py index ca262934..24d9dc65 100644 --- a/aztk/node_scripts/install/install.py +++ b/aztk/node_scripts/install/install.py @@ -8,7 +8,7 @@ def read_cluster_config(): - data = cluster_data.ClusterData(config.blob_client, config.cluster_id) + data = cluster_data.ClusterData(config.block_blob_service, config.cluster_id) cluster_config = data.read_cluster_config() log.info("Got cluster config: %s", cluster_config) return cluster_config diff --git a/aztk/node_scripts/scheduling/common.py b/aztk/node_scripts/scheduling/common.py index c5154736..514494f4 100644 --- a/aztk/node_scripts/scheduling/common.py +++ b/aztk/node_scripts/scheduling/common.py @@ -23,7 +23,7 @@ def load_application(application_file_path): return application -def upload_log(blob_client, application): +def upload_log(block_blob_service, application): """ upload output.log to storage account """ @@ -32,7 +32,7 @@ def upload_log(blob_client, application): container_name=os.environ["STORAGE_LOGS_CONTAINER"], application_name=application.name, file_path=log_file, - blob_client=blob_client, + block_blob_service=block_blob_service, use_full_path=False, ) @@ -40,13 +40,13 @@ def upload_log(blob_client, application): def upload_file_to_container(container_name, application_name, file_path, - blob_client=None, + 
block_blob_service=None, use_full_path=False, node_path=None) -> batch_models.ResourceFile: """ Uploads a local file to an Azure Blob storage container. - :param blob_client: A blob service client. - :type blob_client: `azure.storage.common.CloudStorageAccount` + :param block_blob_service: A blob service client. + :type block_blob_service: `azure.storage.blob.BlockBlobService` :param str container_name: The name of the Azure Blob storage container. :param str file_path: The local path to the file. :param str node_path: Path on the local node. By default will be the same as file_path @@ -54,7 +54,6 @@ def upload_file_to_container(container_name, :return: A ResourceFile initialized with a SAS URL appropriate for Batch tasks. """ - block_blob_client = blob_client.create_block_blob_service() file_path = file_path blob_name = None if use_full_path: @@ -66,25 +65,24 @@ def upload_file_to_container(container_name, if not node_path: node_path = blob_name - block_blob_client.create_container(container_name, fail_on_exist=False) + block_blob_service.create_container(container_name, fail_on_exist=False) - block_blob_client.create_blob_from_path(container_name, blob_path, file_path) + block_blob_service.create_blob_from_path(container_name, blob_path, file_path) - sas_token = block_blob_client.generate_blob_shared_access_signature( + sas_token = block_blob_service.generate_blob_shared_access_signature( container_name, blob_path, permission=blob.BlobPermissions.READ, expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7), ) - sas_url = block_blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token) + sas_url = block_blob_service.make_blob_url(container_name, blob_path, sas_token=sas_token) return batch_models.ResourceFile(file_path=node_path, blob_source=sas_url) def upload_error_log(error, application_file_path): application = load_application(application_file_path) - blob_client = config.blob_client error_log_path = os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "error.log") with open(error_log_path, "w", encoding="UTF-8") as error_log: @@ -94,10 +92,10 @@ def upload_error_log(error, application_file_path): container_name=os.environ["STORAGE_LOGS_CONTAINER"], application_name=application.name, file_path=os.path.realpath(error_log.name), - blob_client=blob_client, + block_blob_service=config.block_blob_service, use_full_path=False, ) - upload_log(blob_client, application) + upload_log(config.block_blob_service, application) def download_task_definition(task_sas_url): @@ -106,36 +104,12 @@ def download_task_definition(task_sas_url): return yaml.load(yaml_serialized_task) -def stream_upload_to_storage( - blob_client: CloudStorageAccount, - stream, - application_name, -): - """ - Args: - blob_client (`azure.storage.common.CloudStorageAccount`) - stream (`obj:IOBase`): opened stream to upload as the blob content - application_name (`str`): the name of the application to uploads logs for - """ - from azure.storage.blob.appendblobservice import AppendBlobService - AppendBlobService.MAX_BLOCK_SIZE = 1024 * 1024 - - append_blob_client = blob_client.create_append_blob_service() - - append_blob_client.create_blob( - container_name=os.environ["STORAGE_LOGS_CONTAINER"], - blob_name=application_name + "/output.log", - if_none_match="*", - ) - append_blob_client.append_blob_from_stream( - container_name=os.environ["STORAGE_LOGS_CONTAINER"], - blob_name=application_name + "/output.log", - stream=stream, - ) - - def run_command(spark_client, command, application_name): process = 
subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - stream_upload_to_storage(spark_client.blob_client, process.stdout, application_name) - rc = process.poll() + rc = process.wait() return rc + + +def create_file_share(storage_client, cluster_id, quota=5120, fail_on_exist=True): + + st diff --git a/aztk/node_scripts/scheduling/submit.py b/aztk/node_scripts/scheduling/submit.py index 740a9c45..03eb3203 100644 --- a/aztk/node_scripts/scheduling/submit.py +++ b/aztk/node_scripts/scheduling/submit.py @@ -55,13 +55,13 @@ def receive_submit_request(application_file_path): """ Handle the request to submit a task """ - blob_client = config.blob_client application = common.load_application(application_file_path) cmd = __app_submit_cmd(application) exit_code = -1 try: exit_code = common.run_command(config.spark_client, cmd.to_str(), application.name) + common.upload_log(config.block_blob_service, application) except Exception as e: common.upload_error_log(str(e), os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml")) return exit_code diff --git a/aztk/spark/client/base/helpers/generate_application_task.py b/aztk/spark/client/base/helpers/generate_application_task.py index de255479..af630cbe 100644 --- a/aztk/spark/client/base/helpers/generate_application_task.py +++ b/aztk/spark/client/base/helpers/generate_application_task.py @@ -16,7 +16,7 @@ def generate_application_task(core_base_operations, container_id, application, r container_name=container_id, application_name=application.name, file_path=application.application, - blob_client=core_base_operations.blob_client, + block_blob_service=core_base_operations.block_blob_service, use_full_path=False, ) @@ -32,7 +32,7 @@ def generate_application_task(core_base_operations, container_id, application, r container_name=container_id, application_name=application.name, file_path=jar, - blob_client=core_base_operations.blob_client, + block_blob_service=core_base_operations.block_blob_service, use_full_path=False, ) jar_resource_file_paths.append(current_jar_resource_file_path) @@ -45,7 +45,7 @@ def generate_application_task(core_base_operations, container_id, application, r container_name=container_id, application_name=application.name, file_path=py_file, - blob_client=core_base_operations.blob_client, + block_blob_service=core_base_operations.block_blob_service, use_full_path=False, ) py_files_resource_file_paths.append(current_py_files_resource_file_path) @@ -58,7 +58,7 @@ def generate_application_task(core_base_operations, container_id, application, r container_name=container_id, application_name=application.name, file_path=file, - blob_client=core_base_operations.blob_client, + block_blob_service=core_base_operations.block_blob_service, use_full_path=False, ) files_resource_file_paths.append(files_resource_file_path) @@ -73,7 +73,7 @@ def generate_application_task(core_base_operations, container_id, application, r application_name=application.name, file_path="application.yaml", content=yaml.dump(application), - blob_client=core_base_operations.blob_client, + block_blob_service=core_base_operations.block_blob_service, ) resource_files.append(application_definition_file) diff --git a/aztk/spark/client/cluster/helpers/submit.py b/aztk/spark/client/cluster/helpers/submit.py index 10e6c91b..7e53dcb3 100644 --- a/aztk/spark/client/cluster/helpers/submit.py +++ b/aztk/spark/client/cluster/helpers/submit.py @@ -20,13 +20,13 @@ def affinitize_task_to_master(core_cluster_operations, 
spark_cluster_operations,
     return task
 
 
-def upload_serialized_task_to_storage(blob_client, cluster_id, task):
+def upload_serialized_task_to_storage(block_blob_service, cluster_id, task):
     return helpers.upload_text_to_container(
         container_name=cluster_id,
         application_name=task.id,
         file_path="task.yaml",
         content=yaml.dump(task),
-        blob_client=blob_client,
+        block_blob_service=block_blob_service,
     )
 
@@ -48,7 +48,7 @@ def schedule_with_target(
     internal,
 ):
     # upload "real" task definition to storage
-    serialized_task_resource_file = upload_serialized_task_to_storage(core_cluster_operations.blob_client, cluster_id,
+    serialized_task_resource_file = upload_serialized_task_to_storage(core_cluster_operations.block_blob_service, cluster_id,
                                                                       task)
     # # schedule "ghost" task
     ghost_task = batch_models.TaskAddParameter(
diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py
index 6a00843f..298287a6 100644
--- a/aztk/spark/client/job/helpers/submit.py
+++ b/aztk/spark/client/job/helpers/submit.py
@@ -38,7 +38,7 @@ def generate_job_manager_task(core_job_operations, job, application_tasks):
             application_name=application.name + ".yaml",
             file_path=application.name + ".yaml",
             content=yaml.dump(task),
-            blob_client=core_job_operations.blob_client,
+            block_blob_service=core_job_operations.block_blob_service,
         )
         resource_files.append(task_definition_resource_file)
 
diff --git a/aztk/utils/azure_api.py b/aztk/utils/azure_api.py
index 3b4f1f4a..ccfb17af 100644
--- a/aztk/utils/azure_api.py
+++ b/aztk/utils/azure_api.py
@@ -71,7 +71,7 @@ def make_batch_client(secrets):
     return batch_client
 
 
-def make_blob_client(secrets):
+def make_cloud_storage_account(secrets):
     """
     Creates a blob client object
     :param str storage_account_key: storage account key
@@ -81,10 +81,9 @@
     if secrets.shared_key:
         # Set up SharedKeyCredentials
-        blob_client = blob.BlockBlobService(
+        cloud_storage_account = CloudStorageAccount(
             account_name=secrets.shared_key.storage_account_name,
             account_key=secrets.shared_key.storage_account_key,
-            endpoint_suffix=secrets.shared_key.storage_account_suffix,
         )
     else:
         # Set up ServicePrincipalCredentials
@@ -107,9 +106,9 @@
                     resource_group_name=resourcegroup,
                     account_name=accountname,
                 ).keys[0].value)
-        storage_client = CloudStorageAccount(accountname, key)
+        cloud_storage_account = CloudStorageAccount(accountname, key)
 
-    return storage_client
+    return cloud_storage_account
 
 
 def make_table_service(secrets):
@@ -144,6 +143,7 @@
     return table_service
 
 
+# TODO: replace with retry decorator
 def retry_function(function, retry_attempts: int, retry_interval: int, exception: Exception, *args, **kwargs):
     import time
 
diff --git a/aztk/utils/helpers.py b/aztk/utils/helpers.py
index 961dd190..a6c1681d 100644
--- a/aztk/utils/helpers.py
+++ b/aztk/utils/helpers.py
@@ -56,23 +56,25 @@ def wait_for_task_to_complete(job_id: str, task_id: str, batch_client):
     return
 
 
-def upload_text_to_container(container_name: str, application_name: str, content: str, file_path: str,
-                             blob_client=None) -> batch_models.ResourceFile:
-    block_blob_client = blob_client.create_block_blob_service()
+def upload_text_to_container(container_name: str,
+                             application_name: str,
+                             content: str,
+                             file_path: str,
+                             block_blob_service=None) -> batch_models.ResourceFile:
     blob_name = file_path
     blob_path = application_name + "/" + blob_name    # + '/' + time_stamp + '/' + blob_name
-    block_blob_client.create_container(container_name,
fail_on_exist=False)
-    block_blob_client.create_blob_from_text(container_name, blob_path, content)
+    block_blob_service.create_container(container_name, fail_on_exist=False)
+    block_blob_service.create_blob_from_text(container_name, blob_path, content)
 
-    sas_token = block_blob_client.generate_blob_shared_access_signature(
+    sas_token = block_blob_service.generate_blob_shared_access_signature(
         container_name,
         blob_path,
         permission=blob.BlobPermissions.READ,
         expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365),
     )
 
-    sas_url = block_blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token)
+    sas_url = block_blob_service.make_blob_url(container_name, blob_path, sas_token=sas_token)
 
     return batch_models.ResourceFile(file_path=blob_name, blob_source=sas_url)
 
@@ -80,13 +82,13 @@ def upload_text_to_container(container_name: str, application_name: str, content
 def upload_file_to_container(container_name,
                              application_name,
                              file_path,
-                             blob_client=None,
+                             block_blob_service=None,
                              use_full_path=False,
                              node_path=None) -> batch_models.ResourceFile:
     """
     Uploads a local file to an Azure Blob storage container.
-    :param blob_client: A blob service client.
-    :type blob_client: `azure.storage.common.CloudStorageAccount`
+    :param block_blob_service: A blob service client.
+    :type block_blob_service: `azure.storage.blob.BlockBlobService`
     :param str container_name: The name of the Azure Blob storage container.
     :param str file_path: The local path to the file.
     :param str node_path: Path on the local node. By default will be the same as file_path
@@ -94,7 +96,6 @@ def upload_file_to_container(container_name,
     :return: A ResourceFile initialized with a SAS URL appropriate for Batch tasks.
     """
-    block_blob_client = blob_client.create_block_blob_service()
     file_path = normalize_path(file_path)
 
     blob_name = None
@@ -107,18 +108,18 @@ def upload_file_to_container(container_name,
     if not node_path:
         node_path = blob_name
 
-    block_blob_client.create_container(container_name, fail_on_exist=False)
+    block_blob_service.create_container(container_name, fail_on_exist=False)
 
-    block_blob_client.create_blob_from_path(container_name, blob_path, file_path)
+    block_blob_service.create_blob_from_path(container_name, blob_path, file_path)
 
-    sas_token = block_blob_client.generate_blob_shared_access_signature(
+    sas_token = block_blob_service.generate_blob_shared_access_signature(
         container_name,
         blob_path,
         permission=blob.BlobPermissions.READ,
         expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7),
     )
 
-    sas_url = block_blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token)
+    sas_url = block_blob_service.make_blob_url(container_name, blob_path, sas_token=sas_token)
 
     return batch_models.ResourceFile(file_path=node_path, blob_source=sas_url)
 
@@ -245,18 +246,18 @@ def format_batch_exception(batch_exception):
     return "\n".join(l)
 
 
-def save_cluster_config(cluster_config, blob_client):
+def save_cluster_config(cluster_config, block_blob_service):
     blob_path = "config.yaml"
     content = yaml.dump(cluster_config)
     container_name = cluster_config.cluster_id
-    blob_client.create_container(container_name, fail_on_exist=False)
-    blob_client.create_blob_from_text(container_name, blob_path, content)
+    block_blob_service.create_container(container_name, fail_on_exist=False)
+    block_blob_service.create_blob_from_text(container_name, blob_path, content)
 
 
-def read_cluster_config(cluster_id: str, blob_client: blob.BlockBlobService):
+def read_cluster_config(cluster_id: str, block_blob_service:
blob.BlockBlobService): blob_path = "config.yaml" try: - result = blob_client.get_blob_to_text(cluster_id, blob_path) + result = block_blob_service.get_blob_to_text(cluster_id, blob_path) return yaml.load(result.content) except azure.common.AzureMissingResourceHttpError: logging.warning("Cluster %s doesn't have cluster configuration in storage", cluster_id) From 024ed8c326aa89bbbd06f5955e0d2076dc0586b2 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 21 Nov 2018 17:09:21 -0800 Subject: [PATCH 24/28] start file_share implementation --- aztk/node_scripts/scheduling/common.py | 17 ++++++++++++++--- aztk/node_scripts/scheduling/submit.py | 5 ++--- setup.py | 3 ++- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/aztk/node_scripts/scheduling/common.py b/aztk/node_scripts/scheduling/common.py index 514494f4..b14773eb 100644 --- a/aztk/node_scripts/scheduling/common.py +++ b/aztk/node_scripts/scheduling/common.py @@ -104,12 +104,23 @@ def download_task_definition(task_sas_url): return yaml.load(yaml_serialized_task) -def run_command(spark_client, command, application_name): +def run_command(command, cluster_id, application_name): process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stream_upload_to_file_share(cluster_id, application_name, process.stdout) rc = process.wait() return rc -def create_file_share(storage_client, cluster_id, quota=5120, fail_on_exist=True): +def create_file_share(cluster_id, application_id, quota=5120, fail_on_exist=True): + config.file_service.create_share(share_name=cluster_id + application_id, quota=quota, fail_on_exist=fail_on_exist) - st + +def stream_upload_to_file_share(cluster_id, application_name, stream): + create_file_share(cluster_id, application_name) + config.file_service.create_file_from_stream( + share_name=cluster_id + application_name, + directory_name="", + file_name="output.log", + stream=stream, + max_connections=4, + ) diff --git a/aztk/node_scripts/scheduling/submit.py b/aztk/node_scripts/scheduling/submit.py index 03eb3203..643da8c5 100644 --- a/aztk/node_scripts/scheduling/submit.py +++ b/aztk/node_scripts/scheduling/submit.py @@ -60,8 +60,7 @@ def receive_submit_request(application_file_path): cmd = __app_submit_cmd(application) exit_code = -1 try: - exit_code = common.run_command(config.spark_client, cmd.to_str(), application.name) - common.upload_log(config.block_blob_service, application) + exit_code = common.run_command(cmd.to_str(), config.cluster_id, application.name) except Exception as e: common.upload_error_log(str(e), os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml")) return exit_code @@ -81,7 +80,7 @@ def ssh_submit(task_sas_url): # update task table before running task = scheduling_target.insert_task_into_task_table(aztk_cluster_id, task_definition) # run task and upload log - exit_code = common.run_command(config.spark_client, cmd.to_str(), application.name) + exit_code = common.run_command(cmd.to_str(), config.cluster_id, application.name) log.info("completed application, updating storage table") scheduling_target.mark_task_complete(aztk_cluster_id, task.id, exit_code) except Exception as e: diff --git a/setup.py b/setup.py index a4671f4b..6ca9f79c 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,8 @@ def find_package_files(root, directory, dest=""): "azure-batch~=5.1.0", "azure-mgmt-batch~=5.0.1", "azure-mgmt-storage~=3.0.0", - "azure-storage-blob~=1.3.1", + "azure-storage-blob~=1.4.0", + "azure-storage-file~=1.4.0" 
"azure-cosmosdb-table~=1.0.5", "pycryptodomex~=3.6.6", "PyYAML~=3.13", From 9faa691a0a1f99ceada1e35e5c6416f4623e099f Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 28 Nov 2018 16:25:33 -0800 Subject: [PATCH 25/28] factor log streaming code out --- aztk/client/base/base_operations.py | 3 - .../base/helpers/create_batch_resources.py | 3 +- .../base/helpers/get_application_log.py | 164 +++++++++--------- aztk/client/base/helpers/task_table.py | 2 +- aztk/client/client.py | 4 - aztk/node_scripts/core/config.py | 1 - aztk/node_scripts/scheduling/common.py | 24 +-- .../node_scripts/scheduling/job_submission.py | 3 +- aztk/node_scripts/scheduling/submit.py | 4 +- aztk/spark/client/cluster/helpers/submit.py | 4 +- 10 files changed, 94 insertions(+), 118 deletions(-) diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index fbffcc3d..1d6001fe 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -16,8 +16,6 @@ class BaseOperations: cloud_storage_account (:obj:`azure.storage.blob.CloudStorageAccount`): Azure Storage account used block_blob_service (:obj:`azure.storage.blob.CloudStorageAccount`): Client used to interact with the Azure Storage Blob service. - file_service (:obj:`azure.storage.blob.CloudStorageAccount`): Client used to interact with the Azure Storage - File service. table_service (:obj:`azure.storage.blob.CloudStorageAccount`): Client used to interact with the Azure Storage Table service. secrets_configuration (:obj:`aztk.models.SecretsConfiguration`): @@ -28,7 +26,6 @@ def __init__(self, context): self.batch_client = context["batch_client"] self.cloud_storage_account = context["cloud_storage_account"] self.block_blob_service = context["block_blob_service"] - self.file_service = context["file_service"] self.table_service = context["table_service"] self.secrets_configuration = context["secrets_configuration"] diff --git a/aztk/client/base/helpers/create_batch_resources.py b/aztk/client/base/helpers/create_batch_resources.py index 13382111..44fd1be5 100644 --- a/aztk/client/base/helpers/create_batch_resources.py +++ b/aztk/client/base/helpers/create_batch_resources.py @@ -50,7 +50,8 @@ def create_batch_resources( metadata=[ batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), batch_models.MetadataItem( - name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA) + name=constants.AZTK_MODE_METADATA_KEY, + value=constants.AZTK_JOB_MODE_METADATA) # dyanmically change to cluster/job metadata ]), ) diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py index 8bf27fb5..0a7c270c 100644 --- a/aztk/client/base/helpers/get_application_log.py +++ b/aztk/client/base/helpers/get_application_log.py @@ -1,23 +1,28 @@ -import tempfile import time import azure import azure.batch.models as batch_models +from azure.batch.models import BatchErrorException from aztk import error, models from aztk.models import Task, TaskState -from aztk.utils import batch_error_manager, constants +from aztk.utils import batch_error_manager, constants, helpers +output_file = constants.TASK_WORKING_DIR + "/" + constants.SPARK_SUBMIT_LOGS_FILE -def convert_application_name_to_blob_path(application_name): - return application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE + +def __check_task_node_exist(batch_client, cluster_id: str, task: Task) -> bool: + try: + batch_client.compute_node.get(cluster_id, task.node_id) + return True 
+    except BatchErrorException:
+        return False
 
 
 def wait_for_task(base_operations, cluster_id, application_name):
     # TODO: ensure get_task_state not None or throw
     task = base_operations.get_task(cluster_id, application_name)
-    while task.state not in [TaskState.Completed, TaskState.Failed, TaskState.Running]:
-        print(task.state)
+    while task.state not in [TaskState.Completed, TaskState.Failed]:
         time.sleep(3)
         # TODO: enable logger
         # log.debug("{} {}: application not yet complete".format(cluster_id, application_name))
@@ -25,105 +30,100 @@
     return task
 
 
-def get_blob_from_storage(block_blob_service, container_name, application_name, stream, start_range, end_range=None):
-    print(block_blob_service, container_name, application_name, stream, start_range, end_range)
-    previous = 0
-
-    def download_callback(current, total):
-        nonlocal previous
-        stream.seek(previous)
-        print("({}/{})".format(previous, current))
-        # print(stream.read().decode('utf-8'))    # SDK SHOULDN'T PRINT
-        previous = current
-
-    try:
-        blob = block_blob_service.get_blob_to_stream(
-            container_name,
-            convert_application_name_to_blob_path(application_name),
-            stream,
-            progress_callback=download_callback,
-            start_range=start_range,
-            end_range=end_range)
-        stream.seek(0)
-        return blob
-    except azure.common.AzureMissingResourceHttpError:
-        raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.")
-    except azure.common.AzureHttpError as e:
-        if e.error_code in ["InvalidRange"]:
-            # the blob has no data, should not throw here
-            raise error.AztkError("The application {} log has no data yet.".format(application_name))
-        raise
-
-
-def get_log_from_storage(block_blob_service, container_name, application_name, task, current_bytes):
-    stream = tempfile.TemporaryFile()
-    blob = get_blob_from_storage(block_blob_service, container_name, application_name, stream, current_bytes)
-    return models.ApplicationLog(
-        name=application_name,
-        cluster_id=container_name,
-        application_state=task.state,
-        log=stream,
-        total_bytes=blob.properties.content_length,
-        exit_code=task.exit_code,
-    )
+def __get_output_file_properties(batch_client, cluster_id: str, application_name: str):
+    while True:
+        try:
+            file = helpers.get_file_properties(cluster_id, application_name, output_file, batch_client)
+            return file
+        except BatchErrorException as e:
+            if e.response.status_code == 404:
+                # TODO: log
+                time.sleep(5)
+                continue
+            else:
+                raise e
 
 
-def stream_log_from_storage(base_operations, container_name, application_name, task):
+def get_log_from_storage(block_blob_service, container_name, application_name, task):
     """
     Args:
-        base_operations (:obj:`aztk.client.base.BaseOperations`): Base aztk client
+        block_blob_service (:obj:`azure.storage.blob.BlockBlobService`): Client used to interact with the Azure Storage
+            Blob service.
        container_name (:obj:`str`): the name of the Azure Blob storage container to get data from
         application_name (:obj:`str`): the name of the application to get logs for
         task (:obj:`aztk.models.Task`): the aztk task for this application
     """
-    stream = tempfile.TemporaryFile()
-    last_read_byte = 0
-
-    blob = get_blob_from_storage(
-        base_operations.block_blob_service,
-        container_name,
-        application_name,
-        stream,
-        start_range=last_read_byte,
-        end_range=last_read_byte + constants.STREAMING_DOWNLOAD_CHUNK_SIZE,
-    )
-
-    while task.state not in [TaskState.Completed, TaskState.Failed]:
-        print(container_name, task.id)
-        task = base_operations.get_task(container_name, task.id)
-        last_read_byte = blob.properties.content_length
-        blob = get_blob_from_storage(
-            base_operations.block_blob_service,
-            container_name,
-            application_name,
-            stream,
-            start_range=last_read_byte,
-        )
-
-    stream.seek(0)
+    try:
+        blob = block_blob_service.get_blob_to_text(container_name,
+                                                   application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE)
+    except azure.common.AzureMissingResourceHttpError:
+        raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.")
 
     return models.ApplicationLog(
         name=application_name,
         cluster_id=container_name,
         application_state=task.state,
-        log=stream,
+        log=blob.content,
         total_bytes=blob.properties.content_length,
         exit_code=task.exit_code,
     )
 
 
-def get_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
-    task = wait_for_task(base_operations, cluster_id, application_name)
+def wait_for_scheduling_target_task(base_operations, cluster_id, application_name):
+    application_state = base_operations.get_task_state(cluster_id, application_name)
+    while TaskState(application_state) not in [TaskState.Completed, TaskState.Failed]:
+        time.sleep(3)
+        print("Application {}: State {}".format(application_name, TaskState(application_state)))
+        # TODO: enable logger
+        # log.debug("{} {}: application not yet complete".format(cluster_id, application_name))
+        application_state = base_operations.get_task_state(cluster_id, application_name)
+    return base_operations.get_task(cluster_id, application_name)
 
-    return get_log_from_storage(base_operations.block_blob_service, cluster_id, application_name, task, current_bytes)
 
+def get_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
+    job_id = cluster_id
+    task_id = application_name
+    cluster_configuration = base_operations.get_cluster_configuration(cluster_id)
 
-def stream_log(base_operations, cluster_id: str, application_name: str):
     task = wait_for_task(base_operations, cluster_id, application_name)
-    return stream_log_from_storage(base_operations, cluster_id, application_name, task)
+
+    if cluster_configuration.scheduling_target is not models.SchedulingTarget.Any:
+        return get_log_from_storage(base_operations.block_blob_service, cluster_id, application_name, task)
+    else:
+        if not __check_task_node_exist(base_operations.batch_client, cluster_id, task):
+            return get_log_from_storage(base_operations.block_blob_service, cluster_id, application_name, task)
+
+        file = __get_output_file_properties(base_operations.batch_client, cluster_id, application_name)
+        target_bytes = file.content_length
+
+        if target_bytes != current_bytes:
+            ocp_range = None
+
+            if tail:
+                ocp_range = "bytes={0}-{1}".format(current_bytes, target_bytes - 1)
+
+            stream = base_operations.batch_client.file.get_from_task(
+                job_id, task_id, output_file, batch_models.FileGetFromTaskOptions(ocp_range=ocp_range))
+            content = helpers.read_stream_as_string(stream)
+
+            return models.ApplicationLog(
+                name=application_name,
+                cluster_id=cluster_id,
+                application_state=task.state,
+                log=content,
+                total_bytes=target_bytes,
+                exit_code=task.exit_code,
+            )
+        else:
+            return models.ApplicationLog(
+                name=application_name,
+                cluster_id=cluster_id,
+                application_state=task.state,
+                log="",
+                total_bytes=target_bytes,
+                exit_code=task.exit_code,
+            )
 
 
 def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
     with batch_error_manager():
-        # return get_log(base_operations, cluster_id, application_name, tail, current_bytes)
-        return stream_log(base_operations, cluster_id, application_name)
+        return get_log(base_operations, cluster_id, application_name, tail, current_bytes)
diff --git a/aztk/client/base/helpers/task_table.py b/aztk/client/base/helpers/task_table.py
index 0425ba1c..5dd9da25 100644
--- a/aztk/client/base/helpers/task_table.py
+++ b/aztk/client/base/helpers/task_table.py
@@ -90,7 +90,7 @@ def list_task_table_entries(table_service, id):
 def get_task_from_table(table_service, id, task_id):
     entity = table_service.get_entity(helpers.convert_id_to_table_id(id), id, task_id)
     # TODO: enable logger
-    print("Running get_task_from_table: {}".format(entity))
+    # print("Running get_task_from_table: {}".format(entity))
     return __convert_entity_to_task(entity)
 
 
diff --git a/aztk/client/client.py b/aztk/client/client.py
index 57832be6..b7a2e5bf 100644
--- a/aztk/client/client.py
+++ b/aztk/client/client.py
@@ -14,8 +14,6 @@ def __init__(self):
         self.secrets_configuration = None
         self.batch_client = None
         self.cloud_storage_account = None
-        self.file_service = None
-
         self.block_blob_service = None
         self.table_service = None
 
@@ -24,13 +22,11 @@ def _get_context(self, secrets_configuration: models.SecretsConfiguration):
         azure_api.validate_secrets(secrets_configuration)
         self.batch_client = azure_api.make_batch_client(secrets_configuration)
         self.cloud_storage_account = azure_api.make_cloud_storage_account(secrets_configuration)
-        self.file_service = self.cloud_storage_account.create_file_service()
         self.block_blob_service = self.cloud_storage_account.create_block_blob_service()
         self.table_service = azure_api.make_table_service(secrets_configuration)
         context = {
             "batch_client": self.batch_client,
             "cloud_storage_account": self.cloud_storage_account,
-            "file_service": self.file_service,
             "block_blob_service": self.block_blob_service,
             "table_service": self.table_service,
             "secrets_configuration": self.secrets_configuration,
diff --git a/aztk/node_scripts/core/config.py b/aztk/node_scripts/core/config.py
index eb68a29a..342d0db7 100644
--- a/aztk/node_scripts/core/config.py
+++ b/aztk/node_scripts/core/config.py
@@ -68,7 +68,6 @@ def get_spark_client():
 batch_client = spark_client.cluster._core_cluster_operations.batch_client
 cloud_storage_account = spark_client.cluster._core_cluster_operations.cloud_storage_account
 block_blob_service = spark_client.cluster._core_cluster_operations.block_blob_service
-file_service = spark_client.cluster._core_cluster_operations.file_service
 table_service = spark_client.cluster._core_cluster_operations.table_service
 
 log.info("Pool id is %s", pool_id)
diff --git a/aztk/node_scripts/scheduling/common.py b/aztk/node_scripts/scheduling/common.py
index b14773eb..1995e5a1 100644
--- a/aztk/node_scripts/scheduling/common.py
+++ b/aztk/node_scripts/scheduling/common.py
@@ -104,23 +104,7 @@ def download_task_definition(task_sas_url):
     return yaml.load(yaml_serialized_task)
 
 
-def run_command(command, cluster_id, application_name):
-    process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    stream_upload_to_file_share(cluster_id, application_name, process.stdout)
-    rc = process.wait()
-    return rc
-
-
-def create_file_share(cluster_id, application_id, quota=5120, fail_on_exist=True):
-    config.file_service.create_share(share_name=cluster_id + application_id, quota=quota, fail_on_exist=fail_on_exist)
-
-
-def stream_upload_to_file_share(cluster_id, application_name, stream):
-    create_file_share(cluster_id, application_name)
-    config.file_service.create_file_from_stream(
-        share_name=cluster_id + application_name,
-        directory_name="",
-        file_name="output.log",
-        stream=stream,
-        max_connections=4,
-    )
+def run_command(command, cluster_id, application):
+    return_code = subprocess.call(command, shell=True)
+    upload_log(config.block_blob_service, application)
+    return return_code
diff --git a/aztk/node_scripts/scheduling/job_submission.py b/aztk/node_scripts/scheduling/job_submission.py
index d528a7ac..1180fba2 100644
--- a/aztk/node_scripts/scheduling/job_submission.py
+++ b/aztk/node_scripts/scheduling/job_submission.py
@@ -29,8 +29,7 @@ def read_downloaded_tasks():
 
 def affinitize_task_to_master(cluster_id, task):
     cluster = config.spark_client.cluster.get(id=cluster_id)
-    master_node = master_node = config.batch_client.compute_node.get(
-        pool_id=cluster.pool.id, node_id=cluster.master_node_id)
+    master_node = config.batch_client.compute_node.get(pool_id=cluster.pool.id, node_id=cluster.master_node_id)
     task.affinity_info = batch_models.AffinityInformation(affinity_id=master_node.affinity_id)
     return task
 
diff --git a/aztk/node_scripts/scheduling/submit.py b/aztk/node_scripts/scheduling/submit.py
index 643da8c5..b2e0b2d8 100644
--- a/aztk/node_scripts/scheduling/submit.py
+++ b/aztk/node_scripts/scheduling/submit.py
@@ -60,7 +60,7 @@ def receive_submit_request(application_file_path):
     cmd = __app_submit_cmd(application)
     exit_code = -1
     try:
-        exit_code = common.run_command(cmd.to_str(), config.cluster_id, application.name)
+        exit_code = common.run_command(cmd.to_str(), config.cluster_id, application)
     except Exception as e:
         common.upload_error_log(str(e), os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml"))
     return exit_code
@@ -80,7 +80,7 @@ def ssh_submit(task_sas_url):
         # update task table before running
         task = scheduling_target.insert_task_into_task_table(aztk_cluster_id, task_definition)
         # run task and upload log
-        exit_code = common.run_command(cmd.to_str(), config.cluster_id, application.name)
+        exit_code = common.run_command(cmd.to_str(), config.cluster_id, application)
         log.info("completed application, updating storage table")
         scheduling_target.mark_task_complete(aztk_cluster_id, task.id, exit_code)
     except Exception as e:
diff --git a/aztk/spark/client/cluster/helpers/submit.py b/aztk/spark/client/cluster/helpers/submit.py
index 7e53dcb3..78e33241 100644
--- a/aztk/spark/client/cluster/helpers/submit.py
+++ b/aztk/spark/client/cluster/helpers/submit.py
@@ -20,13 +20,13 @@ def affinitize_task_to_master(core_cluster_operations, spark_cluster_operations,
     return task
 
 
-def upload_serialized_task_to_storage(cloud_storage_account, cluster_id, task):
+def upload_serialized_task_to_storage(block_blob_service, cluster_id, task):
     return helpers.upload_text_to_container(
         container_name=cluster_id,
         application_name=task.id,
         file_path="task.yaml",
         content=yaml.dump(task),
-        cloud_storage_account=cloud_storage_account,
+        block_blob_service=block_blob_service,
     )
 

From b2880cd106a9faf55b00def8d8d7dda359193619 Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Wed, 28 Nov 2018 16:34:41 -0800
Subject: [PATCH 26/28] remove azure file dependency

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6ca9f79c..cff07793 100644
--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,6 @@ def find_package_files(root, directory, dest=""):
         "azure-mgmt-batch~=5.0.1",
         "azure-mgmt-storage~=3.0.0",
         "azure-storage-blob~=1.4.0",
-        "azure-storage-file~=1.4.0",
         "azure-cosmosdb-table~=1.0.5",
         "pycryptodomex~=3.6.6",
         "PyYAML~=3.13",

From 17f808e0f7c20f7eaba61f98f7604e66458cbc37 Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Wed, 28 Nov 2018 16:38:26 -0800
Subject: [PATCH 27/28] remove ghost task, fix yapf

---
 aztk/client/cluster/helpers/copy.py         |  1 -
 aztk/client/cluster/helpers/get.py          |  1 +
 aztk/client/cluster/helpers/list.py         |  1 +
 aztk/client/job/helpers/get.py              |  2 +-
 aztk/spark/client/cluster/helpers/submit.py | 11 ++---------
 5 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/aztk/client/cluster/helpers/copy.py b/aztk/client/cluster/helpers/copy.py
index 31f3e47c..25352679 100644
--- a/aztk/client/cluster/helpers/copy.py
+++ b/aztk/client/cluster/helpers/copy.py
@@ -24,7 +24,6 @@ def cluster_copy(
     else:
         cluster_nodes = [(node, cluster_operations.get_remote_login_settings(cluster_id, node.id)) for node in nodes]
 
-
     with batch_error_manager():
         generated_username, ssh_key = cluster_operations.generate_user_on_cluster(pool.id, nodes)
 
diff --git a/aztk/client/cluster/helpers/get.py b/aztk/client/cluster/helpers/get.py
index d8762921..146d2e8e 100644
--- a/aztk/client/cluster/helpers/get.py
+++ b/aztk/client/cluster/helpers/get.py
@@ -7,6 +7,7 @@ def convert_job_id_to_pool_id(batch_client, cluster_id):
             return job.execution_info.pool_id
     raise error.AztkError("Cluster with id {} does not exist.".format(cluster_id))
 
+
 def get_pool_details(core_cluster_operations, cluster_id: str):
     """
         Print the information for the given cluster
diff --git a/aztk/client/cluster/helpers/list.py b/aztk/client/cluster/helpers/list.py
index b1e06731..087e369b 100644
--- a/aztk/client/cluster/helpers/list.py
+++ b/aztk/client/cluster/helpers/list.py
@@ -6,6 +6,7 @@ def job_get_pool(core_cluster_operations, job):
     if job.execution_info and job.execution_info.pool_id:
         return core_cluster_operations.batch_client.pool.get(job.execution_info.pool_id)
 
+
 def list_clusters(core_cluster_operations, software_metadata_key):
     """
     List all the cluster on your account.
diff --git a/aztk/client/job/helpers/get.py b/aztk/client/job/helpers/get.py
index ebe9f1c4..ce7136ff 100644
--- a/aztk/client/job/helpers/get.py
+++ b/aztk/client/job/helpers/get.py
@@ -1 +1 @@
-# TODO: pass
\ No newline at end of file
+# TODO: pass
diff --git a/aztk/spark/client/cluster/helpers/submit.py b/aztk/spark/client/cluster/helpers/submit.py
index 78e33241..b8b174f8 100644
--- a/aztk/spark/client/cluster/helpers/submit.py
+++ b/aztk/spark/client/cluster/helpers/submit.py
@@ -48,15 +48,8 @@ def schedule_with_target(
         internal,
 ):
     # upload "real" task definition to storage
-    serialized_task_resource_file = upload_serialized_task_to_storage(core_cluster_operations.block_blob_service, cluster_id,
-                                                                      task)
-    # # schedule "ghost" task
-    ghost_task = batch_models.TaskAddParameter(
-        id=task.id,
-        command_line="/bin/bash",
-    )
-    # tell the node to run the task
-    core_cluster_operations.batch_client.task.add(cluster_id, task=ghost_task)
+    serialized_task_resource_file = upload_serialized_task_to_storage(core_cluster_operations.block_blob_service,
+                                                                      cluster_id, task)
 
     task_working_dir = "/mnt/aztk/startup/tasks/workitems/{}".format(task.id)
 

From e96a411c07067526bb6a10a256184129224aaa1e Mon Sep 17 00:00:00 2001
From: Jake Freck
Date: Wed, 28 Nov 2018 16:42:37 -0800
Subject: [PATCH 28/28] yapf

---
 aztk_cli/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aztk_cli/utils.py b/aztk_cli/utils.py
index 40d16682..e3d6521b 100644
--- a/aztk_cli/utils.py
+++ b/aztk_cli/utils.py
@@ -237,7 +237,7 @@ def print_batch_exception(batch_exception):
         for mesg in batch_exception.error.values:
             log.error("%s:\t%s", mesg.key, mesg.value)
     log.error("-------------------------------------------")
-    
+
 
 def print_jobs(jobs: List[models.Job]):
     print_format = "{:<34}| {:<10}| {:<20}"
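
A note on the tail/current_bytes contract introduced in PATCH 25: get_log reports total_bytes on every call and, when tail=True, reads only the byte range [current_bytes, target_bytes - 1] from the Batch task output file, so a caller can poll for incremental output. A minimal client-side sketch of that loop in Python follows; it is illustrative only, assuming an initialized aztk.spark.Client named client and that ApplicationLog.application_state carries the TaskState value set in PATCH 25 (the helper name tail_application_log is hypothetical, not part of the patch series):

    import time

    from aztk.models import TaskState


    def tail_application_log(client, cluster_id, application_name, poll_seconds=3):
        # Poll get_application_log with tail=True, printing only the bytes that
        # arrived since the previous poll (assumes ApplicationLog.log is text).
        current_bytes = 0
        while True:
            app_log = client.cluster.get_application_log(
                id=cluster_id,
                application_name=application_name,
                tail=True,
                current_bytes=current_bytes)
            print(app_log.log, end="")
            if app_log.application_state in (TaskState.Completed, TaskState.Failed):
                return app_log.exit_code
            # get_log sets total_bytes to the end of the range it just read,
            # so the next poll resumes exactly where this one stopped.
            current_bytes = app_log.total_bytes
            time.sleep(poll_seconds)

When the log has not grown between polls (target_bytes == current_bytes), get_log returns an empty log string, so the print above is a no-op for idle iterations.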