Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add debugging dashboard url to workload creation output #48

Merged
merged 4 commits into from
Dec 14, 2023
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 92 additions & 18 deletions xpk.py
Original file line number Diff line number Diff line change
Expand Up @@ -1887,51 +1887,113 @@ def get_main_container(args, system, docker_image, command) -> str:
docker_image=docker_image,
command=command)

def get_gke_outlier_dashboard(args):
"""Get the identifier of GKE outlier dashboard deployed in the project.
def get_gke_dashboard(args, dashboard_filter):
"""Get the identifier of GKE dashboard deployed in the project.

Args:
args: user provided arguments for running the command.

Returns:
bool:
True if 'gcloud monitoring dashboards list' returned an error or
multiple dashboards with same filter exist in the project,
False otherwise.
str:
identifier of outlier dashboard if deployed in project,
identifier of dashboard if deployed in project,
None otherwise.
"""
outlier_dashboard_filter = "displayName:'GKE - TPU Monitoring Dashboard'"
command = (
'gcloud monitoring dashboards list'
f' --project={args.project} --filter="{outlier_dashboard_filter}" --format="value(name)" --verbosity=error'
f' --project={args.project} --filter="{dashboard_filter}" --format="value(name)" --verbosity=error'
)

return_code, return_value = run_command_for_value(command, 'GKE Dashboard List', args)
Obliviour marked this conversation as resolved.
Show resolved Hide resolved

if return_code != 0:
xpk_print(f'GKE Dashboard List request returned ERROR {return_code}')
return None
xpk_print(f'GKE Dashboard List request returned ERROR {return_code}. '
'If there is a permissions error, please check '
'https://github.com/google/xpk/blob/main/README.md#roles-needed-based-on-permission-errors '
'for possible solutions.')
return True, None

if not return_value:
xpk_print(
f'No dashboard with {outlier_dashboard_filter} found in the'
f' project:{args.project}.'
f'No dashboard with {dashboard_filter} found in the project:{args.project}.'
)
return False, return_value

dashboards = return_value.strip().split('\n')
if len(dashboards) > 1:
xpk_print(
f'Multiple dashboards with same {dashboard_filter} exist in the project:{args.project}. '
'Delete all but one dashboard deployed using https://github.com/google/cloud-tpu-monitoring-debugging.'
)
return True, None

if dashboards[0]:
return False, dashboards[0].strip().split('/')[-1]

return True, None

def get_gke_outlier_dashboard(args):
"""Get the identifier of GKE outlier dashboard deployed in the project.

Args:
args: user provided arguments for running the command.

Returns:
bool:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think get_gke_outlier_dashboard only returns a str

True if 'gcloud monitoring dashboards list' returned an error or
multiple dashboards with same filter exist in the project,
False otherwise.
str:
identifier of outlier dashboard if deployed in project,
None otherwise.
"""
outlier_dashboard_filter = "displayName:'GKE - TPU Monitoring Dashboard'"
is_error, dashboard_id = get_gke_dashboard(args, outlier_dashboard_filter)

# 'gcloud monitoring dashboards list' returned an error, or multiple dashboards with the same filter exist in the project
if is_error:
return None

# 'gcloud monitoring dashboards list' succeeded, but no dashboard matching the filter exists in the project
if not is_error and not dashboard_id:
xpk_print(
'Follow https://github.com/google/cloud-tpu-monitoring-debugging'
' to deploy monitoring dashboard to view statistics and outlier mode of GKE metrics.'
)
return None

dashboards = return_value.strip().split('\n')
if len(dashboards) > 1:
return dashboard_id

def get_gke_debugging_dashboard(args):
"""Get the identifier of GKE debugging dashboard deployed in the project.

Args:
args: user provided arguments for running the command.

Returns:
str:
identifier of debugging dashboard if deployed in project,
None otherwise.
"""
debugging_dashboard_filter = "displayName:'GKE - TPU Logging Dashboard'"
is_error, dashboard_id = get_gke_dashboard(args, debugging_dashboard_filter)

# 'gcloud monitoring dashboards list' returned an error, or multiple dashboards with the same filter exist in the project
if is_error:
return None

# 'gcloud monitoring dashboards list' succeeded, but no dashboard matching the filter exists in the project
if not is_error and not dashboard_id:
xpk_print(
f'Multiple dashboards with same {outlier_dashboard_filter} exist in the project:{args.project}.'
'Delete all but one monitoring dashboard deployed using https://github.com/google/cloud-tpu-monitoring-debugging.'
'Follow https://github.com/google/cloud-tpu-monitoring-debugging'
' to deploy debugging dashboard to view stack traces collected in Cloud Logging.'
)
return None

if dashboards[0]:
return dashboards[0].strip().split('/')[-1]

return None
return dashboard_id


def workload_create(args) -> int:
Expand Down Expand Up @@ -1971,9 +2033,12 @@ def workload_create(args) -> int:
command += ('; WORKER_ID=$HOSTNAME;'
f'gsutil cp -r /tmp/xla_dump/ {args.debug_dump_gcs}/$WORKER_ID')

debugging_dashboard_id = None
if args.deploy_stacktrace_sidecar:
xpk_print('Sidecar container to display stack traces will also be deployed.')
container = get_main_and_sidecar_container(args, system, docker_image, command)
# Get GKE debugging dashboard only when sidecar container is deployed
debugging_dashboard_id = get_gke_debugging_dashboard(args)
else:
container = get_main_container(args, system, docker_image, command)

Expand Down Expand Up @@ -2008,6 +2073,14 @@ def workload_create(args) -> int:
f' To view the metric data for your workload, select {args.workload} from the JobName filter on the dashboard.'
)

if debugging_dashboard_id is not None:
bvandermoon marked this conversation as resolved.
Show resolved Hide resolved
xpk_print(
'Check stack traces collected in Cloud Logging here:'
# pylint: disable=line-too-long
f' https://console.cloud.google.com/monitoring/dashboards/builder/{debugging_dashboard_id}?project={args.project}&f.rlabel.cluster_name.ClusterName={args.cluster}.'
f' To view the stack traces for your workload, select {args.workload} from the JobName filter on the dashboard.'
)

xpk_exit(0)


Expand Down Expand Up @@ -2654,7 +2727,8 @@ def directory_path_type(value):
action='store_true',
help=(
'Add this argument to deploy a sidecar container that will '
'read the stack traces collected in /tmp/debugging directory.'
'read the stack traces collected in /tmp/debugging directory '
'and forward them to Cloud Logging.'
),
)

Expand Down