Skip to content

Commit

Permalink
Configuration to deploy a sidecar container to display stack traces (#36
Browse files Browse the repository at this point in the history
)

* Add configuration for stack trace sidecar container

* Add configuration for stack trace sidecar container

* Add configuration for stack trace sidecar container

* Add configuration for stack trace sidecar container

* Add configuration for stack trace sidecar container

* Add configuration for stack trace sidecar container

* Update README
  • Loading branch information
SurbhiJainUSC authored Dec 12, 2023
1 parent 9d74584 commit 0d5fce3
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 17 deletions.
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -415,3 +415,37 @@ gcloud beta compute reservations list --project=$PROJECT_ID
# Find the tpu machine type and current utilization of a reservation.
gcloud beta compute reservations describe $RESERVATION --project=$PROJECT_ID --zone=$ZONE
```
# Workload Debugging
## Collect Stack Traces
The [cloud-tpu-diagnostics](https://pypi.org/project/cloud-tpu-diagnostics/) PyPI package can be used to generate stack traces for workloads running in GKE. This package dumps the Python traces when a fault such as a segmentation fault, floating-point exception, or illegal operation exception occurs in the program. It also periodically collects stack traces to help you debug situations where the program is unresponsive. To enable periodic stack trace collection, make the following changes in the Docker image that runs in the Kubernetes main container.
```python
# main.py
from cloud_tpu_diagnostics import diagnostic
from cloud_tpu_diagnostics.configuration import debug_configuration
from cloud_tpu_diagnostics.configuration import diagnostic_configuration
from cloud_tpu_diagnostics.configuration import stack_trace_configuration
stack_trace_config = stack_trace_configuration.StackTraceConfig(
    collect_stack_trace = True,
    stack_trace_to_cloud = True)
debug_config = debug_configuration.DebugConfig(
    stack_trace_config = stack_trace_config)
diagnostic_config = diagnostic_configuration.DiagnosticConfig(
    debug_config = debug_config)
with diagnostic.diagnose(diagnostic_config):
    main_method() # this is the main method to run
```
This configuration will start collecting stack traces inside the `/tmp/debugging` directory on each Kubernetes Pod.
### Explore Stack Traces
To explore the stack traces collected in the temporary directory of a Kubernetes Pod, run the following command to configure a sidecar container that reads the traces from the `/tmp/debugging` directory.
```shell
python3 xpk.py workload create \
--workload xpk-test-workload --command "python3 main.py" --cluster \
xpk-test --tpu-type=v5litepod-16 --deploy-stacktrace-sidecar
```
98 changes: 81 additions & 17 deletions xpk.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,22 +98,7 @@
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
containers:
- name: {args.docker_name}
image: {docker_image}
env: {args.env}
ports:
- containerPort: 8471
- containerPort: 8080
securityContext:
privileged: true
command:
- bash
- -c
- |
echo XPK Start: $(date) ; {command} ; EXIT_CODE=$? ; echo XPK End: $(date); echo EXIT_CODE=$EXIT_CODE ; sleep 5; exit $EXIT_CODE
resources:
limits:
google.com/tpu: {system.chips_per_vm}
{container}
"""

workload_delete_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
Expand Down Expand Up @@ -1839,6 +1824,69 @@ def setup_docker_image(args) -> tuple[int, str]:

return 0, docker_image

def get_main_and_sidecar_container(args, system, docker_image, command) -> str:
  """Generate yaml for the main container plus a stack trace sidecar.

  The returned fragment contains, in order: the `stacktrace-explorer` sidecar
  container, the main container (as produced by `get_main_container`), a
  `volumeMounts` section placed directly after the main container's yaml, and
  a `volumes` section declaring the `tpu-stack-trace` volume that both
  containers mount at /tmp/debugging.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics
    docker_image: docker image
    command: command to run in the main container

  Returns:
    str:
      yaml for main and sidecar container
  """
  main_container = get_main_container(args, system, docker_image, command)
  # The sidecar waits for the directory, then for at least one trace file, and
  # finally streams every file with `tail -f`. The wait uses `ls` rather than
  # `[ -e /tmp/debugging/* ]`: once the glob matches more than one file, `[`
  # receives several operands and fails with an error instead of testing
  # existence.
  # NOTE(review): the volume is declared without an explicit source;
  # presumably Kubernetes treats it as an emptyDir - confirm.
  # NOTE(review): leading whitespace of the template lines is reproduced
  # exactly as found; confirm it lines up with the `{container}` placeholder
  # in workload_create_yaml.
  sidecar_template = """- name: stacktrace-explorer
image: busybox:1.28
args: [/bin/sh, -c, "while [ ! -d /tmp/debugging ]; do sleep 60; done; until ls /tmp/debugging/* > /dev/null 2>&1; do sleep 60; done; tail -n+1 -f /tmp/debugging/*"]
volumeMounts:
- name: tpu-stack-trace
readOnly: true
mountPath: /tmp/debugging
{main_container}
volumeMounts:
- name: tpu-stack-trace
mountPath: /tmp/debugging
volumes:
- name: tpu-stack-trace
"""
  return sidecar_template.format(main_container=main_container)

def get_main_container(args, system, docker_image, command) -> str:
  """Render the yaml entry describing the workload's main container.

  Args:
    args: user provided arguments for running the command; `docker_name` and
      `env` are read from it.
    system: system characteristics; `chips_per_vm` is read from it.
    docker_image: docker image
    command: command to run in the main container

  Returns:
    str:
      yaml for main container
  """
  # The shell snippet brackets the user command with start/end timestamps and
  # exits with the command's own exit code after a short pause.
  container_template = """- name: {args.docker_name}
image: {docker_image}
env: {args.env}
ports:
- containerPort: 8471
- containerPort: 8080
securityContext:
privileged: true
command:
- bash
- -c
- |
echo XPK Start: $(date) ; {command} ; EXIT_CODE=$? ; echo XPK End: $(date); echo EXIT_CODE=$EXIT_CODE ; sleep 5; exit $EXIT_CODE
resources:
limits:
google.com/tpu: {system.chips_per_vm}
"""
  substitutions = {
      'args': args,
      'system': system,
      'docker_image': docker_image,
      'command': command,
  }
  return container_template.format(**substitutions)

def get_gke_outlier_dashboard(args):
"""Get the identifier of GKE outlier dashboard deployed in the project.
Expand Down Expand Up @@ -1923,10 +1971,17 @@ def workload_create(args) -> int:
command += ('; WORKER_ID=$HOSTNAME;'
f'gsutil cp -r /tmp/xla_dump/ {args.debug_dump_gcs}/$WORKER_ID')

if args.deploy_stacktrace_sidecar:
xpk_print('Sidecar container to display stack traces will also be deployed.')
container = get_main_and_sidecar_container(args, system, docker_image, command)
else:
container = get_main_container(args, system, docker_image, command)

yml_string = workload_create_yaml.format(args=args,
system=system,
docker_image=docker_image,
command=command)
command=command,
container=container)
tmp = write_temporary_file(yml_string)
command = f'kubectl apply -f {str(tmp.file.name)}'

Expand Down Expand Up @@ -2594,6 +2649,15 @@ def directory_path_type(value):
),
)

# Opt-in boolean flag (False unless passed). workload_create reads it as
# `args.deploy_stacktrace_sidecar` to decide whether the sidecar container is
# added next to the main container.
workload_create_parser_optional_arguments.add_argument(
'--deploy-stacktrace-sidecar',
action='store_true',
help=(
'Add this argument to deploy a sidecar container that will '
'read the stack traces collected in /tmp/debugging directory.'
),
)

workload_create_parser.set_defaults(func=workload_create)

# "job delete" command parser.
Expand Down

0 comments on commit 0d5fce3

Please sign in to comment.