Skip to content

Commit

Permalink
Add configuration for stack trace sidecar container
Browse files Browse the repository at this point in the history
  • Loading branch information
SurbhiJainUSC committed Nov 30, 2023
1 parent 933c8a0 commit 50dacd3
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 17 deletions.
32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,38 @@ all zones.
xpk-test --tpu-type=v5litepod-16 --priority=medium
```

### Workload Debugging
#### Collect Stack Traces
The `cloud-tpu-diagnostics` PyPI package can be used to generate stack traces for workloads running in GKE. It dumps the Python traces when a fault such as a segmentation fault, floating-point exception, or illegal operation exception occurs in the program. It also collects stack traces periodically to help you debug situations in which the program is unresponsive. To enable it, make the following changes in the Docker image that runs in the main container of your Kubernetes Pod.
```python
# main.py
from cloud_tpu_diagnostics import diagnostic
from cloud_tpu_diagnostics.configuration import debug_configuration
from cloud_tpu_diagnostics.configuration import diagnostic_configuration
from cloud_tpu_diagnostics.configuration import stack_trace_configuration
stack_trace_config = stack_trace_configuration.StackTraceConfig(
collect_stack_trace = True,
stack_trace_to_cloud = True)
debug_config = debug_configuration.DebugConfig(
stack_trace_config = stack_trace_config)
diagnostic_config = diagnostic_configuration.DiagnosticConfig(
debug_config = debug_config)
with diagnostic.diagnose(diagnostic_config):
    main_method()  # this is the main method to run
```
This configuration will start collecting stack traces inside the `/tmp/debugging` directory on each Kubernetes Pod.

#### Explore Stack Traces
To explore the stack traces collected in the temporary directory on each Kubernetes Pod, run the following command. It deploys a sidecar container that reads the traces from the `/tmp/debugging` directory.
```shell
python3 xpk.py workload create \
--workload xpk-test-workload --command "python3 main.py" --cluster \
xpk-test --tpu-type=v5litepod-16 --deploy-stacktrace-sidecar=true
```

## Workload Delete
* Workload Delete (delete training job):

Expand Down
115 changes: 98 additions & 17 deletions xpk.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,22 +96,7 @@
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
containers:
- name: {args.docker_name}
image: {docker_image}
env: {args.env}
ports:
- containerPort: 8471
- containerPort: 8080
securityContext:
privileged: true
command:
- bash
- -c
- |
echo XPK Start: $(date) ; {command} ; EXIT_CODE=$? ; echo XPK End: $(date); echo EXIT_CODE=$EXIT_CODE ; sleep 5; exit $EXIT_CODE
resources:
limits:
google.com/tpu: {system.chips_per_vm}
{container}
"""

workload_delete_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
Expand Down Expand Up @@ -1517,6 +1502,85 @@ def setup_docker_image(args) -> tuple[int, str]:

return 0, docker_image

def get_main_and_sidecar_container(args, system, docker_image, command) -> str:
"""Generate the yaml for the main container plus a stack trace sidecar.

The returned fragment is what `workload_create` passes as `container` when
it formats `workload_create_yaml`. It declares, in this order:

* a `stacktrace-explorer` container (image `busybox:1.28`) that sleeps in
60 second steps until `/tmp/debugging` exists and something exists under
it, then streams those files with `tail -n+1 -f`. It mounts the
`tpu-stack-trace` volume read-only.
* the main container, named `args.docker_name`, which runs `command` through
`bash -c` and mounts the same volume at `/tmp/debugging` (no `readOnly`).
* a `volumes` entry naming `tpu-stack-trace`.

Args:
args: user provided arguments for running the command; `docker_name` and
`env` are read from it.
system: system characteristics; `chips_per_vm` is used as the
`google.com/tpu` resource limit.
docker_image: docker image for the main container.
command: command to run in the main container.

Returns:
str:
yaml for the sidecar container, the main container and the shared volume.
"""
# The template is filled in with str.format below, so the `{...}` fields are
# format placeholders. The main container's script prints start/end
# timestamps, remembers the exit status of `{command}` in EXIT_CODE, sleeps
# 5 seconds and then exits with that same status.
# NOTE(review): the `volumes` entry declares only a name and no volume
# source; presumably Kubernetes treats that as an emptyDir - confirm.
# NOTE(review): the nesting of this fragment has to line up with the
# `{container}` placeholder of the surrounding template - verify the
# indentation whenever this literal is edited.
yaml = """- name: stacktrace-explorer
image: busybox:1.28
args: [/bin/sh, -c, "while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*"]
volumeMounts:
- name: tpu-stack-trace
readOnly: true
mountPath: /tmp/debugging
- name: {args.docker_name}
image: {docker_image}
env: {args.env}
ports:
- containerPort: 8471
- containerPort: 8080
securityContext:
privileged: true
command:
- bash
- -c
- |
echo XPK Start: $(date) ; {command} ; EXIT_CODE=$? ; echo XPK End: $(date); echo EXIT_CODE=$EXIT_CODE ; sleep 5; exit $EXIT_CODE
volumeMounts:
- name: tpu-stack-trace
mountPath: /tmp/debugging
resources:
limits:
google.com/tpu: {system.chips_per_vm}
volumes:
- name: tpu-stack-trace
"""
# Substitute the user/system specific values into the template.
return yaml.format(args=args,
system=system,
docker_image=docker_image,
command=command)

def get_main_container(args, system, docker_image, command) -> str:
"""Generate the yaml for the main container only (no stack trace sidecar).

The returned fragment is what `workload_create` passes as `container` when
it formats `workload_create_yaml` and `--deploy-stacktrace-sidecar` is not
'true'. It declares a single container named `args.docker_name` that runs
`command` through `bash -c`, exposes container ports 8471 and 8080, and runs
privileged.

Args:
args: user provided arguments for running the command; `docker_name` and
`env` are read from it.
system: system characteristics; `chips_per_vm` is used as the
`google.com/tpu` resource limit.
docker_image: docker image for the main container.
command: command to run in the main container.

Returns:
str:
yaml for main container
"""
# The template is filled in with str.format below, so the `{...}` fields are
# format placeholders. The script prints start/end timestamps, remembers the
# exit status of `{command}` in EXIT_CODE, sleeps 5 seconds and then exits
# with that same status.
# NOTE(review): the nesting of this fragment has to line up with the
# `{container}` placeholder of the surrounding template - verify the
# indentation whenever this literal is edited.
yaml = """- name: {args.docker_name}
image: {docker_image}
env: {args.env}
ports:
- containerPort: 8471
- containerPort: 8080
securityContext:
privileged: true
command:
- bash
- -c
- |
echo XPK Start: $(date) ; {command} ; EXIT_CODE=$? ; echo XPK End: $(date); echo EXIT_CODE=$EXIT_CODE ; sleep 5; exit $EXIT_CODE
resources:
limits:
google.com/tpu: {system.chips_per_vm}
"""
# Substitute the user/system specific values into the template.
return yaml.format(args=args,
system=system,
docker_image=docker_image,
command=command)

def workload_create(args) -> int:
"""Run jobset apply command for a file.
Expand Down Expand Up @@ -1555,10 +1619,17 @@ def workload_create(args) -> int:
command += ('; WORKER_ID=$HOSTNAME;'
f'gsutil cp -r /tmp/xla_dump/ {args.debug_dump_gcs}/$WORKER_ID')

if args.deploy_stacktrace_sidecar == 'true':
xpk_print('Sidecar container to display stack traces will also be deployed.')
container = get_main_and_sidecar_container(args, system, docker_image, command)
else:
container = get_main_container(args, system, docker_image, command)

yml_string = workload_create_yaml.format(args=args,
system=system,
docker_image=docker_image,
command=command)
command=command,
container=container)
tmp = write_temporary_file(yml_string)
command = f'kubectl apply -f {str(tmp.file.name)}'

Expand Down Expand Up @@ -2215,6 +2286,16 @@ def directory_path_type(value):
),
)

# `workload_create` enables the sidecar only when this value equals the exact
# string 'true'. Lower-case the input and restrict it to true/false so that
# `True`/`TRUE` work as expected and a typo (e.g. `ture`) is rejected by
# argparse instead of silently leaving the sidecar disabled.
workload_create_parser_optional_arguments.add_argument(
    '--deploy-stacktrace-sidecar',
    type=str.lower,
    choices=['true', 'false'],
    default='false',
    help=(
        'Set this to true to deploy a sidecar container that will read '
        'the stack traces collected in /tmp/debugging directory.'
    ),
)

workload_create_parser.set_defaults(func=workload_create)

# "job delete" command parser.
Expand Down

0 comments on commit 50dacd3

Please sign in to comment.