From fad97f6f4501cc8f57c983fe679557fec54267f0 Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Wed, 12 Jul 2023 03:19:45 +0300
Subject: [PATCH] Add events to kubectl.wait() and kubectl.rollout() errors

Most errors are timeouts, and the error from kubectl is not helpful:

    error: timed out waiting for the condition on deployments/nginx

Try to add events from the relevant resource to help debugging the
issue.

Here is an example when running a self test in a loop, reaching Docker
Hub rate limit:

    drenv.commands.Error: Command failed:
       command: ('kubectl', 'wait', '--context', 'dr1', 'deploy/nginx', '--for=condition=Available', '--namespace=ns1', '--timeout=120s')
       exitcode: 1
       error:
          error: timed out waiting for the condition on deployments/nginx
       events:
          LAST SEEN TYPE REASON OBJECT MESSAGE
          36m (x27 over 151m) Warning Failed Pod/nginx-7f456874f4-kqljw Failed to pull image "nginx": rpc error: code = Unknown desc = failed to pull and unpack image "docker.io/library/nginx:latest": ...: 429 Too Many Requests - Server message: toomanyrequests: You have reached your pull rate limit.
          You may increase the limit by authenticating and upgrading: https://www.docker.com/increase-rate-limit
          26m (x29 over 151m) Normal Pulling Pod/nginx-7f456874f4-kqljw Pulling image "nginx"
          85s (x648 over 151m) Normal BackOff Pod/nginx-7f456874f4-kqljw Back-off pulling image "nginx"

Signed-off-by: Nir Soffer
---
 test/drenv/kubectl.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/test/drenv/kubectl.py b/test/drenv/kubectl.py
index 6470ce7f6f..80e652ee05 100644
--- a/test/drenv/kubectl.py
+++ b/test/drenv/kubectl.py
@@ -93,7 +93,13 @@ def rollout(action, resource, timeout=300, namespace=None, context=None, log=pri
     args = [action, resource, f"--timeout={timeout}s"]
     if namespace:
         args.append(f"--namespace={namespace}")
-    _watch("rollout", *args, context=context, log=log)
+    try:
+        _watch("rollout", *args, context=context, log=log)
+    except commands.Error as e:
+        # Most failures are timeouts, events may help to debug.
+        if action == "status":
+            e.events = _try_events(resource, namespace=namespace, context=context)
+        raise
 
 
 def wait(
@@ -123,7 +129,21 @@ def wait(
         args.append(f"--for={condition}")
     if namespace:
         args.append(f"--namespace={namespace}")
-    _watch("wait", *args, context=context, log=log)
+
+    try:
+        _watch("wait", *args, context=context, log=log)
+    except commands.Error as e:
+        # Most failures are timeouts, events may help to debug.
+        if resource and not (all or selector):
+            e.events = _try_events(resource, namespace=namespace, context=context)
+        raise
+
+
+def _try_events(resource, namespace=None, context=None):
+    try:
+        return events(resource, namespace=namespace, context=context)
+    except Exception as e:
+        return f"(error getting events: {e})"
 
 
 def _run(cmd, *args, context=None):