Skip to content

Commit

Permalink
fix: allow purging stuck Nomad jobs (status=queued)
Browse files Browse the repository at this point in the history
  • Loading branch information
IgnacioHeredia committed Aug 19, 2024
1 parent c5bca42 commit 21be7a2
Showing 1 changed file with 17 additions and 18 deletions.
35 changes: 17 additions & 18 deletions ai4papi/nomad/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,30 +336,29 @@ def delete_deployment(
Returns a dict with status
"""
# Check the deployment exists
try:
j = Nomad.job.get_job(
id_=deployment_uuid,
namespace=namespace,
)
except exceptions.URLNotFoundNomadException:
raise HTTPException(
status_code=400,
detail="No deployment exists with this uuid.",
)
# Retrieve the deployment information. Under the hood, this call checks that:
# - the job exists
# - the owner actually owns the job
info = get_deployment(
deployment_uuid=deployment_uuid,
namespace=namespace,
owner=owner,
full_info=False,
)

# Check job does belong to owner
if j['Meta'] and owner != j['Meta'].get('owner', ''):
raise HTTPException(
status_code=400,
detail="You are not the owner of that deployment.",
)
# If the job is in "queued" status, allow deleting it with purge.
# Most of the time, a job in this status is the result of a platform error:
# it gets stuck and cannot be deleted without purging.
if info['status'] == 'queued':
purge = True
else:
purge = False

# Delete deployment
Nomad.job.deregister_job(
id_=deployment_uuid,
namespace=namespace,
purge=False,
purge=purge,
)

return {'status': 'success'}
Expand Down

0 comments on commit 21be7a2

Please sign in to comment.