feat: support try-me endpoints in Nomad #59

Merged
merged 18 commits on Sep 2, 2024
3 changes: 3 additions & 0 deletions README.md
@@ -199,6 +199,9 @@ More details can be found in the [API docs](https://api.cloud.ai4eosc.eu/docs).
**Notes**: The catalog caches results for up to 6 hours to improve UX (see
[docstring](./ai4papi/routers/v1/modules.py)).

* `/v1/try_me/`:
endpoint where anyone can deploy a short-lived container to try a module

* `/v1/deployments/`: (🔒)
deploy modules/tools in the platform to perform trainings

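For illustration, a minimal client-side sketch of the new try-me flow (the `/v1/try_me/nomad` path follows from the router prefixes added in this PR; the base URL, module name and response fields are assumptions):

import requests

API = "https://api.cloud.ai4eosc.eu"   # assumed base URL, taken from the API docs link above
headers = {"Authorization": "Bearer <OIDC access token>"}

# Launch a short-lived try-me deployment of a catalog module (module name is hypothetical)
r = requests.post(
    f"{API}/v1/try_me/nomad",
    params={"module_name": "ai4os-demo-app"},
    headers=headers,
)
job = r.json()  # assumed to contain the job UUID, e.g. under 'job_ID'

# Retrieve the deployment later to obtain its UI endpoint (it only becomes known
# once Nomad has placed the job in a datacenter)
info = requests.get(f"{API}/v1/try_me/nomad/{job['job_ID']}", headers=headers).json()
print(info['main_endpoint'])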
5 changes: 5 additions & 0 deletions ai4papi/conf.py
@@ -81,6 +81,11 @@ def load_yaml_conf(fpath):
}
}

# Try-me endpoints
nmd = load_nomad_job(paths['conf'] / 'try_me' / 'nomad.hcl')
TRY_ME = {
'nomad': nmd,
}

# Retrieve git info from PAPI, to show current version in the docs
papi_commit = subprocess.run(
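The template loaded into `TRY_ME['nomad']` is later filled in the new `try_me` router with `safe_substitute()`, which implies the loader returns a `string.Template`. A minimal sketch of what `load_nomad_job` might look like (its actual implementation is not shown in this diff, so this is an assumption):

from pathlib import Path
from string import Template

def load_nomad_job(fpath: Path) -> Template:
    # Assumed behaviour: read the HCL job description and wrap it in a string.Template,
    # so that ${UPPERCASE} placeholders can later be filled with safe_substitute()
    with open(fpath, 'r') as f:
        return Template(f.read())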
3 changes: 2 additions & 1 deletion ai4papi/routers/v1/__init__.py
@@ -1,12 +1,13 @@
import fastapi

from . import catalog, deployments, secrets, stats
from . import catalog, deployments, secrets, stats, try_me

app = fastapi.APIRouter()
app.include_router(catalog.app)
app.include_router(deployments.app)
app.include_router(secrets.router)
app.include_router(stats.app)
app.include_router(try_me.app)


@app.get(
3 changes: 2 additions & 1 deletion ai4papi/routers/v1/stats/deployments.py
@@ -223,7 +223,7 @@ def get_cluster_stats(
for k, v in n_stats.items():

# Ignore keys
if k in ['name', 'namespaces', 'eligibility', 'status']:
if k in ['name', 'namespaces', 'eligibility', 'status', 'tags']:
continue

# Aggregate nested gpu_models dict
@@ -286,6 +286,7 @@ def get_cluster_stats_bg():
n_stats['gpu_models'] = {}
n_stats['namespaces'] = node['Meta'].get('namespace', '')
n_stats['status'] = node['Meta'].get('status', '')
n_stats['tags'] = node['Meta'].get('tags', '')

if n['NodeResources']['Devices']:
for devices in n['NodeResources']['Devices']:
10 changes: 10 additions & 0 deletions ai4papi/routers/v1/try_me/__init__.py
@@ -0,0 +1,10 @@
import fastapi

from . import nomad


app = fastapi.APIRouter()
app.include_router(
    router=nomad.router,
    prefix='/try_me',
)
135 changes: 135 additions & 0 deletions ai4papi/routers/v1/try_me/nomad.py
@@ -0,0 +1,135 @@
from copy import deepcopy
import uuid

from fastapi import APIRouter, Depends, HTTPException
from fastapi.security import HTTPBearer

from ai4papi import auth
import ai4papi.conf as papiconf
from ai4papi.routers.v1.catalog.modules import Modules
from ai4papi.routers.v1.stats.deployments import get_cluster_stats
import ai4papi.nomad.common as nomad


router = APIRouter(
    prefix="/nomad",
    tags=["Nomad trials"],
    responses={404: {"description": "Not found"}},
)
security = HTTPBearer()


@router.post("/")
def create_deployment(
    module_name: str,
    authorization=Depends(security),
):
    """
    Submit a try-me deployment to Nomad.
    The deployment will automatically terminate itself after a short amount of time.

    This endpoint is meant to be public for everyone to try (no authorization required).
    We deploy jobs by default in the AI4EOSC namespace.

    Returns a string with the endpoint to access the API.
    """
    # Retrieve authenticated user info
    auth_info = auth.get_user_info(token=authorization.credentials)

    # Retrieve docker_image from module_name
    meta = Modules.get_metadata(module_name)
    docker_image = meta['sources']['docker_registry_repo']

    # Load module configuration
    nomad_conf = deepcopy(papiconf.TRY_ME['nomad'])

    # Generate UUID from (MAC address+timestamp) so it's unique
    job_uuid = uuid.uuid1()

    # Replace the Nomad job template
    nomad_conf = nomad_conf.safe_substitute(
        {
            'JOB_UUID': job_uuid,
            'NAMESPACE': 'ai4eosc',  # (!) try-me jobs are always deployed in "ai4eosc"
            'OWNER': auth_info['id'],
            'OWNER_NAME': auth_info['name'],
            'OWNER_EMAIL': auth_info['email'],
            'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'],  # idem
            'HOSTNAME': job_uuid,
            'DOCKER_IMAGE': docker_image,
        }
    )

    # Convert template to Nomad conf
    nomad_conf = nomad.load_job_conf(nomad_conf)

    # Check that the target node (ie. tag='tryme') resources are available, because
    # these jobs cannot be left queueing
    # We check for every resource metric (cpu, disk, ram)
    stats = get_cluster_stats(vo='vo.ai4eosc.eu')
    resources = ['cpu', 'ram', 'disk']
    keys = [f"{i}_used" for i in resources] + [f"{i}_total" for i in resources]
    status = {k: 0 for k in keys}

    for _, datacenter in stats['datacenters'].items():
        for _, node in datacenter['nodes'].items():
            if 'tryme' in node['tags']:
                for k in keys:
                    status[k] += node[k]
    for r in resources:
        if status[f"{r}_used"] / status[f"{r}_total"] > 0.95:
            raise HTTPException(
                status_code=503,
                detail="Sorry, but there seem to be no resources available right " \
                       "now to test the module. Please try later.",
            )

    # Check that the user doesn't have too many "try-me" jobs currently running
    jobs = nomad.get_deployments(
        namespace="ai4eosc",  # (!) try-me jobs are always deployed in "ai4eosc"
        owner=auth_info['id'],
        prefix="try",
    )
    if len(jobs) >= 2:
        raise HTTPException(
            status_code=503,
            detail="Sorry, but you seem to be currently running two `Try-me` environments already. " \
                   "Before launching a new one, you will need to wait till one of your " \
                   "existing environments gets automatically deleted (ca. 10 min)."
        )

    # Submit job
    r = nomad.create_deployment(nomad_conf)

    return r


@router.get("/{deployment_uuid}")
def get_deployment(
    deployment_uuid: str,
    authorization=Depends(security),
):
    """
    This function is used mainly to be able to retrieve the endpoint of the try_me job.
    We cannot return the endpoint when creating the job, because the final endpoint will
    depend on which datacenter the job ends up landing in.

    Parameters:
    * **deployment_uuid**: uuid of deployment to gather info about

    Returns a dict with info
    """
    # Retrieve authenticated user info
    auth_info = auth.get_user_info(token=authorization.credentials)

    job = nomad.get_deployment(
        deployment_uuid=deployment_uuid,
        namespace="ai4eosc",  # (!) try-me jobs are always deployed in "ai4eosc"
        owner=auth_info['id'],
        full_info=True,
    )

    # Rewrite main endpoint, otherwise it automatically selects DEEPaaS API
    job['main_endpoint'] = 'ui'

    return job
154 changes: 154 additions & 0 deletions etc/try_me/nomad.hcl
@@ -0,0 +1,154 @@
/*
Convention:
-----------
* ${UPPERCASE} are replaced by the user
* ${lowercase} are replaced by Nomad at launch time
* remaining is default, same for everybody

When replacing user values we use safe_substitute() so that we don't get an error for not
replacing Nomad values
*/

job "try-${JOB_UUID}" {
  namespace = "${NAMESPACE}"
  type      = "batch"       # try-me jobs should not be redeployed when exit_code=0
  region    = "global"
  id        = "${JOB_UUID}"
  priority  = "0"           # try-me jobs have low priority

  meta {
    owner       = "${OWNER}"  # user-id from OIDC
    owner_name  = "${OWNER_NAME}"
    owner_email = "${OWNER_EMAIL}"
    title       = ""
    description = ""
  }

  # Only use nodes that have successfully passed the ai4-nomad_tests (ie. meta.status=ready)
  constraint {
    attribute = "${meta.status}"
    operator  = "regexp"
    value     = "ready"
  }

  # Only deploy in nodes serving that namespace (we use metadata instead of node-pools
  # because Nomad does not allow a node to belong to several node pools)
  constraint {
    attribute = "${meta.namespace}"
    operator  = "regexp"
    value     = "${NAMESPACE}"
  }

  # Force try-me jobs to land on "tryme" nodes (the ones that have the Docker
  # images pre-fetched for fast deployment)
  constraint {
    attribute = "${meta.tags}"
    operator  = "regexp"
    value     = "tryme"
  }

  group "usergroup" {

    # Do not try to restart a try-me job if it raised an error (eg. module incompatible
    # with Gradio UI)
    reschedule {
      attempts  = 0
      unlimited = false
    }

    network {

      port "ui" {
        to = 80  # -1 will assign random port
      }
      port "api" {
        to = 5000  # -1 will assign random port
      }
    }

    service {
      name = "${JOB_UUID}-ui"
      port = "ui"
      tags = [
        "traefik.enable=true",
        "traefik.http.routers.${JOB_UUID}-ui.tls=true",
        "traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)",
      ]
    }

    ephemeral_disk {
      size = 300  # MB
    }

    task "main" {  # DEEPaaS API

      # Run as a prestart task to make sure deepaas has already launched when launching the deepaas UI
      lifecycle {
        hook    = "prestart"
        sidecar = true
      }

      driver = "docker"

      config {
        force_pull        = true
        image             = "${DOCKER_IMAGE}:latest"
        command           = "deep-start"
        args              = ["--deepaas"]
        ports             = ["api"]
        shm_size          = 1000000000  # 1GB
        memory_hard_limit = 2000        # 2GB
      }

      # (!) Keep in mind that if a module works locally but isn't working in Nomad,
      # the reason is likely that these resources are too low and the module freezes
      resources {
        cores      = 1
        memory     = 2000  # 2GB
        memory_max = 2000  # 2GB
      }

      # Do not try to restart a try-me job if it fails to launch deepaas
      # This is usually due to the fact that the Docker image took too long to download
      # and failed with error: `Failed to pull ai4oshub/...: context deadline exceeded`
      # Restarting in the same node won't fix the connectivity issues
      restart {
        attempts = 0
        mode     = "fail"
      }

    }

    task "ui" {  # DEEPaaS UI (Gradio)

      driver = "docker"

      config {
        force_pull        = true
        image             = "registry.services.ai4os.eu/ai4os/deepaas_ui:latest"
        ports             = ["ui"]
        shm_size          = 250000000  # 250MB
        memory_hard_limit = 500        # MB
      }

      env {
        DURATION = "10m"  # kill job after 10 mins
        UI_PORT  = 80
      }

      resources {
        cpu        = 500  # MHz
        memory     = 500  # MB
        memory_max = 500  # MB
      }

      # Do not try to restart a try-me job if it raises an error (module incompatible with Gradio UI)
      restart {
        attempts = 0
        mode     = "fail"
      }

    }

  }
}
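As a side note on the ${UPPERCASE}/${lowercase} convention documented at the top of this file, a quick illustration of why safe_substitute() is used (the example values are made up):

from string import Template

tpl = Template('Host(`ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)')

# PAPI only fills the uppercase placeholders; ${meta.domain} is left untouched for Nomad
print(tpl.safe_substitute({'HOSTNAME': 'abc', 'BASE_DOMAIN': 'cloud.ai4eosc.eu'}))
# -> Host(`ui-abc.${meta.domain}-cloud.ai4eosc.eu`)

# tpl.substitute(...) would instead fail on ${meta.domain}, which is not a valid
# Python template identifier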