diff --git a/README.md b/README.md index 7762150..65936f4 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,9 @@ More details can be found in the [API docs](https://api.cloud.ai4eosc.eu/docs). **Notes**: The catalog caches results for up to 6 hours to improve UX (see [doctring](./ai4papi/routers/v1/modules.py)). +* `/v1/try_me/`: + endpoint where anyone can deploy a short-lived container to try a module + * `/v1/deployments/`: (🔒) deploy modules/tools in the platform to perform trainings diff --git a/ai4papi/conf.py b/ai4papi/conf.py index 09fa7a4..5b58ba6 100644 --- a/ai4papi/conf.py +++ b/ai4papi/conf.py @@ -81,6 +81,11 @@ def load_yaml_conf(fpath): } } +# Try-me endpoints +nmd = load_nomad_job(paths['conf'] / 'try_me' / 'nomad.hcl') +TRY_ME = { + 'nomad': nmd, +} # Retrieve git info from PAPI, to show current version in the docs papi_commit = subprocess.run( diff --git a/ai4papi/routers/v1/__init__.py b/ai4papi/routers/v1/__init__.py index 02ce14e..d35c40f 100644 --- a/ai4papi/routers/v1/__init__.py +++ b/ai4papi/routers/v1/__init__.py @@ -1,12 +1,13 @@ import fastapi -from . import catalog, deployments, secrets, stats +from . 
import catalog, deployments, secrets, stats, try_me app = fastapi.APIRouter() app.include_router(catalog.app) app.include_router(deployments.app) app.include_router(secrets.router) app.include_router(stats.app) +app.include_router(try_me.app) @app.get( diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index 6c579f3..4383b04 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -223,7 +223,7 @@ def get_cluster_stats( for k, v in n_stats.items(): # Ignore keys - if k in ['name', 'namespaces', 'eligibility', 'status']: + if k in ['name', 'namespaces', 'eligibility', 'status', 'tags']: continue # Aggregate nested gpu_models dict @@ -286,6 +286,7 @@ def get_cluster_stats_bg(): n_stats['gpu_models'] = {} n_stats['namespaces'] = node['Meta'].get('namespace', '') n_stats['status'] = node['Meta'].get('status', '') + n_stats['tags'] = node['Meta'].get('tags', '') if n['NodeResources']['Devices']: for devices in n['NodeResources']['Devices']: diff --git a/ai4papi/routers/v1/try_me/__init__.py b/ai4papi/routers/v1/try_me/__init__.py new file mode 100644 index 0000000..a86c86b --- /dev/null +++ b/ai4papi/routers/v1/try_me/__init__.py @@ -0,0 +1,10 @@ +import fastapi + +from . 
import nomad + + +app = fastapi.APIRouter() +app.include_router( + router=nomad.router, + prefix='/try_me', + ) diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py new file mode 100644 index 0000000..ef560fb --- /dev/null +++ b/ai4papi/routers/v1/try_me/nomad.py @@ -0,0 +1,135 @@ +from copy import deepcopy +import uuid + +from fastapi import APIRouter, Depends, HTTPException +from fastapi.security import HTTPBearer + +from ai4papi import auth +import ai4papi.conf as papiconf +from ai4papi.routers.v1.catalog.modules import Modules +from ai4papi.routers.v1.stats.deployments import get_cluster_stats +import ai4papi.nomad.common as nomad + + +router = APIRouter( + prefix="/nomad", + tags=["Nomad trials"], + responses={404: {"description": "Not found"}}, +) +security = HTTPBearer() + + +@router.post("/") +def create_deployment( + module_name: str, + authorization=Depends(security), + ): + """ + Submit a try-me deployment to Nomad. + The deployment will automatically kill itself after a short amount of time. + + This endpoint is meant for everyone to try, but a bearer token is still required to identify the user. + We deploy jobs by default in the AI4EOSC namespace. + + Returns whatever `nomad.create_deployment` returns for the submitted job. + """ + # Retrieve authenticated user info + auth_info = auth.get_user_info(token=authorization.credentials) + + # Retrieve docker_image from module_name + meta = Modules.get_metadata(module_name) + docker_image = meta['sources']['docker_registry_repo'] + + # Load module configuration + nomad_conf = deepcopy(papiconf.TRY_ME['nomad']) + + # Generate UUID from (MAC address+timestamp) so it's unique + job_uuid = uuid.uuid1() + + # Replace the Nomad job template + nomad_conf = nomad_conf.safe_substitute( + { + 'JOB_UUID': job_uuid, + 'NAMESPACE': 'ai4eosc', # (!) 
try-me jobs are always deployed in "ai4eosc" + 'OWNER': auth_info['id'], + 'OWNER_NAME': auth_info['name'], + 'OWNER_EMAIL': auth_info['email'], + 'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'], # idem + 'HOSTNAME': job_uuid, + 'DOCKER_IMAGE': docker_image, + } + ) + + # Convert template to Nomad conf + nomad_conf = nomad.load_job_conf(nomad_conf) + + # Check that the target node (ie. tag='tryme') resources are available because + # these jobs cannot be left queueing + # We check every resource metric (cpu, disk, ram); a zero total (no "tryme" node found) counts as unavailable + stats = get_cluster_stats(vo='vo.ai4eosc.eu') + resources = ['cpu', 'ram', 'disk'] + keys = [f"{i}_used" for i in resources] + [f"{i}_total" for i in resources] + status = {k: 0 for k in keys} + + for _, datacenter in stats['datacenters'].items(): + for _, node in datacenter['nodes'].items(): + if 'tryme' in node['tags']: + for k in keys: + status[k] += node[k] + for r in resources: + if not status[f"{r}_total"] or status[f"{r}_used"] / status[f"{r}_total"] > 0.95: + raise HTTPException( + status_code=503, + detail="Sorry, but there seem to be no resources available right " \ + "now to test the module. Please try later.", + ) + + # Check that the user hasn't too many "try-me" jobs currently running + jobs = nomad.get_deployments( + namespace="ai4eosc", # (!) try-me jobs are always deployed in "ai4eosc" + owner=auth_info['id'], + prefix="try", + ) + if len(jobs) >= 2: + raise HTTPException( + status_code=503, + detail="Sorry, but you seem to be currently running two `Try-me` environments already. " \ + "Before launching a new one, you will need to wait till one of your " \ + "existing environments gets automatically deleted (ca. 10 min)." + ) + + # Submit job + r = nomad.create_deployment(nomad_conf) + + return r + + +@router.get("/{deployment_uuid}") +def get_deployment( + deployment_uuid: str, + authorization=Depends(security), + ): + """ + This function is used mainly to be able to retrieve the endpoint of the try_me job. 
+ We cannot return the endpoint when creating the job, because the final endpoint will + depend on which datacenter the job ends up landing. + + Parameters: + * **deployment_uuid**: uuid of deployment to gather info about + + Returns a dict with info + """ + # Retrieve authenticated user info + auth_info = auth.get_user_info(token=authorization.credentials) + + job = nomad.get_deployment( + deployment_uuid=deployment_uuid, + namespace="ai4eosc", # (!) try-me jobs are always deployed in "ai4eosc" + owner=auth_info['id'], + full_info=True, + ) + + # Rewrite main endpoint, otherwise it automatically selects DEEPaaS API + job['main_endpoint'] = 'ui' + + return job diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl new file mode 100644 index 0000000..d11b580 --- /dev/null +++ b/etc/try_me/nomad.hcl @@ -0,0 +1,154 @@ +/* +Convention: +----------- +* ${UPPERCASE} are replaced by the user +* ${lowercase} are replaced by Nomad at launchtime +* remaining is default, same for everybody + +When replacing user values we use safe_substitute() so that we don't get an error for not +replacing Nomad values +*/ + +job "try-${JOB_UUID}" { + namespace = "${NAMESPACE}" + type = "batch" # try-me jobs should not be redeployed when exit_code=0 + region = "global" + id = "${JOB_UUID}" + priority = "0" # try-me jobs have low priority + + meta { + owner = "${OWNER}" # user-id from OIDC + owner_name = "${OWNER_NAME}" + owner_email = "${OWNER_EMAIL}" + title = "" + description = "" + } + + # Only use nodes that have successfully passed the ai4-nomad_tests (ie. 
meta.status=ready) + constraint { + attribute = "${meta.status}" + operator = "regexp" + value = "ready" + } + + # Only deploy in nodes serving that namespace (we use metadata instead of node-pools + # because Nomad does not allow a node to belong to several node pools) + constraint { + attribute = "${meta.namespace}" + operator = "regexp" + value = "${NAMESPACE}" + } + + # Force that try-me jobs land in "tryme" nodes (that are the ones that have the docker + # images pre-fetched for fast deployment) + constraint { + attribute = "${meta.tags}" + operator = "regexp" + value = "tryme" + } + + group "usergroup" { + + # Do not try to reschedule a try-me job if it raised an error (eg. module incompatible + # with Gradio UI) + reschedule { + attempts = 0 + unlimited = false + } + + network { + + port "ui" { + to = 80 # -1 will assign random port + } + port "api" { + to = 5000 # -1 will assign random port + } + } + + service { + name = "${JOB_UUID}-ui" + port = "ui" + tags = [ + "traefik.enable=true", + "traefik.http.routers.${JOB_UUID}-ui.tls=true", + "traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", + ] + } + + ephemeral_disk { + size = 300 # MB + } + + task "main" { # DEEPaaS API + + # Run as a prestart task to make sure deepaas has already launched when launching the deepaas UI + lifecycle { + hook = "prestart" + sidecar = true + } + + driver = "docker" + + config { + force_pull = true + image = "${DOCKER_IMAGE}:latest" + command = "deep-start" + args = ["--deepaas"] + ports = ["api"] + shm_size = 1000000000 # 1GB + memory_hard_limit = 2000 # 2GB + } + + # (!) 
Keep in mind that if a module works locally but isn't working in Nomad, + # the reason is likely that these resources are too low and the module freezes + resources { + cores = 1 + memory = 2000 # 2GB + memory_max = 2000 # 2GB + } + + # Do not try to restart a try-me job if it fails to launch deepaas + # This is usually due to the fact that the Docker image took too long to download + # and failed with error: `Failed to pull ai4oshub/...: context deadline exceeded` + # Restarting in the same node won't fix the connectivity issues + restart { + attempts = 0 + mode = "fail" + } + + } + + task "ui" { # DEEPaaS UI (Gradio) + + driver = "docker" + + config { + force_pull = true + image = "registry.services.ai4os.eu/ai4os/deepaas_ui:latest" + ports = ["ui"] + shm_size = 250000000 # 250MB + memory_hard_limit = 500 # MB + } + + env { + DURATION = "10m" # kill job after 10 mins + UI_PORT = 80 + } + + resources { + cpu = 500 # MHz + memory = 500 # MB + memory_max = 500 # MB + } + + # Do not try to restart a try-me job if it raises an error (module incompatible with Gradio UI) + restart { + attempts = 0 + mode = "fail" + } + + } + + } +}