diff --git a/dial-docker-compose/ci/ollama/test/app.py b/dial-docker-compose/ci/ollama/test/app.py index d3c133d1..6f805f43 100644 --- a/dial-docker-compose/ci/ollama/test/app.py +++ b/dial-docker-compose/ci/ollama/test/app.py @@ -1,9 +1,13 @@ +import base64 import os +from pathlib import Path import aiohttp import asyncio import backoff import logging +import time +from contextlib import asynccontextmanager def get_env(name: str) -> str: @@ -21,6 +25,15 @@ def get_env(name: str) -> str: log = logging.getLogger(__name__) +@asynccontextmanager +async def timer(name: str): + log.debug(f"[{name}] Starting...") + start = time.perf_counter() + yield + elapsed = time.perf_counter() - start + log.debug(f"[{name}] Executed in {elapsed:.2f} seconds") + + @backoff.on_exception( backoff.expo, (aiohttp.ClientError, aiohttp.ServerTimeoutError), @@ -35,13 +48,15 @@ async def post_with_retry(url: str, payload: dict, headers: dict, params: dict): return await response.json() -async def test_model(deployment_id: str): +def read_image_base64(png_file: Path) -> str: + return base64.b64encode(png_file.read_bytes()).decode("utf-8") + +async def dial_chat_completion(deployment_id: str, messages: list) -> str: api_url = f"{DIAL_URL}/openai/deployments/{deployment_id}/chat/completions" - message = "12 + 23 = ? Reply with a single number:" payload = { "model": deployment_id, - "messages": [{"role": "user", "content": message}], + "messages": messages, "stream": False, } headers = {"api-key": DIAL_API_KEY} @@ -52,12 +67,44 @@ async def test_model(deployment_id: str): content = body.get("choices", [])[0].get("message", {}).get("content", "") - if "35" not in content: - raise ValueError(f"Test failed for {deployment_id!r}. ") + log.debug(f"Content: {content}") + + return content + +async def test_chat_model(deployment_id: str): + message = "2 + 3 = ? 
Reply with a single number:" + messages = [{"role": "user", "content": message}] + content = await dial_chat_completion(deployment_id, messages) + + if "5" not in content: + raise ValueError(f"Test failed for {deployment_id!r}") +async def test_vision_model(deployment_id: str): + base64_data = read_image_base64(Path("./image.png")) + base64_image = f"data:image/png;base64,{base64_data}" + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe the image"}, + {"type": "image_url", "image_url": {"url": base64_image}}, + ], + } + ] + + content = await dial_chat_completion(deployment_id, messages) + + if "vision" not in content.lower(): + raise ValueError(f"Test failed for {deployment_id!r}") + async def tests(): - await test_model("ollama-model") + async with timer("Testing chat-model"): + await test_chat_model("chat-model") + + async with timer("Testing vision-model"): + await test_vision_model("vision-model") if __name__ == "__main__": diff --git a/dial-docker-compose/ci/ollama/test/image.png b/dial-docker-compose/ci/ollama/test/image.png new file mode 100644 index 00000000..bfbcfc7e Binary files /dev/null and b/dial-docker-compose/ci/ollama/test/image.png differ diff --git a/dial-docker-compose/ollama/core/config.json b/dial-docker-compose/ollama/core/config.json index 1a83c364..4a55bd1e 100644 --- a/dial-docker-compose/ollama/core/config.json +++ b/dial-docker-compose/ollama/core/config.json @@ -1,10 +1,25 @@ { "routes": {}, "models": { - "ollama-model": { + "chat-model": { "type": "chat", - "displayName": "Self-hosted model", + "displayName": "Self-hosted chat model", "endpoint": "http://ollama:11434/v1/chat/completions" + }, + "vision-model": { + "type": "chat", + "displayName": "Self-hosted vision model", + "endpoint": "http://adapter-openai:5000/openai/deployments/vision-model/chat/completions", + "inputAttachmentTypes": [ + "image/png", + "image/jpeg" + ], + "upstreams": [ + { + "endpoint": "http://ollama:11434/v1/chat/completions", + "key": "dummy-key" + } + ] } }, "keys": { @@ -16,8 +31,9 @@ "roles": { "default": { "limits": { - "ollama-model": {} + "chat-model": {}, + "vision-model": {} } } } -} +} \ No newline at end of file diff --git a/dial-docker-compose/ollama/docker-compose.yml b/dial-docker-compose/ollama/docker-compose.yml index 7e1e8ab2..9712aa6a 100644 --- a/dial-docker-compose/ollama/docker-compose.yml +++ b/dial-docker-compose/ollama/docker-compose.yml @@ -14,15 +14,21 @@ services: depends_on: ollama: condition: service_started - image: alpine:3.20.3 + build: ./ollama_setup environment: - OLLAMA_URL=http://ollama:11434 - - OLLAMA_MODEL=${OLLAMA_MODEL:-llama3.1:8b-instruct-q4_K_M} - volumes: - - ./ollama_setup.sh:/setup.sh - command: sh /setup.sh + - OLLAMA_CHAT_MODEL=${OLLAMA_CHAT_MODEL} + - OLLAMA_VISION_MODEL=${OLLAMA_VISION_MODEL} healthcheck: test: ["CMD", "test", "-f", "/healthy"] interval: 10s start_period: 10s retries: 10 + + adapter-openai: + image: epam/ai-dial-adapter-openai:0.14.0 + environment: + WEB_CONCURRENCY: "3" + DIAL_URL: "http://core:8080" + DIAL_USE_FILE_STORAGE: "True" + GPT4_VISION_DEPLOYMENTS: "vision-model" \ No newline at end of file diff --git a/dial-docker-compose/ollama/ollama_setup.sh b/dial-docker-compose/ollama/ollama_setup.sh deleted file mode 100755 index d45c4c89..00000000 --- a/dial-docker-compose/ollama/ollama_setup.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -set -e - -apk add --no-cache curl - -until curl -s ${OLLAMA_URL}; do - echo "Waiting for Ollama..." 
- sleep 5 -done - -echo "Pulling $OLLAMA_URL..." -curl -vL --fail-with-body "$OLLAMA_URL/api/pull" -d "{\"name\": \"$OLLAMA_MODEL\", \"stream\": false}" - -OLLAMA_MODEL_ALIAS=ollama-model - -echo "Making alias for $OLLAMA_URL: $OLLAMA_MODEL_ALIAS..." -curl -vL --fail-with-body "$OLLAMA_URL/api/copy" -d "{\"source\": \"$OLLAMA_MODEL\", \"destination\": \"$OLLAMA_MODEL_ALIAS\"}" - -echo "Loading the model into memory..." -curl -vL --fail-with-body "$OLLAMA_URL/api/generate" -d "{\"model\": \"$OLLAMA_MODEL_ALIAS\"}" - -touch /healthy - -tail -f /dev/null \ No newline at end of file diff --git a/dial-docker-compose/ollama/ollama_setup/Dockerfile b/dial-docker-compose/ollama/ollama_setup/Dockerfile new file mode 100644 index 00000000..3afdb789 --- /dev/null +++ b/dial-docker-compose/ollama/ollama_setup/Dockerfile @@ -0,0 +1,4 @@ +FROM python:3.11-alpine +RUN pip install requests +COPY setup.py /setup.py +CMD ["sh", "-c", "python /setup.py && tail -f /dev/null"] diff --git a/dial-docker-compose/ollama/ollama_setup/setup.py b/dial-docker-compose/ollama/ollama_setup/setup.py new file mode 100755 index 00000000..85e26858 --- /dev/null +++ b/dial-docker-compose/ollama/ollama_setup/setup.py @@ -0,0 +1,87 @@ +import asyncio +from contextlib import asynccontextmanager +import logging +import os +import time +import requests + +OLLAMA_URL = os.getenv("OLLAMA_URL") +if OLLAMA_URL is None: + raise RuntimeError("OLLAMA_URL env var isn't set") + +OLLAMA_CHAT_MODEL = os.getenv("OLLAMA_CHAT_MODEL") +OLLAMA_VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL") + +HEALTH_FILE = "/healthy" + +log = logging.getLogger(__name__) +log.setLevel(logging.INFO) + +log.info(f"OLLAMA_URL = {OLLAMA_URL}") +log.info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}") +log.info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}") + + +@asynccontextmanager +async def timer(name: str): + log.info(f"[{name}] Starting...") + start = time.perf_counter() + yield + elapsed = time.perf_counter() - start + log.info(f"[{name}] Executed in {elapsed:.2f} seconds") + + +async def wait_for_ollama(): + while True: + try: + if requests.get(OLLAMA_URL).ok: + break + except requests.RequestException: + pass + await asyncio.sleep(1) + + +async def pull_model(model): + data = {"name": model, "stream": False} + requests.post(f"{OLLAMA_URL}/api/pull", json=data).raise_for_status() + + +async def create_alias(source, dest): + data = {"source": source, "destination": dest} + requests.post(f"{OLLAMA_URL}/api/copy", json=data).raise_for_status() + + +async def load_model(model): + data = {"model": model} + requests.post(f"{OLLAMA_URL}/api/generate", json=data).raise_for_status() + + +async def mark_as_healthy(): + open(HEALTH_FILE, "w").close() + + +async def main(): + async with timer("Waiting for Ollama to start"): + await wait_for_ollama() + + for model, alias in [ + (OLLAMA_CHAT_MODEL, "chat-model"), + (OLLAMA_VISION_MODEL, "vision-model"), + ]: + if model: + async with timer(f"Pulling model {model}"): + await pull_model(model) + + async with timer(f"Creating alias for {model}: {alias}"): + await create_alias(model, alias) + + if model_to_load := OLLAMA_CHAT_MODEL or OLLAMA_VISION_MODEL: + async with timer(f"Loading model {model_to_load} into memory"): + await load_model(model_to_load) + + await mark_as_healthy() + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main()) diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md index 260b1dce..c850eec3 100644 --- 
a/docs/tutorials/quick-start-with-self-hosted-model.md
+++ b/docs/tutorials/quick-start-with-self-hosted-model.md
@@ -12,20 +12,57 @@ Docker engine installed on your machine (Docker Compose Version 2.20.0 +).
 
 ## Step 1: Get AI DIAL
 
-[Download](https://github.com/epam/ai-dial/tree/main/dial-docker-compose/ollama/) AI DIAL.
+Clone [the repository](https://github.com/epam/ai-dial/) with the tutorials and change directory to the following folder:
 
-## Step 2: Launch AI DIAL Chat
+```sh
+cd dial-docker-compose/ollama
+```
 
-Run `docker compose up` from the folder with the [docker-compose file](https://github.com/epam/ai-dial/blob/main/dial-docker-compose/ollama/docker-compose.yml).
+## Step 2: Choose a model to run
 
-> By default [llama3.1:8b-instruct-q4_K_M](https://ollama.com/library/llama3.1:8b-instruct-q4_K_M) model will be pulled and loaded into the memory of the Ollama server automatically. This is going to take around a minute on the first run.
->
-> You could specify the model via the environment variable `OLLAMA_MODEL`:
->
-> ```sh
-> OLLAMA_MODEL=model_of_your_choice docker compose up
-> ```
->
-> Find the available models at the [Ollama model library](https://ollama.com/library).
+Ollama supports a wide range of popular open-source models.
 
-Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and chat with the model.
\ No newline at end of file
+First, consider the modality you are interested in: do you need a regular text-to-text chat model or a multi-modal vision model?
+
+Use the feature tags at https://ollama.com/search to find an appropriate model, e.g. one that works with `Code`, supports `Tools`, or has the `Vision` capability.
+
+We recommend choosing one of the following models, which we have tested:
+
+|Model|Vision|Tools|Streaming|
+|----|----|----|----|
+|[llama3.1:8b-instruct-q4_0](https://ollama.com/library/llama3.1:8b-instruct-q4_0)|❌|✅ *(only in non-streaming mode)*|✅|
+|[mistral:7b-instruct-q4_0](https://ollama.com/library/mistral:7b-instruct-q4_0)|❌|❌|✅|
+|[phi3.5:3.8b-mini-instruct-q4_0](https://ollama.com/library/phi3.5:3.8b-mini-instruct-q4_0)|❌|❌|✅|
+|[gemma2:2b-instruct-q4_0](https://ollama.com/library/gemma2:2b-instruct-q4_0)|❌|❌|✅|
+|[llava:7b-v1.6-mistral-q4_0](https://ollama.com/library/llava:7b-v1.6-mistral-q4_0)|✅|❌|❌|
+|[llava-phi3:3.8b-mini-q4_0](https://ollama.com/library/llava-phi3:3.8b-mini-q4_0)|✅|❌|❌|
+
+As the table shows, all the listed chat models support streaming, while the tested vision models do not.
+
+## Step 3: Launch AI DIAL Chat
+
+### Chat model
+
+If you have chosen a regular chat model _(e.g. llama3.1:8b-instruct-q4_0)_, run the following command:
+
+```sh
+OLLAMA_CHAT_MODEL=model_of_your_choice docker compose up --abort-on-container-exit
+```
+
+The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run.
+
+Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select the `Self-hosted chat model` deployment to converse with the model.
+
+### Vision model
+
+If you have chosen a vision model _(e.g. llava-phi3:3.8b-mini-q4_0)_, run the following command:
+
+```sh
+OLLAMA_VISION_MODEL=model_of_your_choice docker compose up --abort-on-container-exit
+```
+
+The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run.
+
+Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select the `Self-hosted vision model` deployment to converse with the model.
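+
+If you prefer to verify the vision deployment from code rather than through the chat UI, the sketch below sends a base64-encoded PNG to the `vision-model` deployment, mirroring the payload used by this repository's CI test. It assumes the DIAL core API is published on `http://localhost:8080` and that `API_KEY` is replaced with a key from the `keys` section of `core/config.json`; adjust both to your setup.
+
+```python
+import base64
+
+import requests  # any HTTP client works; requests is what the setup script in this tutorial uses
+
+DIAL_URL = "http://localhost:8080"  # assumption: the port published by your compose setup
+API_KEY = "YOUR_DIAL_API_KEY"       # a key from the "keys" section of core/config.json
+IMAGE_PATH = "image.png"            # any small PNG on your machine
+
+# Encode the image as a data URI, as expected by the chat completions API.
+with open(IMAGE_PATH, "rb") as f:
+    image_b64 = base64.b64encode(f.read()).decode("utf-8")
+
+payload = {
+    "model": "vision-model",
+    "stream": False,
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Describe the image"},
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
+            ],
+        }
+    ],
+}
+
+# Vision models can take minutes to answer, so allow a generous read timeout.
+response = requests.post(
+    f"{DIAL_URL}/openai/deployments/vision-model/chat/completions",
+    headers={"api-key": API_KEY},
+    json=payload,
+    timeout=(10, 600),
+)
+response.raise_for_status()
+print(response.json()["choices"][0]["message"]["content"])
+```
+
+The same request shape, with a plain string `content` and the `chat-model` deployment name, works for checking a chat model.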
+
+> Note that the vision models we tested do not support streaming of responses. Moreover, they are typically more computationally expensive than chat models, so it may take minutes for a vision model to respond.
\ No newline at end of file