
Commit

feat: added instructions for self-hosted vision models
adubovik committed Sep 17, 2024
1 parent 626b1d6 commit c51418d
Showing 8 changed files with 225 additions and 52 deletions.
59 changes: 53 additions & 6 deletions dial-docker-compose/ci/ollama/test/app.py
@@ -1,9 +1,13 @@
import base64
import os
from pathlib import Path
import aiohttp
import asyncio
import backoff

import logging
import time
from contextlib import asynccontextmanager


def get_env(name: str) -> str:
@@ -21,6 +25,15 @@ def get_env(name: str) -> str:
log = logging.getLogger(__name__)


@asynccontextmanager
async def timer(name: str):
log.debug(f"[{name}] Starting...")
start = time.perf_counter()
yield
elapsed = time.perf_counter() - start
log.debug(f"[{name}] Executed in {elapsed:.2f} seconds")


@backoff.on_exception(
backoff.expo,
(aiohttp.ClientError, aiohttp.ServerTimeoutError),
@@ -35,13 +48,15 @@ async def post_with_retry(url: str, payload: dict, headers: dict, params: dict):
return await response.json()


async def test_model(deployment_id: str):
def read_image_base64(png_file: Path) -> str:
return base64.b64encode(png_file.read_bytes()).decode("utf-8")

async def dial_chat_completion(deployment_id: str, messages: list) -> str:
api_url = f"{DIAL_URL}/openai/deployments/{deployment_id}/chat/completions"

message = "12 + 23 = ? Reply with a single number:"
payload = {
"model": deployment_id,
"messages": [{"role": "user", "content": message}],
"messages": messages,
"stream": False,
}
headers = {"api-key": DIAL_API_KEY}
@@ -52,12 +67,44 @@ async def test_model(deployment_id: str):

content = body.get("choices", [])[0].get("message", {}).get("content", "")

if "35" not in content:
raise ValueError(f"Test failed for {deployment_id!r}. ")
log.debug(f"Content: {content}")

return content

async def test_chat_model(deployment_id: str):
message = "2 + 3 = ? Reply with a single number:"
messages = [{"role": "user", "content": message}]
content = await dial_chat_completion(deployment_id, messages)

if "5" not in content:
raise ValueError(f"Test failed for {deployment_id!r}")


async def test_vision_model(deployment_id: str):
base64_data = read_image_base64(Path("./image.png"))
base64_image = f"data:image/png;base64,{base64_data}"

messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Describe the image"},
{"type": "image_url", "image_url": {"url": base64_image}},
],
}
]

content = await dial_chat_completion(deployment_id, messages)

if "vision" not in content.lower():
raise ValueError(f"Test failed for {deployment_id!r}")

async def tests():
await test_model("ollama-model")
async with timer("Testing chat-model"):
await test_chat_model("chat-model")

async with timer("Testing vision-model"):
await test_vision_model("vision-model")


if __name__ == "__main__":
Binary file added dial-docker-compose/ci/ollama/test/image.png
24 changes: 20 additions & 4 deletions dial-docker-compose/ollama/core/config.json
@@ -1,10 +1,25 @@
{
"routes": {},
"models": {
"ollama-model": {
"chat-model": {
"type": "chat",
"displayName": "Self-hosted model",
"displayName": "Self-hosted chat model",
"endpoint": "http://ollama:11434/v1/chat/completions"
},
"vision-model": {
"type": "chat",
"displayName": "Self-hosted vision model",
"endpoint": "http://adapter-openai:5000/openai/deployments/vision-model/chat/completions",
"inputAttachmentTypes": [
"image/png",
"image/jpeg"
],
"upstreams": [
{
"endpoint": "http://ollama:11434/v1/chat/completions",
"key": "dummy-key"
}
]
}
},
"keys": {
@@ -16,8 +31,9 @@
"roles": {
"default": {
"limits": {
"ollama-model": {}
"chat-model": {},
"vision-model": {}
}
}
}
}
}
16 changes: 11 additions & 5 deletions dial-docker-compose/ollama/docker-compose.yml
@@ -14,15 +14,21 @@ services:
depends_on:
ollama:
condition: service_started
image: alpine:3.20.3
build: ./ollama_setup
environment:
- OLLAMA_URL=http://ollama:11434
- OLLAMA_MODEL=${OLLAMA_MODEL:-llama3.1:8b-instruct-q4_K_M}
volumes:
- ./ollama_setup.sh:/setup.sh
command: sh /setup.sh
- OLLAMA_CHAT_MODEL=${OLLAMA_CHAT_MODEL}
- OLLAMA_VISION_MODEL=${OLLAMA_VISION_MODEL}
healthcheck:
test: ["CMD", "test", "-f", "/healthy"]
interval: 10s
start_period: 10s
retries: 10

adapter-openai:
image: epam/ai-dial-adapter-openai:0.14.0
environment:
WEB_CONCURRENCY: "3"
DIAL_URL: "http://core:8080"
DIAL_USE_FILE_STORAGE: "True"
GPT4_VISION_DEPLOYMENTS: "vision-model"
24 changes: 0 additions & 24 deletions dial-docker-compose/ollama/ollama_setup.sh

This file was deleted.

4 changes: 4 additions & 0 deletions dial-docker-compose/ollama/ollama_setup/Dockerfile
@@ -0,0 +1,4 @@
FROM python:3.11-alpine
RUN pip install requests
COPY setup.py /setup.py
CMD ["sh", "-c", "python /setup.py && tail -f /dev/null"]
87 changes: 87 additions & 0 deletions dial-docker-compose/ollama/ollama_setup/setup.py
@@ -0,0 +1,87 @@
import asyncio
from contextlib import asynccontextmanager
import logging
import os
import time
import requests

OLLAMA_URL = os.getenv("OLLAMA_URL")
if OLLAMA_URL is None:
raise RuntimeError("OLLAMA_URL env var isn't set")

OLLAMA_CHAT_MODEL = os.getenv("OLLAMA_CHAT_MODEL")
OLLAMA_VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL")

HEALTH_FILE = "/healthy"

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

log.info(f"OLLAMA_URL = {OLLAMA_URL}")
log.info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
log.info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")


@asynccontextmanager
async def timer(name: str):
log.info(f"[{name}] Starting...")
start = time.perf_counter()
yield
elapsed = time.perf_counter() - start
log.info(f"[{name}] Executed in {elapsed:.2f} seconds")


async def wait_for_ollama():
while True:
try:
if requests.get(OLLAMA_URL).ok:
break
except requests.RequestException:
pass
await asyncio.sleep(1)


async def pull_model(model):
data = {"name": model, "stream": False}
requests.post(f"{OLLAMA_URL}/api/pull", json=data).raise_for_status()


async def create_alias(source, dest):
data = {"source": source, "destination": dest}
requests.post(f"{OLLAMA_URL}/api/copy", json=data).raise_for_status()


async def load_model(model):
data = {"model": model}
requests.post(f"{OLLAMA_URL}/api/generate", json=data).raise_for_status()


async def mark_as_healthy():
open(HEALTH_FILE, "w").close()


async def main():
async with timer("Waiting for Ollama to start"):
await wait_for_ollama()

for model, alias in [
(OLLAMA_CHAT_MODEL, "chat-model"),
(OLLAMA_VISION_MODEL, "vision-model"),
]:
if model:
async with timer(f"Pulling model {model}"):
await pull_model(model)

async with timer(f"Creating alias for {model}: {alias}"):
await create_alias(model, alias)

if model_to_load := OLLAMA_CHAT_MODEL or OLLAMA_VISION_MODEL:
async with timer(f"Loading model {model_to_load} into memory"):
await load_model(model_to_load)

await mark_as_healthy()


if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
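
To verify that the setup container has created the `chat-model` and `vision-model` aliases, here is a minimal sketch (not part of this commit) that queries Ollama's `/api/tags` endpoint; it assumes the Ollama API is reachable from where you run it (inside the Docker network it is `http://ollama:11434`, or `http://localhost:11434` if the port is published to the host):

```python
# Sanity-check sketch: list local Ollama models and confirm that the aliases
# created by setup.py exist. OLLAMA_URL is an assumption; adjust it to however
# you reach the Ollama API from your machine.
import os

import requests

OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")

response = requests.get(f"{OLLAMA_URL}/api/tags")
response.raise_for_status()
names = [model["name"] for model in response.json().get("models", [])]
print(names)

for alias in ("chat-model", "vision-model"):
    # Ollama reports tags like "chat-model:latest", so compare by prefix.
    if not any(name.split(":")[0] == alias for name in names):
        print(f"Alias {alias!r} not found (its model may not have been configured)")
```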
63 changes: 50 additions & 13 deletions docs/tutorials/quick-start-with-self-hosted-model.md
@@ -12,20 +12,57 @@ Docker engine installed on your machine (Docker Compose Version 2.20.0 +).
## Step 1: Get AI DIAL

[Download](https://github.com/epam/ai-dial/tree/main/dial-docker-compose/ollama/) AI DIAL.
Clone [the repository](https://github.com/epam/ai-dial/) with the tutorials and change directory to the following folder:

## Step 2: Launch AI DIAL Chat
```sh
cd dial-docker-compose/ollama
```

Run `docker compose up` from the folder with the [docker-compose file](https://github.com/epam/ai-dial/blob/main/dial-docker-compose/ollama/docker-compose.yml).
## Step 2: Choose a model to run

> By default [llama3.1:8b-instruct-q4_K_M](https://ollama.com/library/llama3.1:8b-instruct-q4_K_M) model will be pulled and loaded into the memory of the Ollama server automatically. This is going to take around a minute on the first run.
>
> You could specify the model via the environment variable `OLLAMA_MODEL`:
>
> ```sh
> OLLAMA_MODEL=model_of_your_choice docker compose up
> ```
>
> Find the available models at the [Ollama model library](https://ollama.com/library).
Ollama supports a wide range of popular open-source models.

Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and chat with the model.
First, consider the modality you are interested in: do you need a regular text-to-text chat model or a multi-modal vision model?

Use the feature tags at https://ollama.com/search to find an appropriate model, e.g. one that works with `Code`, supports `Tools`, or has the `Vision` capability.

We recommend choosing one of the following models, which have been tested:

|Model|Vision|Tools|Streaming|
|----|----|----|----|
|[llama3.1:8b-instruct-q4_0](https://ollama.com/library/llama3.1:8b-instruct-q4_0)| |✅ *(only in non-streaming mode)*|✅|
|[mistral:7b-instruct-q4_0](https://ollama.com/library/mistral:7b-instruct-q4_0)| | |✅|
|[phi3.5:3.8b-mini-instruct-q4_0](https://ollama.com/library/phi3.5:3.8b-mini-instruct-q4_0)| | |✅|
|[gemma2:2b-instruct-q4_0](https://ollama.com/library/gemma2:2b-instruct-q4_0)| | |✅|
|[llava:7b-v1.6-mistral-q4_0](https://ollama.com/library/llava:7b-v1.6-mistral-q4_0)|✅| | |
|[llava-phi3:3.8b-mini-q4_0](https://ollama.com/library/llava-phi3:3.8b-mini-q4_0)|✅| | |

All the listed chat models support streaming; the vision models do not (see the note in Step 3).

## Step 3: Launch AI DIAL Chat

### Chat model

If you have chosen a regular chat model _(e.g. llama3.1:8b-instruct-q4_0)_, then run the command:

```sh
OLLAMA_CHAT_MODEL=model_of_your_choice docker compose up --abort-on-container-exit
```

The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run.

Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select the `Self-hosted chat model` deployment to chat with the model.
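
You can also reach the same deployment through the DIAL API directly. The sketch below mirrors the CI test added in this commit; it assumes DIAL Core is exposed on `localhost:8080` and that `DIAL_API_KEY` holds one of the keys configured in `core/config.json`:

```python
# Sketch of a chat completion request to the chat-model deployment via DIAL Core.
# DIAL_URL and DIAL_API_KEY are assumptions; adjust them to your setup.
import os

import requests

DIAL_URL = os.getenv("DIAL_URL", "http://localhost:8080")
DIAL_API_KEY = os.getenv("DIAL_API_KEY", "dial_api_key")  # placeholder key

response = requests.post(
    f"{DIAL_URL}/openai/deployments/chat-model/chat/completions",
    headers={"api-key": DIAL_API_KEY},
    json={
        "model": "chat-model",
        "messages": [{"role": "user", "content": "2 + 3 = ? Reply with a single number:"}],
        "stream": False,
    },
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```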

### Vision model

If you have chosen a vision model _(e.g. llava-phi3:3.8b-mini-q4_0)_, then run the command:

```sh
OLLAMA_VISION_MODEL=model_of_your_choice docker compose up --abort-on-container-exit
```

The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run.

Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select the `Self-hosted vision model` deployment to chat with the model.

> Note that the vision models we tested do not support streaming of responses. Moreover, they are typically more computationally expensive than chat models, so it may take minutes for a vision model to respond.
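
For completeness, the sketch below sends an image to the `vision-model` deployment through the DIAL API, mirroring the CI test added in this commit; as above, the DIAL Core address and API key are assumptions, and `image.png` is any local PNG file:

```python
# Sketch of a vision request to the vision-model deployment via DIAL Core.
# DIAL_URL, DIAL_API_KEY and image.png are assumptions; adjust them to your setup.
import base64
import os
from pathlib import Path

import requests

DIAL_URL = os.getenv("DIAL_URL", "http://localhost:8080")
DIAL_API_KEY = os.getenv("DIAL_API_KEY", "dial_api_key")  # placeholder key

base64_image = base64.b64encode(Path("image.png").read_bytes()).decode("utf-8")

response = requests.post(
    f"{DIAL_URL}/openai/deployments/vision-model/chat/completions",
    headers={"api-key": DIAL_API_KEY},
    json={
        "model": "vision-model",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe the image"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            }
        ],
        "stream": False,
    },
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```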
