
Commit

feat: added instructions for self-hosted vision models
adubovik committed Sep 17, 2024
1 parent 626b1d6 commit c51418d
Showing 8 changed files with 225 additions and 52 deletions.
59 changes: 53 additions & 6 deletions dial-docker-compose/ci/ollama/test/app.py
@@ -1,9 +1,13 @@
import base64
import os
from pathlib import Path
import aiohttp
import asyncio
import backoff

import logging
import time
from contextlib import asynccontextmanager


def get_env(name: str) -> str:
@@ -21,6 +25,15 @@ def get_env(name: str) -> str:
log = logging.getLogger(__name__)


@asynccontextmanager
async def timer(name: str):
log.debug(f"[{name}] Starting...")
start = time.perf_counter()
yield
elapsed = time.perf_counter() - start
log.debug(f"[{name}] Executed in {elapsed:.2f} seconds")


@backoff.on_exception(
backoff.expo,
(aiohttp.ClientError, aiohttp.ServerTimeoutError),
@@ -35,13 +48,15 @@ async def post_with_retry(url: str, payload: dict, headers: dict, params: dict):
return await response.json()


async def test_model(deployment_id: str):
def read_image_base64(png_file: Path) -> str:
return base64.b64encode(png_file.read_bytes()).decode("utf-8")

async def dial_chat_completion(deployment_id: str, messages: list) -> str:
api_url = f"{DIAL_URL}/openai/deployments/{deployment_id}/chat/completions"

message = "12 + 23 = ? Reply with a single number:"
payload = {
"model": deployment_id,
"messages": [{"role": "user", "content": message}],
"messages": messages,
"stream": False,
}
headers = {"api-key": DIAL_API_KEY}
@@ -52,12 +67,44 @@ async def test_model(deployment_id: str):

content = body.get("choices", [])[0].get("message", {}).get("content", "")

if "35" not in content:
raise ValueError(f"Test failed for {deployment_id!r}. ")
log.debug(f"Content: {content}")

return content

async def test_chat_model(deployment_id: str):
message = "2 + 3 = ? Reply with a single number:"
messages = [{"role": "user", "content": message}]
content = await dial_chat_completion(deployment_id, messages)

if "5" not in content:
raise ValueError(f"Test failed for {deployment_id!r}")


async def test_vision_model(deployment_id: str):
base64_data = read_image_base64(Path("./image.png"))
base64_image = f"data:image/png;base64,{base64_data}"

messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Describe the image"},
{"type": "image_url", "image_url": {"url": base64_image}},
],
}
]

content = await dial_chat_completion(deployment_id, messages)

if "vision" not in content.lower():
raise ValueError(f"Test failed for {deployment_id!r}")

async def tests():
await test_model("ollama-model")
async with timer("Testing chat-model"):
await test_chat_model("chat-model")

async with timer("Testing vision-model"):
await test_vision_model("vision-model")


if __name__ == "__main__":
Binary file added dial-docker-compose/ci/ollama/test/image.png
24 changes: 20 additions & 4 deletions dial-docker-compose/ollama/core/config.json
@@ -1,10 +1,25 @@
{
"routes": {},
"models": {
"ollama-model": {
"chat-model": {
"type": "chat",
"displayName": "Self-hosted model",
"displayName": "Self-hosted chat model",
"endpoint": "http://ollama:11434/v1/chat/completions"
},
"vision-model": {
"type": "chat",
"displayName": "Self-hosted vision model",
"endpoint": "http://adapter-openai:5000/openai/deployments/vision-model/chat/completions",
"inputAttachmentTypes": [
"image/png",
"image/jpeg"
],
"upstreams": [
{
"endpoint": "http://ollama:11434/v1/chat/completions",
"key": "dummy-key"
}
]
}
},
"keys": {
@@ -16,8 +31,9 @@
"roles": {
"default": {
"limits": {
"ollama-model": {}
"chat-model": {},
"vision-model": {}
}
}
}
}
}
16 changes: 11 additions & 5 deletions dial-docker-compose/ollama/docker-compose.yml
@@ -14,15 +14,21 @@ services:
depends_on:
ollama:
condition: service_started
image: alpine:3.20.3
build: ./ollama_setup
environment:
- OLLAMA_URL=http://ollama:11434
- OLLAMA_MODEL=${OLLAMA_MODEL:-llama3.1:8b-instruct-q4_K_M}
volumes:
- ./ollama_setup.sh:/setup.sh
command: sh /setup.sh
- OLLAMA_CHAT_MODEL=${OLLAMA_CHAT_MODEL}
- OLLAMA_VISION_MODEL=${OLLAMA_VISION_MODEL}
healthcheck:
test: ["CMD", "test", "-f", "/healthy"]
interval: 10s
start_period: 10s
retries: 10

adapter-openai:
image: epam/ai-dial-adapter-openai:0.14.0
environment:
WEB_CONCURRENCY: "3"
DIAL_URL: "http://core:8080"
DIAL_USE_FILE_STORAGE: "True"
GPT4_VISION_DEPLOYMENTS: "vision-model"
24 changes: 0 additions & 24 deletions dial-docker-compose/ollama/ollama_setup.sh

This file was deleted.

4 changes: 4 additions & 0 deletions dial-docker-compose/ollama/ollama_setup/Dockerfile
@@ -0,0 +1,4 @@
FROM python:3.11-alpine
RUN pip install requests
COPY setup.py /setup.py
CMD ["sh", "-c", "python /setup.py && tail -f /dev/null"]
87 changes: 87 additions & 0 deletions dial-docker-compose/ollama/ollama_setup/setup.py
@@ -0,0 +1,87 @@
import asyncio
from contextlib import asynccontextmanager
import logging
import os
import time
import requests

OLLAMA_URL = os.getenv("OLLAMA_URL")
if OLLAMA_URL is None:
raise RuntimeError("OLLAMA_URL env var isn't set")

OLLAMA_CHAT_MODEL = os.getenv("OLLAMA_CHAT_MODEL")
OLLAMA_VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL")

HEALTH_FILE = "/healthy"

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

log.info(f"OLLAMA_URL = {OLLAMA_URL}")
log.info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
log.info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")


@asynccontextmanager
async def timer(name: str):
log.info(f"[{name}] Starting...")
start = time.perf_counter()
yield
elapsed = time.perf_counter() - start
log.info(f"[{name}] Executed in {elapsed:.2f} seconds")


async def wait_for_ollama():
while True:
try:
if requests.get(OLLAMA_URL).ok:
break
except requests.RequestException:
pass
await asyncio.sleep(1)


async def pull_model(model):
data = {"name": model, "stream": False}
requests.post(f"{OLLAMA_URL}/api/pull", json=data).raise_for_status()


async def create_alias(source, dest):
data = {"source": source, "destination": dest}
requests.post(f"{OLLAMA_URL}/api/copy", json=data).raise_for_status()


async def load_model(model):
data = {"model": model}
requests.post(f"{OLLAMA_URL}/api/generate", json=data).raise_for_status()


async def mark_as_healthy():
open(HEALTH_FILE, "w").close()


async def main():
async with timer("Waiting for Ollama to start"):
await wait_for_ollama()

for model, alias in [
(OLLAMA_CHAT_MODEL, "chat-model"),
(OLLAMA_VISION_MODEL, "vision-model"),
]:
if model:
async with timer(f"Pulling model {model}"):
await pull_model(model)

async with timer(f"Creating alias for {model}: {alias}"):
await create_alias(model, alias)

if model_to_load := OLLAMA_CHAT_MODEL or OLLAMA_VISION_MODEL:
async with timer(f"Loading model {model_to_load} into memory"):
await load_model(model_to_load)

await mark_as_healthy()


if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
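
To verify that the setup container has created the `chat-model` and `vision-model` aliases, here is a minimal sketch (not part of this commit) that queries Ollama's `/api/tags` endpoint; it assumes the Ollama API is reachable from where you run it (inside the Docker network it is `http://ollama:11434`, or `http://localhost:11434` if the port is published to the host):

```python
# Sanity-check sketch: list local Ollama models and confirm that the aliases
# created by setup.py exist. OLLAMA_URL is an assumption; adjust it to however
# you reach the Ollama API from your machine.
import os

import requests

OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")

response = requests.get(f"{OLLAMA_URL}/api/tags")
response.raise_for_status()
names = [model["name"] for model in response.json().get("models", [])]
print(names)

for alias in ("chat-model", "vision-model"):
    # Ollama reports tags like "chat-model:latest", so compare by prefix.
    if not any(name.split(":")[0] == alias for name in names):
        print(f"Alias {alias!r} not found (its model may not have been configured)")
```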
63 changes: 50 additions & 13 deletions docs/tutorials/quick-start-with-self-hosted-model.md
@@ -12,20 +12,57 @@ Docker engine installed on your machine (Docker Compose Version 2.20.0 +).
## Step 1: Get AI DIAL

[Download](https://github.com/epam/ai-dial/tree/main/dial-docker-compose/ollama/) AI DIAL.
Clone [the repository](https://github.com/epam/ai-dial/) with the tutorials and change directory to the following folder:

## Step 2: Launch AI DIAL Chat
```sh
cd dial-docker-compose/ollama
```

Run `docker compose up` from the folder with the [docker-compose file](https://github.com/epam/ai-dial/blob/main/dial-docker-compose/ollama/docker-compose.yml).
## Step 2: Choose a model to run

> By default [llama3.1:8b-instruct-q4_K_M](https://ollama.com/library/llama3.1:8b-instruct-q4_K_M) model will be pulled and loaded into the memory of the Ollama server automatically. This is going to take around a minute on the first run.
>
> You could specify the model via the environment variable `OLLAMA_MODEL`:
>
> ```sh
> OLLAMA_MODEL=model_of_your_choice docker compose up
> ```
>
> Find the available models at the [Ollama model library](https://ollama.com/library).
Ollama supports a wide range of popular open-source models.

Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and chat with the model.
First, consider the modality you are interested in: do you need a regular text-to-text chat model or a multi-modal vision model?

Use the feature tags at https://ollama.com/search to find an appropriate model, e.g. one that works with `Code`, supports `Tools`, or has the `Vision` capability.

We recommend choosing one of the following models, which have been tested:

|Model|Vision|Tools|Streaming|
|----|----|----|----|
|[llama3.1:8b-instruct-q4_0](https://ollama.com/library/llama3.1:8b-instruct-q4_0)| |✅ *(only in non-streaming mode)*|✅|
|[mistral:7b-instruct-q4_0](https://ollama.com/library/mistral:7b-instruct-q4_0)| | |✅|
|[phi3.5:3.8b-mini-instruct-q4_0](https://ollama.com/library/phi3.5:3.8b-mini-instruct-q4_0)| | |✅|
|[gemma2:2b-instruct-q4_0](https://ollama.com/library/gemma2:2b-instruct-q4_0)| | |✅|
|[llava:7b-v1.6-mistral-q4_0](https://ollama.com/library/llava:7b-v1.6-mistral-q4_0)|✅| | |
|[llava-phi3:3.8b-mini-q4_0](https://ollama.com/library/llava-phi3:3.8b-mini-q4_0)|✅| | |

All the listed chat models support streaming; the vision models do not (see the note in Step 3).

## Step 3: Launch AI DIAL Chat

### Chat model

If you have chosen a regular chat model _(e.g. llama3.1:8b-instruct-q4_0)_, then run the command:

```sh
OLLAMA_CHAT_MODEL=model_of_your_choice docker compose up --abort-on-container-exit
```

The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run.

Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select the `Self-hosted chat model` deployment to chat with the model.
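
You can also reach the same deployment through the DIAL API directly. The sketch below mirrors the CI test added in this commit; it assumes DIAL Core is exposed on `localhost:8080` and that `DIAL_API_KEY` holds one of the keys configured in `core/config.json`:

```python
# Sketch of a chat completion request to the chat-model deployment via DIAL Core.
# DIAL_URL and DIAL_API_KEY are assumptions; adjust them to your setup.
import os

import requests

DIAL_URL = os.getenv("DIAL_URL", "http://localhost:8080")
DIAL_API_KEY = os.getenv("DIAL_API_KEY", "dial_api_key")  # placeholder key

response = requests.post(
    f"{DIAL_URL}/openai/deployments/chat-model/chat/completions",
    headers={"api-key": DIAL_API_KEY},
    json={
        "model": "chat-model",
        "messages": [{"role": "user", "content": "2 + 3 = ? Reply with a single number:"}],
        "stream": False,
    },
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```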

### Vision model

If you have chosen a vision model _(e.g. llava-phi3:3.8b-mini-q4_0)_, then run the command:

```sh
OLLAMA_VISION_MODEL=model_of_your_choice docker compose up --abort-on-container-exit
```

The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run.

Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select the `Self-hosted vision model` deployment to chat with the model.

> Note that the vision models we tested do not support streaming of responses. Moreover, they are typically more computationally expensive than chat models, so it may take minutes for a vision model to respond.
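
For completeness, the sketch below sends an image to the `vision-model` deployment through the DIAL API, mirroring the CI test added in this commit; as above, the DIAL Core address and API key are assumptions, and `image.png` is any local PNG file:

```python
# Sketch of a vision request to the vision-model deployment via DIAL Core.
# DIAL_URL, DIAL_API_KEY and image.png are assumptions; adjust them to your setup.
import base64
import os
from pathlib import Path

import requests

DIAL_URL = os.getenv("DIAL_URL", "http://localhost:8080")
DIAL_API_KEY = os.getenv("DIAL_API_KEY", "dial_api_key")  # placeholder key

base64_image = base64.b64encode(Path("image.png").read_bytes()).decode("utf-8")

response = requests.post(
    f"{DIAL_URL}/openai/deployments/vision-model/chat/completions",
    headers={"api-key": DIAL_API_KEY},
    json={
        "model": "vision-model",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe the image"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            }
        ],
        "stream": False,
    },
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```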
