diff --git a/dial-docker-compose/ci/ollama/test/app.py b/dial-docker-compose/ci/ollama/test/app.py
index 66e49085..b3af19a8 100644
--- a/dial-docker-compose/ci/ollama/test/app.py
+++ b/dial-docker-compose/ci/ollama/test/app.py
@@ -58,7 +58,8 @@ async def dial_chat_completion(deployment_id: str, messages: list) -> str:
     payload = {
         "model": deployment_id,
         "messages": messages,
-        "stream": False,
+        "temperature": 0.0,
+        "stream": False
     }
     headers = {"api-key": DIAL_API_KEY}
     params = {"api-version": DIAL_API_VERSION}
diff --git a/dial-docker-compose/common.yml b/dial-docker-compose/common.yml
index 8e2ae74a..3a5e1912 100644
--- a/dial-docker-compose/common.yml
+++ b/dial-docker-compose/common.yml
@@ -17,6 +17,7 @@ services:
       DIAL_API_HOST: "http://core:8080"
       DIAL_API_KEY: "dial_api_key"
       ENABLED_FEATURES: "conversations-section,prompts-section,top-settings,top-clear-conversation,top-chat-info,top-chat-model-settings,empty-chat-settings,header,footer,request-api-key,report-an-issue,likes,input-files,attachments-manager"
+      KEEP_ALIVE_TIMEOUT: ${CHAT_KEEP_ALIVE_TIMEOUT}

   redis:
     image: redis:7.2.4-alpine3.19
diff --git a/dial-docker-compose/ollama/.env b/dial-docker-compose/ollama/.env
index cdabc6b5..88f47f39 100644
--- a/dial-docker-compose/ollama/.env
+++ b/dial-docker-compose/ollama/.env
@@ -1,4 +1,6 @@
 DIAL_DIR="./ollama"
+CHAT_KEEP_ALIVE_TIMEOUT=600000
+
 OLLAMA_CHAT_MODEL=
 OLLAMA_VISION_MODEL=
 OLLAMA_EMBEDDING_MODEL=
\ No newline at end of file
diff --git a/dial-docker-compose/ollama/docker-compose.yml b/dial-docker-compose/ollama/docker-compose.yml
index 68e316f0..25fb1ebb 100644
--- a/dial-docker-compose/ollama/docker-compose.yml
+++ b/dial-docker-compose/ollama/docker-compose.yml
@@ -20,11 +20,6 @@ services:
       - OLLAMA_CHAT_MODEL=${OLLAMA_CHAT_MODEL}
       - OLLAMA_VISION_MODEL=${OLLAMA_VISION_MODEL}
       - OLLAMA_EMBEDDING_MODEL=${OLLAMA_EMBEDDING_MODEL}
-    healthcheck:
-      test: ["CMD", "test", "-f", "/healthy"]
-      interval: 10s
-      start_period: 10s
-      retries: 10

   adapter-openai:
     image: epam/ai-dial-adapter-openai:0.14.0
diff --git a/dial-docker-compose/ollama/ollama_setup/.dockerignore b/dial-docker-compose/ollama/ollama_setup/.dockerignore
new file mode 100644
index 00000000..0b65cd6d
--- /dev/null
+++ b/dial-docker-compose/ollama/ollama_setup/.dockerignore
@@ -0,0 +1,2 @@
+.dockerignore
+.venv
\ No newline at end of file
diff --git a/dial-docker-compose/ollama/ollama_setup/Dockerfile b/dial-docker-compose/ollama/ollama_setup/Dockerfile
index 91b223bf..626ad803 100644
--- a/dial-docker-compose/ollama/ollama_setup/Dockerfile
+++ b/dial-docker-compose/ollama/ollama_setup/Dockerfile
@@ -1,7 +1,14 @@
 FROM python:3.11-alpine

+RUN apk --no-cache add curl
+
 WORKDIR /app
 COPY * /app
 RUN pip install -r requirements.txt

-CMD ["sh", "-c", "python setup.py && tail -f /dev/null"]
+EXPOSE 5000
+
+HEALTHCHECK --interval=10s --timeout=1s --start-period=10s --retries=10 \
+  CMD curl --fail http://localhost:5000/health || exit 1
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "5000"]
\ No newline at end of file
diff --git a/dial-docker-compose/ollama/ollama_setup/setup.py b/dial-docker-compose/ollama/ollama_setup/app.py
similarity index 57%
rename from dial-docker-compose/ollama/ollama_setup/setup.py
rename to dial-docker-compose/ollama/ollama_setup/app.py
index 3e27663e..5448244e 100755
--- a/dial-docker-compose/ollama/ollama_setup/setup.py
+++ b/dial-docker-compose/ollama/ollama_setup/app.py
@@ -1,12 +1,13 @@
 import asyncio
 from contextlib import asynccontextmanager
 import os
-import sys
-import time

+from fastapi import FastAPI
 from ollama import AsyncClient
 from tqdm import tqdm

+from utils import Writer, print_info, timer
+
 OLLAMA_URL = os.getenv("OLLAMA_URL")
 if OLLAMA_URL is None:
     raise RuntimeError("OLLAMA_URL env var isn't set")
@@ -15,46 +16,15 @@
 OLLAMA_VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL")
 OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL")

-HEALTH_FILE = "/healthy"
-
-
-class Writer:
-    @classmethod
-    def write(cls, s: str):
-        # NOTE: every tqdm progress bar update is deliberately ended with "\n",
-        # otherwise one wouldn't see the bar running in console upon running `docker compose up`.
-        print(s, file=sys.stdout, flush=True, end="\n")
-
-    @classmethod
-    def flush(cls):
-        sys.stdout.flush()
-
-
-print_info = Writer.write
-
-print_info(f"OLLAMA_URL = {OLLAMA_URL}")
-print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
-print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")
-print_info(f"OLLAMA_EMBEDDING_MODEL = {OLLAMA_EMBEDDING_MODEL}")
-
-
-@asynccontextmanager
-async def timer(name: str):
-    print_info(f"[{name}] Starting...")
-    start = time.perf_counter()
-    yield
-    elapsed = time.perf_counter() - start
-    print_info(f"[{name}] Finished in {elapsed:.2f} seconds")
-

 async def wait_for_startup():
-    attempt = 0
+    attempts = 0
     while True:
-        attempt += 1
+        attempts += 1
         try:
             await AsyncClient(host=OLLAMA_URL, timeout=5).ps()
         except Exception:
-            print_info(f"[{attempt:>3}] Waiting for Ollama to start...")
+            print_info(f"[{attempts:>3}] Waiting for Ollama to start...")
             await asyncio.sleep(5)
         else:
             break
@@ -73,30 +43,34 @@ async def pull_model(client: AsyncClient, model: str):

         if status != prev_status and total:
             prev_status = status
-            if progress_bar:
-                progress_bar.close()
             progress_bar = tqdm(
-                total=total, unit="B", unit_scale=True, desc=f"[{status}]", file=Writer
+                total=total,
+                unit="B",
+                unit_scale=True,
+                desc=f"[{status}]",
+                mininterval=1,
+                file=Writer,
             )

-        if completed and progress_bar and total:
+        if completed and total and progress_bar:
             progress_bar.n = completed
-            progress_bar.set_description(f"[{status}]")
-            progress_bar.refresh()
+            progress_bar.update(n=0)

         if total and total == completed and progress_bar:
             progress_bar.close()
+            progress_bar = None

         if not completed and not total:
             print_info(f"[{status}]")


-async def create_health_mark():
-    open(HEALTH_FILE, "w").close()
-
+async def startup():
+    print_info(f"OLLAMA_URL = {OLLAMA_URL}")
+    print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
+    print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")
+    print_info(f"OLLAMA_EMBEDDING_MODEL = {OLLAMA_EMBEDDING_MODEL}")

-async def main():
-    client = AsyncClient(host=OLLAMA_URL, timeout=300000)
+    client = AsyncClient(host=OLLAMA_URL, timeout=300)

     async with timer("Waiting for Ollama to start"):
         await wait_for_startup()
@@ -117,11 +91,18 @@ async def main():
     async with timer(f"Loading model {model_to_load} into memory"):
         await client.generate(model_to_load)

-    await create_health_mark()
-
     print_info("The Ollama server is up and running.")


-if __name__ == "__main__":
-    loop = asyncio.get_event_loop()
-    loop.run_until_complete(main())
+@asynccontextmanager
+async def lifespan(app):
+    await startup()
+    yield
+
+
+app = FastAPI(lifespan=lifespan)
+
+
+@app.get("/health")
+def health_check():
+    return {"status": "ok"}
diff --git a/dial-docker-compose/ollama/ollama_setup/requirements.txt b/dial-docker-compose/ollama/ollama_setup/requirements.txt
index ac6a93f1..6d2cfc8c 100644
--- a/dial-docker-compose/ollama/ollama_setup/requirements.txt
+++ b/dial-docker-compose/ollama/ollama_setup/requirements.txt
@@ -1,3 +1,5 @@
 httpx==0.27.2
 tqdm==4.66.5
-ollama==0.3.3
\ No newline at end of file
+ollama==0.3.3
+fastapi==0.115.0
+uvicorn==0.30.6
\ No newline at end of file
diff --git a/dial-docker-compose/ollama/ollama_setup/utils.py b/dial-docker-compose/ollama/ollama_setup/utils.py
new file mode 100644
index 00000000..f8f6d6eb
--- /dev/null
+++ b/dial-docker-compose/ollama/ollama_setup/utils.py
@@ -0,0 +1,38 @@
+import logging
+import sys
+import time
+from contextlib import asynccontextmanager
+
+
+class Writer:
+    @classmethod
+    def write(cls, s: str):
+        # NOTE: every tqdm progress bar update is deliberately ended with "\n",
+        # otherwise one wouldn't see the bar running in console upon running `docker compose up`.
+        if s in ["\n", ""]:
+            return
+        print(s.strip(), file=sys.stderr, flush=True, end="\n")
+
+    @classmethod
+    def flush(cls):
+        sys.stderr.flush()
+
+
+print_info = Writer.write
+
+
+@asynccontextmanager
+async def timer(name: str):
+    print_info(f"[{name}] Starting...")
+    start = time.perf_counter()
+    yield
+    elapsed = time.perf_counter() - start
+    print_info(f"[{name}] Finished in {elapsed:.2f} seconds")
+
+
+class HealthFilter(logging.Filter):
+    def filter(self, record: logging.LogRecord) -> bool:
+        return record.getMessage().find("/health") == -1
+
+
+logging.getLogger("uvicorn.access").addFilter(HealthFilter())
diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md
index 77b18828..459c95db 100644
--- a/docs/tutorials/quick-start-with-self-hosted-model.md
+++ b/docs/tutorials/quick-start-with-self-hosted-model.md
@@ -65,15 +65,15 @@ All the models support streaming.
    docker compose up --abort-on-container-exit
    ```

-   > Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or dozens of minutes)_ to download them on the first run depending on your Internet bandwidth.
+   > Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or more)_ to download it on the first run, depending on your internet bandwidth and the size of the model you choose.
    >
-   > The model is fully loaded once `ollama-setup` service prints `The Ollama server is up and running.`
+   > The models are fully loaded once `ollama-setup` service prints `The Ollama server is up and running.`

 3. Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select an appropriate AI DIAL deployments to converse with:

    * `Self-hosted chat model` deployment for the `OLLAMA_CHAT_MODEL`
    * `Self-hosted vision model` deployment for the `OLLAMA_VISION_MODEL`

-> Note, that the vision models we tested, do not support streaming of response. Moreover, they are typically more computationally expensive than the chat models. So it may take minutes for a vision model to respond.
+   > Note, that the vision models we tested, do not support streaming of response. Moreover, they are typically more computationally expensive than the chat models. So it may take minutes for a vision model to respond.

-The embedding model will become available in AI DIAL under the deployment name `embedding-model` and could be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`.
+   The embedding model will become available in AI DIAL under the deployment name `embedding-model` and could be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`.
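
For reference, a minimal sketch of calling that embedding deployment with `httpx` (already pinned in `ollama_setup/requirements.txt`). It assumes the stack from `dial-docker-compose/ollama` is running locally and that the default `dial_api_key` from `common.yml` is unchanged; the request body follows the OpenAI-style embeddings API and is illustrative only:

```python
# Illustrative only: query the `embedding-model` deployment exposed by DIAL core.
# Assumes the default "dial_api_key" from dial-docker-compose/common.yml.
import httpx

DIAL_URL = "http://localhost:8080"
DIAL_API_KEY = "dial_api_key"

response = httpx.post(
    f"{DIAL_URL}/openai/deployments/embedding-model/embeddings",
    headers={"api-key": DIAL_API_KEY},
    # OpenAI-style embeddings request body; an api-version query parameter may also
    # be expected, as in the chat completion call in the CI test above.
    json={"input": "Hello from DIAL"},
    timeout=60,
)
response.raise_for_status()
print(len(response.json()["data"][0]["embedding"]))
```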