From a96274c24b8d9fcc2639f0075e375ba536535111 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Mon, 16 Sep 2024 10:33:23 +0000 Subject: [PATCH 01/16] feat: added docker-compose and test for llama3:8b via ollama --- .github/workflows/pr.yml | 5 ++ .github/workflows/release.yml | 5 ++ .gitignore | 7 +- dial-docker-compose/addon/docker-compose.yml | 2 +- dial-docker-compose/ci/ollama/.env | 1 + .../ci/ollama/docker-compose.yml | 17 +++++ .../ci/ollama/test/.dockerignore | 1 + dial-docker-compose/ci/ollama/test/Dockerfile | 7 ++ dial-docker-compose/ci/ollama/test/app.py | 66 +++++++++++++++++++ .../ci/ollama/test/requirements.txt | 2 + dial-docker-compose/common.yml | 6 +- dial-docker-compose/model/docker-compose.yml | 2 +- dial-docker-compose/ollama/.env | 1 + dial-docker-compose/ollama/core/config.json | 23 +++++++ dial-docker-compose/ollama/docker-compose.yml | 30 +++++++++ dial-docker-compose/ollama/ollama_setup.sh | 26 ++++++++ dial-docker-compose/settings/settings.json | 4 +- 17 files changed, 196 insertions(+), 9 deletions(-) create mode 100644 dial-docker-compose/ci/ollama/.env create mode 100644 dial-docker-compose/ci/ollama/docker-compose.yml create mode 100644 dial-docker-compose/ci/ollama/test/.dockerignore create mode 100644 dial-docker-compose/ci/ollama/test/Dockerfile create mode 100644 dial-docker-compose/ci/ollama/test/app.py create mode 100644 dial-docker-compose/ci/ollama/test/requirements.txt create mode 100644 dial-docker-compose/ollama/.env create mode 100644 dial-docker-compose/ollama/core/config.json create mode 100644 dial-docker-compose/ollama/docker-compose.yml create mode 100755 dial-docker-compose/ollama/ollama_setup.sh diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 5a5fff39..06e5d3ce 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -38,6 +38,11 @@ jobs: with: cwd: "./dial-docker-compose/ci/addon" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 + name: Run quickstart ollama example + with: + cwd: "./dial-docker-compose/ci/ollama" + up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" build: needs: [run-notebooks, run-quickstart] diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 38cdb0ba..9b7f475e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -38,6 +38,11 @@ jobs: with: cwd: "./dial-docker-compose/ci/addon" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 + name: Run quickstart ollama example + with: + cwd: "./dial-docker-compose/ci/ollama" + up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" build-and-deploy: needs: [run-notebooks, run-quickstart] diff --git a/.gitignore b/.gitignore index 288b27ff..1c0d3cb8 100644 --- a/.gitignore +++ b/.gitignore @@ -23,8 +23,11 @@ yarn-error.log* .venv __pycache__ -# DIAL Core logs -*.log +# Docker container volumes +core-data +core-logs +.ollama + /.quarto/ # Autogenerated files by Quarto diff --git a/dial-docker-compose/addon/docker-compose.yml b/dial-docker-compose/addon/docker-compose.yml index 9df5bd32..65c3715d 100644 --- a/dial-docker-compose/addon/docker-compose.yml +++ b/dial-docker-compose/addon/docker-compose.yml @@ -4,7 +4,7 @@ include: services: adapter-openai: - image: epam/ai-dial-adapter-openai:0.11.0 + image: epam/ai-dial-adapter-openai:0.14.0 
environment: WEB_CONCURRENCY: "3" diff --git a/dial-docker-compose/ci/ollama/.env b/dial-docker-compose/ci/ollama/.env new file mode 100644 index 00000000..4daa1416 --- /dev/null +++ b/dial-docker-compose/ci/ollama/.env @@ -0,0 +1 @@ +DIAL_DIR="./ollama" \ No newline at end of file diff --git a/dial-docker-compose/ci/ollama/docker-compose.yml b/dial-docker-compose/ci/ollama/docker-compose.yml new file mode 100644 index 00000000..b209b73f --- /dev/null +++ b/dial-docker-compose/ci/ollama/docker-compose.yml @@ -0,0 +1,17 @@ +include: + - path: ../../ollama/docker-compose.yml + env_file: ./.env + +services: + test: + build: test + environment: + DIAL_URL: "http://core:8080" + DIAL_API_KEY: "dial_api_key" + DIAL_API_VERSION: "2024-02-01" + DIAL_DEPLOYMENT: "llama3:8b" + depends_on: + ollama-setup: + condition: service_healthy + core: + condition: service_healthy \ No newline at end of file diff --git a/dial-docker-compose/ci/ollama/test/.dockerignore b/dial-docker-compose/ci/ollama/test/.dockerignore new file mode 100644 index 00000000..1d1fe94d --- /dev/null +++ b/dial-docker-compose/ci/ollama/test/.dockerignore @@ -0,0 +1 @@ +Dockerfile \ No newline at end of file diff --git a/dial-docker-compose/ci/ollama/test/Dockerfile b/dial-docker-compose/ci/ollama/test/Dockerfile new file mode 100644 index 00000000..c41fc702 --- /dev/null +++ b/dial-docker-compose/ci/ollama/test/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.11-alpine + +WORKDIR /app +COPY * /app +RUN pip install -r requirements.txt + +CMD ["python", "app.py"] \ No newline at end of file diff --git a/dial-docker-compose/ci/ollama/test/app.py b/dial-docker-compose/ci/ollama/test/app.py new file mode 100644 index 00000000..95460af4 --- /dev/null +++ b/dial-docker-compose/ci/ollama/test/app.py @@ -0,0 +1,66 @@ +import os +import aiohttp +import asyncio +import backoff + +import logging + + +def get_env(name: str) -> str: + value = os.environ.get(name) + if value is None: + raise ValueError(f"'{name}' environment variable must be defined") + return value + + +DIAL_URL = get_env("DIAL_URL") +DIAL_API_KEY = get_env("DIAL_API_KEY") +DIAL_API_VERSION = get_env("DIAL_API_VERSION") +DIAL_DEPLOYMENT = get_env("DIAL_DEPLOYMENT") + +logging.basicConfig(level=logging.DEBUG) +log = logging.getLogger(__name__) + + +@backoff.on_exception( + backoff.expo, + (aiohttp.ClientError, aiohttp.ServerTimeoutError), + max_time=60, +) +async def post_with_retry(url: str, payload: dict, headers: dict, params: dict): + async with aiohttp.ClientSession() as session: + async with session.post( + url, json=payload, headers=headers, params=params + ) as response: + response.raise_for_status() + return await response.json() + + +async def test_model(deployment_id: str): + api_url = f"{DIAL_URL}/openai/deployments/{deployment_id}/chat/completions" + + message = "12 + 23 = ? Reply with a single number:" + payload = { + "model": deployment_id, + "messages": [{"role": "user", "content": message}], + "stream": False, + } + headers = {"api-key": DIAL_API_KEY} + params = {"api-version": DIAL_API_VERSION} + + body = await post_with_retry(api_url, payload, headers, params) + log.debug(f"Response: {body}") + + content = body.get("choices", [])[0].get("message", {}).get("content", "") + + if "35" not in content: + raise ValueError(f"Test failed for {deployment_id!r}. 
") + + +async def tests(): + await test_model(DIAL_DEPLOYMENT) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(tests()) diff --git a/dial-docker-compose/ci/ollama/test/requirements.txt b/dial-docker-compose/ci/ollama/test/requirements.txt new file mode 100644 index 00000000..1c6d30a5 --- /dev/null +++ b/dial-docker-compose/ci/ollama/test/requirements.txt @@ -0,0 +1,2 @@ +aiohttp==3.9.4 +backoff==2.2.1 \ No newline at end of file diff --git a/dial-docker-compose/common.yml b/dial-docker-compose/common.yml index 6da6adbb..8e2ae74a 100644 --- a/dial-docker-compose/common.yml +++ b/dial-docker-compose/common.yml @@ -1,13 +1,13 @@ services: themes: - image: epam/ai-dial-chat-themes:0.4.0 + image: epam/ai-dial-chat-themes:0.6.0 ports: - "3001:8080" chat: ports: - "3000:3000" - image: epam/ai-dial-chat:0.10.0 + image: epam/ai-dial-chat:0.17.0 depends_on: - themes - core @@ -36,7 +36,7 @@ services: user: ${UID:-root} ports: - "8080:8080" - image: epam/ai-dial-core:0.9.0 + image: epam/ai-dial-core:0.16.0 environment: 'AIDIAL_SETTINGS': '/opt/settings/settings.json' 'JAVA_OPTS': '-Dgflog.config=/opt/settings/gflog.xml' diff --git a/dial-docker-compose/model/docker-compose.yml b/dial-docker-compose/model/docker-compose.yml index 1dd03ae0..a918cb85 100644 --- a/dial-docker-compose/model/docker-compose.yml +++ b/dial-docker-compose/model/docker-compose.yml @@ -4,6 +4,6 @@ include: services: adapter-openai: - image: epam/ai-dial-adapter-openai:0.11.0 + image: epam/ai-dial-adapter-openai:0.14.0 environment: WEB_CONCURRENCY: "3" \ No newline at end of file diff --git a/dial-docker-compose/ollama/.env b/dial-docker-compose/ollama/.env new file mode 100644 index 00000000..4daa1416 --- /dev/null +++ b/dial-docker-compose/ollama/.env @@ -0,0 +1 @@ +DIAL_DIR="./ollama" \ No newline at end of file diff --git a/dial-docker-compose/ollama/core/config.json b/dial-docker-compose/ollama/core/config.json new file mode 100644 index 00000000..10b0d035 --- /dev/null +++ b/dial-docker-compose/ollama/core/config.json @@ -0,0 +1,23 @@ +{ + "routes": {}, + "models": { + "llama3:8b": { + "type": "chat", + "displayName": "Llama3 8B (Locally hosted)", + "endpoint": "http://ollama:11434/v1/chat/completions" + } + }, + "keys": { + "dial_api_key": { + "project": "TEST-PROJECT", + "role": "default" + } + }, + "roles": { + "default": { + "limits": { + "llama3:8b": {} + } + } + } +} diff --git a/dial-docker-compose/ollama/docker-compose.yml b/dial-docker-compose/ollama/docker-compose.yml new file mode 100644 index 00000000..418406d8 --- /dev/null +++ b/dial-docker-compose/ollama/docker-compose.yml @@ -0,0 +1,30 @@ +include: + - path: ../common.yml + env_file: ./.env + +services: + ollama: + image: ollama/ollama:0.3.10 + volumes: + - ./.ollama:/root/.ollama + ports: + - "11434:11434" + + ollama-setup: + depends_on: + ollama: + condition: service_started + image: alpine:3.20.3 + environment: + - OLLAMA_URL=http://ollama:11434 + - OLLAMA_ENABLED=${OLLAMA_ENABLED:-1} + - OLLAMA_MODEL=${OLLAMA_MODEL:-llama3:8b-instruct-q2_K} + - OLLAMA_MODEL_ALIAS=${OLLAMA_MODEL_ALIAS:-llama3:8b} + volumes: + - ./ollama_setup.sh:/setup.sh + command: sh /setup.sh + healthcheck: + test: ["CMD", "test", "-f", "/healthy"] + interval: 10s + start_period: 10s + retries: 10 diff --git a/dial-docker-compose/ollama/ollama_setup.sh b/dial-docker-compose/ollama/ollama_setup.sh new file mode 100755 index 00000000..5bb8abb4 --- /dev/null +++ b/dial-docker-compose/ollama/ollama_setup.sh @@ -0,0 +1,26 @@ +#!/bin/sh +set -e + 
+if [ "$OLLAMA_ENABLED" -eq 1 ]; then + apk add --no-cache curl + + until curl -s ${OLLAMA_URL}; do + echo "Waiting for Ollama..." + sleep 5 + done + + echo "Pulling $OLLAMA_URL..." + curl -vL --fail-with-body "$OLLAMA_URL/api/pull" -d "{\"name\": \"$OLLAMA_MODEL\", \"stream\": false}" + + echo "Making alias for $OLLAMA_URL: $OLLAMA_MODEL_ALIAS..." + curl -vL --fail-with-body "$OLLAMA_URL/api/copy" -d "{\"source\": \"$OLLAMA_MODEL\", \"destination\": \"$OLLAMA_MODEL_ALIAS\"}" + + echo "Loading the model into memory..." + curl -vL --fail-with-body "$OLLAMA_URL/api/generate" -d "{\"model\": \"$OLLAMA_MODEL_ALIAS\"}" +else + echo "Ollama is disabled" +fi + +touch /healthy + +tail -f /dev/null \ No newline at end of file diff --git a/dial-docker-compose/settings/settings.json b/dial-docker-compose/settings/settings.json index 446b1b6b..0471610d 100644 --- a/dial-docker-compose/settings/settings.json +++ b/dial-docker-compose/settings/settings.json @@ -9,7 +9,7 @@ } }, "encryption": { - "salt": "salt", - "password": "password" + "secret": "salt", + "key": "password" } } \ No newline at end of file From 79af610515ce1eaf19bc668024ed14d31859b3e2 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Mon, 16 Sep 2024 12:16:02 +0000 Subject: [PATCH 02/16] feat: added quick-start tutorial for self-hosted models --- README.md | 5 +-- dial-docker-compose/ollama/core/config.json | 6 ++-- dial-docker-compose/ollama/docker-compose.yml | 2 -- dial-docker-compose/ollama/ollama_setup.sh | 28 ++++++++--------- docs/README.md | 1 + .../quick-start-with-self-hosted-model.md | 31 +++++++++++++++++++ sidebars.js | 9 ++++-- 7 files changed, 58 insertions(+), 24 deletions(-) create mode 100644 docs/tutorials/quick-start-with-self-hosted-model.md diff --git a/README.md b/README.md index c169e8f2..db8095ca 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,11 @@ ## Helm Deployment * [AI DIAL Generic Installation Simple Guide](https://github.com/epam/ai-dial-helm/tree/main/charts/dial/examples/generic/simple) - + ## Tutorials * [Launch AI DIAL Chat with an Azure model](./docs/tutorials/quick-start-model.md) +* [Launch AI DIAL Chat with a self-hosted model](./docs/tutorials/quick-start-with-self-hosted-model.md) * [Launch AI DIAL Chat with a Sample Application](./docs/tutorials/quick-start-with-application.md) * [Launch AI DIAL Chat with a Sample Addon](./docs/tutorials/quick-start-with-addon.md) @@ -31,7 +32,7 @@ ## Configuration * Refer to [Configuration](./docs/Deployment/configuration.md) - + ## Other AI DIAL Project Open Source Repositories Here is the current list of repositories where you can find more details. You can also refer to [repository map](https://epam-rail.com/open-source). 
diff --git a/dial-docker-compose/ollama/core/config.json b/dial-docker-compose/ollama/core/config.json index 10b0d035..1a83c364 100644 --- a/dial-docker-compose/ollama/core/config.json +++ b/dial-docker-compose/ollama/core/config.json @@ -1,9 +1,9 @@ { "routes": {}, "models": { - "llama3:8b": { + "ollama-model": { "type": "chat", - "displayName": "Llama3 8B (Locally hosted)", + "displayName": "Self-hosted model", "endpoint": "http://ollama:11434/v1/chat/completions" } }, @@ -16,7 +16,7 @@ "roles": { "default": { "limits": { - "llama3:8b": {} + "ollama-model": {} } } } diff --git a/dial-docker-compose/ollama/docker-compose.yml b/dial-docker-compose/ollama/docker-compose.yml index 418406d8..cbbc914e 100644 --- a/dial-docker-compose/ollama/docker-compose.yml +++ b/dial-docker-compose/ollama/docker-compose.yml @@ -17,9 +17,7 @@ services: image: alpine:3.20.3 environment: - OLLAMA_URL=http://ollama:11434 - - OLLAMA_ENABLED=${OLLAMA_ENABLED:-1} - OLLAMA_MODEL=${OLLAMA_MODEL:-llama3:8b-instruct-q2_K} - - OLLAMA_MODEL_ALIAS=${OLLAMA_MODEL_ALIAS:-llama3:8b} volumes: - ./ollama_setup.sh:/setup.sh command: sh /setup.sh diff --git a/dial-docker-compose/ollama/ollama_setup.sh b/dial-docker-compose/ollama/ollama_setup.sh index 5bb8abb4..d45c4c89 100755 --- a/dial-docker-compose/ollama/ollama_setup.sh +++ b/dial-docker-compose/ollama/ollama_setup.sh @@ -1,25 +1,23 @@ #!/bin/sh set -e -if [ "$OLLAMA_ENABLED" -eq 1 ]; then - apk add --no-cache curl +apk add --no-cache curl - until curl -s ${OLLAMA_URL}; do - echo "Waiting for Ollama..." - sleep 5 - done +until curl -s ${OLLAMA_URL}; do + echo "Waiting for Ollama..." + sleep 5 +done - echo "Pulling $OLLAMA_URL..." - curl -vL --fail-with-body "$OLLAMA_URL/api/pull" -d "{\"name\": \"$OLLAMA_MODEL\", \"stream\": false}" +echo "Pulling $OLLAMA_URL..." +curl -vL --fail-with-body "$OLLAMA_URL/api/pull" -d "{\"name\": \"$OLLAMA_MODEL\", \"stream\": false}" - echo "Making alias for $OLLAMA_URL: $OLLAMA_MODEL_ALIAS..." - curl -vL --fail-with-body "$OLLAMA_URL/api/copy" -d "{\"source\": \"$OLLAMA_MODEL\", \"destination\": \"$OLLAMA_MODEL_ALIAS\"}" +OLLAMA_MODEL_ALIAS=ollama-model - echo "Loading the model into memory..." - curl -vL --fail-with-body "$OLLAMA_URL/api/generate" -d "{\"model\": \"$OLLAMA_MODEL_ALIAS\"}" -else - echo "Ollama is disabled" -fi +echo "Making alias for $OLLAMA_URL: $OLLAMA_MODEL_ALIAS..." +curl -vL --fail-with-body "$OLLAMA_URL/api/copy" -d "{\"source\": \"$OLLAMA_MODEL\", \"destination\": \"$OLLAMA_MODEL_ALIAS\"}" + +echo "Loading the model into memory..." 
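+# Ollama loads a model into memory without generating anything when /api/generate
+# is called with only a model name and no prompt, so this acts as a warm-up request.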
+curl -vL --fail-with-body "$OLLAMA_URL/api/generate" -d "{\"model\": \"$OLLAMA_MODEL_ALIAS\"}" touch /healthy diff --git a/docs/README.md b/docs/README.md index 845f2859..de9d5580 100644 --- a/docs/README.md +++ b/docs/README.md @@ -21,6 +21,7 @@ ## Tutorials * [Launch AI DIAL Chat with an Azure model](./tutorials/quick-start-model.md) +* [Launch AI DIAL Chat with a self-hosted model](./tutorials/quick-start-with-self-hosted-model.md) * [Launch AI DIAL Chat with a Sample Application](./tutorials/quick-start-with-application.md) * [Launch AI DIAL Chat with a Sample Addon](./tutorials/quick-start-with-addon.md) diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md new file mode 100644 index 00000000..098d2a4a --- /dev/null +++ b/docs/tutorials/quick-start-with-self-hosted-model.md @@ -0,0 +1,31 @@ +# Launch AI DIAL Chat with a self-hosted model + +## Introduction + +In this tutorial, you will learn how to quickly launch AI DIAL Chat with a self-hosted model powered by [Ollama](https://ollama.com/). + +## Prerequisites + +Docker engine installed on your machine (Docker Compose Version 2.20.0 +). + +> Refer to [Docker](https://docs.docker.com/desktop/) documentation. + +## Step 1: Get AI DIAL + +[Download](https://github.com/epam/ai-dial/tree/main/dial-docker-compose/ollama/) AI DIAL. + +## Step 2: Launch AI DIAL Chat + +Run `docker compose up` from the folder with the [docker-compose file](https://github.com/epam/ai-dial/blob/main/dial-docker-compose/ollama/docker-compose.yml). + +By default a lightweight [llama3:8b-instruct-q2_K](https://ollama.com/library/llama3:8b-instruct-q2_K) model will be pulled and loaded into the memory of the Ollama server automatically. + +You could specify the model via the environment variable `OLLAMA_MODEL`: + +```sh +OLLAMA_MODEL=model_of_your_choice docker compose up +``` + +Find the available models at the [Ollama model library](https://ollama.com/library). + +Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and chat with the model. \ No newline at end of file diff --git a/sidebars.js b/sidebars.js index 84d90389..9314a6c5 100644 --- a/sidebars.js +++ b/sidebars.js @@ -102,6 +102,11 @@ const sidebars = { id: 'tutorials/quick-start-model', label: 'Chat with OpenAI Model', }, + { + type: 'doc', + id: 'tutorials/quick-start-with-self-hosted-model', + label: 'Chat with a self-hosted model', + }, { type: 'doc', id: 'tutorials/quick-start-with-addon', @@ -182,13 +187,13 @@ const sidebars = { Demos: [ { type: 'autogenerated', - dirName: 'video demos/demos', + dirName: 'video demos/demos', }, ], 'Demos For Developers': [ { type: 'autogenerated', - dirName: 'video demos/demos-for-developers', + dirName: 'video demos/demos-for-developers', }, ], }, From cc717ff2595891aa1a5e24e95a537c052a992a6e Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Mon, 16 Sep 2024 12:19:40 +0000 Subject: [PATCH 03/16] chore: formatting fix --- .../quick-start-with-self-hosted-model.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md index 098d2a4a..7c9bdce3 100644 --- a/docs/tutorials/quick-start-with-self-hosted-model.md +++ b/docs/tutorials/quick-start-with-self-hosted-model.md @@ -18,14 +18,14 @@ Docker engine installed on your machine (Docker Compose Version 2.20.0 +). 
Run `docker compose up` from the folder with the [docker-compose file](https://github.com/epam/ai-dial/blob/main/dial-docker-compose/ollama/docker-compose.yml). -By default a lightweight [llama3:8b-instruct-q2_K](https://ollama.com/library/llama3:8b-instruct-q2_K) model will be pulled and loaded into the memory of the Ollama server automatically. - -You could specify the model via the environment variable `OLLAMA_MODEL`: - -```sh -OLLAMA_MODEL=model_of_your_choice docker compose up -``` - -Find the available models at the [Ollama model library](https://ollama.com/library). +> By default a lightweight [llama3:8b-instruct-q2_K](https://ollama.com/library/llama3:8b-instruct-q2_K) model will be pulled and loaded into the memory of the Ollama server automatically. +> +> You could specify the model via the environment variable `OLLAMA_MODEL`: +> +> ```sh +> OLLAMA_MODEL=model_of_your_choice docker compose up +> ``` +> +> Find the available models at the [Ollama model library](https://ollama.com/library). Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and chat with the model. \ No newline at end of file From a59961b9498e1da5a3fd959f6f81c624c8004e3a Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Mon, 16 Sep 2024 12:44:58 +0000 Subject: [PATCH 04/16] fix: fixed ci test for ollama model --- dial-docker-compose/ci/ollama/docker-compose.yml | 1 - dial-docker-compose/ci/ollama/test/app.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/dial-docker-compose/ci/ollama/docker-compose.yml b/dial-docker-compose/ci/ollama/docker-compose.yml index b209b73f..aff7e6cc 100644 --- a/dial-docker-compose/ci/ollama/docker-compose.yml +++ b/dial-docker-compose/ci/ollama/docker-compose.yml @@ -9,7 +9,6 @@ services: DIAL_URL: "http://core:8080" DIAL_API_KEY: "dial_api_key" DIAL_API_VERSION: "2024-02-01" - DIAL_DEPLOYMENT: "llama3:8b" depends_on: ollama-setup: condition: service_healthy diff --git a/dial-docker-compose/ci/ollama/test/app.py b/dial-docker-compose/ci/ollama/test/app.py index 95460af4..d3c133d1 100644 --- a/dial-docker-compose/ci/ollama/test/app.py +++ b/dial-docker-compose/ci/ollama/test/app.py @@ -16,7 +16,6 @@ def get_env(name: str) -> str: DIAL_URL = get_env("DIAL_URL") DIAL_API_KEY = get_env("DIAL_API_KEY") DIAL_API_VERSION = get_env("DIAL_API_VERSION") -DIAL_DEPLOYMENT = get_env("DIAL_DEPLOYMENT") logging.basicConfig(level=logging.DEBUG) log = logging.getLogger(__name__) @@ -58,7 +57,7 @@ async def test_model(deployment_id: str): async def tests(): - await test_model(DIAL_DEPLOYMENT) + await test_model("ollama-model") if __name__ == "__main__": From 20008a41a275395b303df60a7f5cc8af0f1646fc Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Mon, 16 Sep 2024 16:56:54 +0000 Subject: [PATCH 05/16] chore: bumped the default Ollama model to Llama 3.1 --- dial-docker-compose/ollama/docker-compose.yml | 2 +- docs/tutorials/quick-start-with-self-hosted-model.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dial-docker-compose/ollama/docker-compose.yml b/dial-docker-compose/ollama/docker-compose.yml index cbbc914e..7e1e8ab2 100644 --- a/dial-docker-compose/ollama/docker-compose.yml +++ b/dial-docker-compose/ollama/docker-compose.yml @@ -17,7 +17,7 @@ services: image: alpine:3.20.3 environment: - OLLAMA_URL=http://ollama:11434 - - OLLAMA_MODEL=${OLLAMA_MODEL:-llama3:8b-instruct-q2_K} + - OLLAMA_MODEL=${OLLAMA_MODEL:-llama3.1:8b-instruct-q4_K_M} volumes: - ./ollama_setup.sh:/setup.sh command: sh /setup.sh diff --git 
a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md index 7c9bdce3..92282fb7 100644 --- a/docs/tutorials/quick-start-with-self-hosted-model.md +++ b/docs/tutorials/quick-start-with-self-hosted-model.md @@ -18,7 +18,7 @@ Docker engine installed on your machine (Docker Compose Version 2.20.0 +). Run `docker compose up` from the folder with the [docker-compose file](https://github.com/epam/ai-dial/blob/main/dial-docker-compose/ollama/docker-compose.yml). -> By default a lightweight [llama3:8b-instruct-q2_K](https://ollama.com/library/llama3:8b-instruct-q2_K) model will be pulled and loaded into the memory of the Ollama server automatically. +> By default [llama3.1:8b-instruct-q4_K_M](https://ollama.com/library/llama3.1:8b-instruct-q4_K_M) model will be pulled and loaded into the memory of the Ollama server automatically. > > You could specify the model via the environment variable `OLLAMA_MODEL`: > From 626b1d634dcaffa821e0a28c7b068a08297b024d Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Tue, 17 Sep 2024 09:25:52 +0000 Subject: [PATCH 06/16] chore: fixed doc --- docs/tutorials/quick-start-with-self-hosted-model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md index 92282fb7..260b1dce 100644 --- a/docs/tutorials/quick-start-with-self-hosted-model.md +++ b/docs/tutorials/quick-start-with-self-hosted-model.md @@ -18,7 +18,7 @@ Docker engine installed on your machine (Docker Compose Version 2.20.0 +). Run `docker compose up` from the folder with the [docker-compose file](https://github.com/epam/ai-dial/blob/main/dial-docker-compose/ollama/docker-compose.yml). -> By default [llama3.1:8b-instruct-q4_K_M](https://ollama.com/library/llama3.1:8b-instruct-q4_K_M) model will be pulled and loaded into the memory of the Ollama server automatically. +> By default [llama3.1:8b-instruct-q4_K_M](https://ollama.com/library/llama3.1:8b-instruct-q4_K_M) model will be pulled and loaded into the memory of the Ollama server automatically. This is going to take around a minute on the first run. 
> > You could specify the model via the environment variable `OLLAMA_MODEL`: > From c51418d48c94258cad30ecb0a723d604ce08cf1b Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Tue, 17 Sep 2024 17:25:00 +0000 Subject: [PATCH 07/16] feat: added instructions for self-hosted vision models --- dial-docker-compose/ci/ollama/test/app.py | 59 ++++++++++-- dial-docker-compose/ci/ollama/test/image.png | Bin 0 -> 1976 bytes dial-docker-compose/ollama/core/config.json | 24 ++++- dial-docker-compose/ollama/docker-compose.yml | 16 +++- dial-docker-compose/ollama/ollama_setup.sh | 24 ----- .../ollama/ollama_setup/Dockerfile | 4 + .../ollama/ollama_setup/setup.py | 87 ++++++++++++++++++ .../quick-start-with-self-hosted-model.md | 63 ++++++++++--- 8 files changed, 225 insertions(+), 52 deletions(-) create mode 100644 dial-docker-compose/ci/ollama/test/image.png delete mode 100755 dial-docker-compose/ollama/ollama_setup.sh create mode 100644 dial-docker-compose/ollama/ollama_setup/Dockerfile create mode 100755 dial-docker-compose/ollama/ollama_setup/setup.py diff --git a/dial-docker-compose/ci/ollama/test/app.py b/dial-docker-compose/ci/ollama/test/app.py index d3c133d1..6f805f43 100644 --- a/dial-docker-compose/ci/ollama/test/app.py +++ b/dial-docker-compose/ci/ollama/test/app.py @@ -1,9 +1,13 @@ +import base64 import os +from pathlib import Path import aiohttp import asyncio import backoff import logging +import time +from contextlib import asynccontextmanager def get_env(name: str) -> str: @@ -21,6 +25,15 @@ def get_env(name: str) -> str: log = logging.getLogger(__name__) +@asynccontextmanager +async def timer(name: str): + log.debug(f"[{name}] Starting...") + start = time.perf_counter() + yield + elapsed = time.perf_counter() - start + log.debug(f"[{name}] Executed in {elapsed:.2f} seconds") + + @backoff.on_exception( backoff.expo, (aiohttp.ClientError, aiohttp.ServerTimeoutError), @@ -35,13 +48,15 @@ async def post_with_retry(url: str, payload: dict, headers: dict, params: dict): return await response.json() -async def test_model(deployment_id: str): +def read_image_base64(png_file: Path) -> str: + return base64.b64encode(png_file.read_bytes()).decode("utf-8") + +async def dial_chat_completion(deployment_id: str, messages: list) -> str: api_url = f"{DIAL_URL}/openai/deployments/{deployment_id}/chat/completions" - message = "12 + 23 = ? Reply with a single number:" payload = { "model": deployment_id, - "messages": [{"role": "user", "content": message}], + "messages": messages, "stream": False, } headers = {"api-key": DIAL_API_KEY} @@ -52,12 +67,44 @@ async def test_model(deployment_id: str): content = body.get("choices", [])[0].get("message", {}).get("content", "") - if "35" not in content: - raise ValueError(f"Test failed for {deployment_id!r}. ") + log.debug(f"Content: {content}") + + return content + +async def test_chat_model(deployment_id: str): + message = "2 + 3 = ? 
Reply with a single number:" + messages = [{"role": "user", "content": message}] + content = await dial_chat_completion(deployment_id, messages) + + if "5" not in content: + raise ValueError(f"Test failed for {deployment_id!r}") +async def test_vision_model(deployment_id: str): + base64_data = read_image_base64(Path("./image.png")) + base64_image = f"data:image/png;base64,{base64_data}" + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe the image"}, + {"type": "image_url", "image_url": {"url": base64_image}}, + ], + } + ] + + content = await dial_chat_completion(deployment_id, messages) + + if "vision" not in content.lower(): + raise ValueError(f"Test failed for {deployment_id!r}") + async def tests(): - await test_model("ollama-model") + async with timer("Testing chat-model"): + await test_chat_model("chat-model") + + async with timer("Testing vision-model"): + await test_vision_model("vision-model") if __name__ == "__main__": diff --git a/dial-docker-compose/ci/ollama/test/image.png b/dial-docker-compose/ci/ollama/test/image.png new file mode 100644 index 0000000000000000000000000000000000000000..bfbcfc7ed9307e4b17e48afc36567949da21c92b GIT binary patch literal 1976 zcmV;p2S@mcP)o5Zh(U2dqlUP~xS=r`)VNh}NsK(WMvcC>CTffi8Xt)90r!AL zj2fe2lt>5|t3ecT!7WjuAVMn$sC@qe_wQ}T>7ACJ+M)lP%){6FoqOi?&gs1~cW!k= z1PvNG8h+kxfCk@$W-JXJfo3cX9)V^o4IY7JEDaukW-JXJfo3cX9)V^o4IY7JEDat( zfVUgq0kpK>^l1QCz8ryp>ecjZkndssd;$od1q)QI>C@1W@4YKD8qv_e0;^TMnm!F* zyvLe0jRn)Et6I~i;fo_yb2Cnzf=-9!%V996SJS8Ai^kHR0{G%os6hqr#pMPU@%b}~ zi}C(FtX2dEBPt5x#-Vrb4p4sc24*v=t5H{n&`?A~AUPSavF(wnuSZc4UcZLLg5Y2b z9Efq_5FXwEjr26U{8TDIN(y_b@7k3t7G?gPJ&do@(c8CD5oh17tc+4q$ z8!J7Xji8GcQ(hjK&2;xJ?b$=UdU0Apf@u8nHdJOTBFe~M9TyNlU%rT#J9aQ;NC-7H z@^XO*OeUVn&6Q@iZ|B(w6C6?kO^bzMV|jMdCTeb$a+;fI(UCODdau^r0vIO1ZU8v7k_wVzLJAPcm95|5C)~N z1qDuN;y04ryR)FBMfr^0g13iECrC|Yqy4a9GTDLxmeuQJZtw+D;nSzor3=sc`_tUH zbnYD0)F`yanPx%)E9}w3)vp#F&L8XSVbc!?pq3Vq!q~BlsnfZ7(!UN>j##i8he}IP zUM|hr=D|605E0>qY+@p=UPaHI?Bnm>xN-$cmY{cUj31AqN8O%Ws;gNL8R=@3K7CkF zQzK94-yeFtNTF>w2cY8jylOR;y<;S^;jdq~c#*|!&4X>2Iu+&R*tii!BQN~@8-;~f zzaD-2V*7S{|L&@q#lix6H8FyNSx{fEK! 
zMP;Qjzp|3w#@XunB2;WF*jXnh2d%9DaN`E*>Hr`m1@ZCjsXcy-!a@`lqNqroFlG$0 zviQgRd}Yl^lX!gTk}}_RcL-qOL}h;Gg|{C|g{Z5eu3g!qb$&h((V|7{<(2oSC`JwnqOV^avNnCe`v!4i)C6^pSRo;p zH;={V&!e#s*RBD8(TIf$-Bz1AmB;oYlRau{dF;x5y6|v*FZljFwr&-Xw{GPx5>~J7 z9Q*a!4R1SJ3HkZ#rx_Yb+1V`4$PiU=_Jgjkr|#YP%Icqc2U=Tc%^F_!*fC|zHrbdM zp53xVes{y>&72bxL-qAij&F@v>TMZAYb(EOw%thg?~AGk`rUpbp`RZuTSk{Ik=aZI z1(cOV$;mt$73Fa4#n-Z=#9?3Fph2{MKi$4fW;5NsP5bsa?8gcXrIHdE5zSbC8@qNn zy!nrhSKy2L{lEd4P05m@Uq31>bwiVgN=y091;G&=EmgL`w~VFkd#pC3r}Gsi*zJ92 zAGU2naWQ7jlwU6D)eGCUp{xu^N$$xeC844MhYmUY(?o164jn>8h1<%V6aVA?6;@ZH zqy%r@!eT*SAR;0#bSQ=o?|}8m*RS#Dk;A`qMvg>WTt_sz)8HR%fN8`vOX}MI0000< KMNUMnLSTX| literal 0 HcmV?d00001 diff --git a/dial-docker-compose/ollama/core/config.json b/dial-docker-compose/ollama/core/config.json index 1a83c364..4a55bd1e 100644 --- a/dial-docker-compose/ollama/core/config.json +++ b/dial-docker-compose/ollama/core/config.json @@ -1,10 +1,25 @@ { "routes": {}, "models": { - "ollama-model": { + "chat-model": { "type": "chat", - "displayName": "Self-hosted model", + "displayName": "Self-hosted chat model", "endpoint": "http://ollama:11434/v1/chat/completions" + }, + "vision-model": { + "type": "chat", + "displayName": "Self-hosted vision model", + "endpoint": "http://adapter-openai:5000/openai/deployments/vision-model/chat/completions", + "inputAttachmentTypes": [ + "image/png", + "image/jpeg" + ], + "upstreams": [ + { + "endpoint": "http://ollama:11434/v1/chat/completions", + "key": "dummy-key" + } + ] } }, "keys": { @@ -16,8 +31,9 @@ "roles": { "default": { "limits": { - "ollama-model": {} + "chat-model": {}, + "vision-model": {} } } } -} +} \ No newline at end of file diff --git a/dial-docker-compose/ollama/docker-compose.yml b/dial-docker-compose/ollama/docker-compose.yml index 7e1e8ab2..9712aa6a 100644 --- a/dial-docker-compose/ollama/docker-compose.yml +++ b/dial-docker-compose/ollama/docker-compose.yml @@ -14,15 +14,21 @@ services: depends_on: ollama: condition: service_started - image: alpine:3.20.3 + build: ./ollama_setup environment: - OLLAMA_URL=http://ollama:11434 - - OLLAMA_MODEL=${OLLAMA_MODEL:-llama3.1:8b-instruct-q4_K_M} - volumes: - - ./ollama_setup.sh:/setup.sh - command: sh /setup.sh + - OLLAMA_CHAT_MODEL=${OLLAMA_CHAT_MODEL} + - OLLAMA_VISION_MODEL=${OLLAMA_VISION_MODEL} healthcheck: test: ["CMD", "test", "-f", "/healthy"] interval: 10s start_period: 10s retries: 10 + + adapter-openai: + image: epam/ai-dial-adapter-openai:0.14.0 + environment: + WEB_CONCURRENCY: "3" + DIAL_URL: "http://core:8080" + DIAL_USE_FILE_STORAGE: "True" + GPT4_VISION_DEPLOYMENTS: "vision-model" \ No newline at end of file diff --git a/dial-docker-compose/ollama/ollama_setup.sh b/dial-docker-compose/ollama/ollama_setup.sh deleted file mode 100755 index d45c4c89..00000000 --- a/dial-docker-compose/ollama/ollama_setup.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -set -e - -apk add --no-cache curl - -until curl -s ${OLLAMA_URL}; do - echo "Waiting for Ollama..." - sleep 5 -done - -echo "Pulling $OLLAMA_URL..." -curl -vL --fail-with-body "$OLLAMA_URL/api/pull" -d "{\"name\": \"$OLLAMA_MODEL\", \"stream\": false}" - -OLLAMA_MODEL_ALIAS=ollama-model - -echo "Making alias for $OLLAMA_URL: $OLLAMA_MODEL_ALIAS..." -curl -vL --fail-with-body "$OLLAMA_URL/api/copy" -d "{\"source\": \"$OLLAMA_MODEL\", \"destination\": \"$OLLAMA_MODEL_ALIAS\"}" - -echo "Loading the model into memory..." 
-curl -vL --fail-with-body "$OLLAMA_URL/api/generate" -d "{\"model\": \"$OLLAMA_MODEL_ALIAS\"}" - -touch /healthy - -tail -f /dev/null \ No newline at end of file diff --git a/dial-docker-compose/ollama/ollama_setup/Dockerfile b/dial-docker-compose/ollama/ollama_setup/Dockerfile new file mode 100644 index 00000000..3afdb789 --- /dev/null +++ b/dial-docker-compose/ollama/ollama_setup/Dockerfile @@ -0,0 +1,4 @@ +FROM python:3.11-alpine +RUN pip install requests +COPY setup.py /setup.py +CMD ["sh", "-c", "python /setup.py && tail -f /dev/null"] diff --git a/dial-docker-compose/ollama/ollama_setup/setup.py b/dial-docker-compose/ollama/ollama_setup/setup.py new file mode 100755 index 00000000..85e26858 --- /dev/null +++ b/dial-docker-compose/ollama/ollama_setup/setup.py @@ -0,0 +1,87 @@ +import asyncio +from contextlib import asynccontextmanager +import logging +import os +import time +import requests + +OLLAMA_URL = os.getenv("OLLAMA_URL") +if OLLAMA_URL is None: + raise RuntimeError("OLLAMA_URL env var isn't set") + +OLLAMA_CHAT_MODEL = os.getenv("OLLAMA_CHAT_MODEL") +OLLAMA_VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL") + +HEALTH_FILE = "/healthy" + +log = logging.getLogger(__name__) +log.setLevel(logging.INFO) + +log.info(f"OLLAMA_URL = {OLLAMA_URL}") +log.info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}") +log.info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}") + + +@asynccontextmanager +async def timer(name: str): + log.info(f"[{name}] Starting...") + start = time.perf_counter() + yield + elapsed = time.perf_counter() - start + log.info(f"[{name}] Executed in {elapsed:.2f} seconds") + + +async def wait_for_ollama(): + while True: + try: + if requests.get(OLLAMA_URL).ok: + break + except requests.RequestException: + pass + await asyncio.sleep(1) + + +async def pull_model(model): + data = {"name": model, "stream": False} + requests.post(f"{OLLAMA_URL}/api/pull", json=data).raise_for_status() + + +async def create_alias(source, dest): + data = {"source": source, "destination": dest} + requests.post(f"{OLLAMA_URL}/api/copy", json=data).raise_for_status() + + +async def load_model(model): + data = {"model": model} + requests.post(f"{OLLAMA_URL}/api/generate", json=data).raise_for_status() + + +async def mark_as_healthy(): + open(HEALTH_FILE, "w").close() + + +async def main(): + async with timer("Waiting for Ollama to start"): + await wait_for_ollama() + + for model, alias in [ + (OLLAMA_CHAT_MODEL, "chat-model"), + (OLLAMA_VISION_MODEL, "vision-model"), + ]: + if model: + async with timer(f"Pulling model {model}"): + await pull_model(model) + + async with timer(f"Creating alias for {model}: {alias}"): + await create_alias(model, alias) + + if model_to_load := OLLAMA_CHAT_MODEL or OLLAMA_VISION_MODEL: + async with timer(f"Loading model {model_to_load} into memory"): + await load_model(model_to_load) + + await mark_as_healthy() + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main()) diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md index 260b1dce..c850eec3 100644 --- a/docs/tutorials/quick-start-with-self-hosted-model.md +++ b/docs/tutorials/quick-start-with-self-hosted-model.md @@ -12,20 +12,57 @@ Docker engine installed on your machine (Docker Compose Version 2.20.0 +). ## Step 1: Get AI DIAL -[Download](https://github.com/epam/ai-dial/tree/main/dial-docker-compose/ollama/) AI DIAL. 
+Clone [the repository](https://github.com/epam/ai-dial/) with the tutorials and change directory to the following folder: -## Step 2: Launch AI DIAL Chat +```sh +cd dial-docker-compose/ollama +``` -Run `docker compose up` from the folder with the [docker-compose file](https://github.com/epam/ai-dial/blob/main/dial-docker-compose/ollama/docker-compose.yml). +## Step 2: Choose a model to run -> By default [llama3.1:8b-instruct-q4_K_M](https://ollama.com/library/llama3.1:8b-instruct-q4_K_M) model will be pulled and loaded into the memory of the Ollama server automatically. This is going to take around a minute on the first run. -> -> You could specify the model via the environment variable `OLLAMA_MODEL`: -> -> ```sh -> OLLAMA_MODEL=model_of_your_choice docker compose up -> ``` -> -> Find the available models at the [Ollama model library](https://ollama.com/library). +Ollama supports a wide range of popular open-source models. -Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and chat with the model. \ No newline at end of file +Consider first the modality your interested in - is a regular text-to-text chat model or a multi-modal vision model? + +Follow the feature tags at https://ollama.com/search to find an appropriate model, e.g. one working with `Code`, supporting `Tools` or a `Vision` feature. + +We recommend to choose one of the following models which have been tested: + +|Model|Vision|Tools|Streaming| +|----|----|----|----| +|[llama3.1:8b-instruct-q4_0](https://ollama.com/library/llama3.1:8b-instruct-q4_0)|❌|✅ *(only in non-streaming mode)*|✅| +|[mistral:7b-instruct-q4_0](https://ollama.com/library/mistral:7b-instruct-q4_0)|❌|❌|✅| +|[phi3.5:3.8b-mini-instruct-q4_0](https://ollama.com/library/phi3.5:3.8b-mini-instruct-q4_0)|❌|❌|✅| +|[gemma2:2b-instruct-q4_0](https://ollama.com/library/gemma2:2b-instruct-q4_0)|❌|❌|✅| +|[llava:7b-v1.6-mistral-q4_0](https://ollama.com/library/llava:7b-v1.6-mistral-q4_0)|✅|❌|❌| +|[llava-phi3:3.8b-mini-q4_0](https://ollama.com/library/llava-phi3:3.8b-mini-q4_0)|✅|❌|❌| + +All the listed models support streaming. + +## Step 3: Launch AI DIAL Chat + +### Chat model + +If you have chosen a regular chat model _(e.g. llama3.1:8b-instruct-q4_0)_, then run the command: + +```sh +OLLAMA_CHAT_MODEL=model_of_your_choice docker compose up --abort-on-container-exit +``` + +The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run. + +Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select `Self-hosted chat model` deployment to converse with the model. + +### Vision model + +If you have chosen a vision model _(e.g. llava-phi3:3.8b-mini-q4_0)_, then run the command: + +```sh +OLLAMA_VISION_MODEL=model_of_your_choice docker compose up --abort-on-container-exit +``` + +The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run. + +Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select `Self-hosted vision model` deployment to converse with the model. + +> Note, that the vision models we tested, do not support streaming of response. Moreover, they are typically more computationally expensive than the chat models. So it may take minutes for a vision model to respond. 
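+
+Besides the chat UI, the deployments can be reached through DIAL's OpenAI-compatible API. The sketch below mirrors the request the CI test sends and assumes the defaults bundled with this setup (DIAL Core exposed on `localhost:8080` and the `dial_api_key` key from `core/config.json`):
+
+```sh
+curl "http://localhost:8080/openai/deployments/chat-model/chat/completions?api-version=2024-02-01" \
+  -H "api-key: dial_api_key" \
+  -H "Content-Type: application/json" \
+  -d '{"messages": [{"role": "user", "content": "2 + 3 = ? Reply with a single number:"}], "stream": false}'
+```
+
+The `vision-model` deployment accepts the same request shape, with image parts added to the message content.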
\ No newline at end of file From 9003333e1cb224921c35ea715e3f8dfa5ee871de Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Tue, 17 Sep 2024 17:28:16 +0000 Subject: [PATCH 08/16] fix: updated env for ollama ci test --- dial-docker-compose/ci/ollama/.env | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dial-docker-compose/ci/ollama/.env b/dial-docker-compose/ci/ollama/.env index 4daa1416..58a1c316 100644 --- a/dial-docker-compose/ci/ollama/.env +++ b/dial-docker-compose/ci/ollama/.env @@ -1 +1,3 @@ -DIAL_DIR="./ollama" \ No newline at end of file +DIAL_DIR="./ollama" +OLLAMA_CHAT_MODEL=llama3.1:8b-instruct-q4_0 +OLLAMA_VISION_MODEL=llava-phi3:3.8b-mini-q4_0 \ No newline at end of file From 4808b0958dc669ce7e18ba9c07d331445781780f Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Wed, 18 Sep 2024 08:33:01 +0000 Subject: [PATCH 09/16] chore: divided a single quickstart ci job into separate jobs --- .github/workflows/pr.yml | 32 ++++++++++++++++++++++++++------ .github/workflows/release.yml | 33 +++++++++++++++++++++++++++------ 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 06e5d3ce..0605935f 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -18,8 +18,8 @@ jobs: cwd: "./dial-cookbook/ci" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" - run-quickstart: - name: Run quickstart examples + run-quickstart-model: + name: Run quickstart model example runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -28,24 +28,44 @@ jobs: with: cwd: "./dial-docker-compose/ci/model" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + + run-quickstart-application: + name: Run quickstart application example + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 - name: Run quickstart application example with: cwd: "./dial-docker-compose/ci/application" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + + run-quickstart-addon: + name: Run quickstart addon example + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 - name: Run quickstart addon example with: cwd: "./dial-docker-compose/ci/addon" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + + run-quickstart-self-hosted-model: + name: Run quickstart self-hosted model example + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 - name: Run quickstart ollama example with: cwd: "./dial-docker-compose/ci/ollama" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" build: - needs: [run-notebooks, run-quickstart] + needs: + - run-notebooks + - run-quickstart-model + - run-quickstart-application + - run-quickstart-addon + - run-quickstart-self-hosted-model runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9b7f475e..a6677137 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -18,8 +18,9 @@ jobs: cwd: 
"./dial-cookbook/ci" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" - run-quickstart: - name: Run quickstart examples + + run-quickstart-model: + name: Run quickstart model example runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -28,24 +29,44 @@ jobs: with: cwd: "./dial-docker-compose/ci/model" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + + run-quickstart-application: + name: Run quickstart application example + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 - name: Run quickstart application example with: cwd: "./dial-docker-compose/ci/application" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + + run-quickstart-addon: + name: Run quickstart addon example + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 - name: Run quickstart addon example with: cwd: "./dial-docker-compose/ci/addon" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + + run-quickstart-self-hosted-model: + name: Run quickstart self-hosted model example + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 - name: Run quickstart ollama example with: cwd: "./dial-docker-compose/ci/ollama" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" build-and-deploy: - needs: [run-notebooks, run-quickstart] + needs: + - run-notebooks + - run-quickstart-model + - run-quickstart-application + - run-quickstart-addon + - run-quickstart-self-hosted-model runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 From 64f4da8327d251d8930ae59a0e0c6a2c66e01607 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Wed, 18 Sep 2024 08:39:38 +0000 Subject: [PATCH 10/16] chore: simplified names of the ci jobs --- .github/workflows/pr.yml | 10 +++++----- .github/workflows/release.yml | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 0605935f..86f5ceb3 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -7,7 +7,7 @@ on: jobs: run-notebooks: - name: Run example Python notebooks + name: Cookbook notebooks runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -19,7 +19,7 @@ jobs: up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" run-quickstart-model: - name: Run quickstart model example + name: Quickstart model runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -30,7 +30,7 @@ jobs: up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" run-quickstart-application: - name: Run quickstart application example + name: Quickstart application runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -40,7 +40,7 @@ jobs: up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" run-quickstart-addon: - name: Run quickstart addon example + name: Quickstart addon runs-on: ubuntu-latest steps: - 
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -50,7 +50,7 @@ jobs: up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" run-quickstart-self-hosted-model: - name: Run quickstart self-hosted model example + name: Quickstart self-hosted model runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a6677137..e3828138 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -7,7 +7,7 @@ on: jobs: run-notebooks: - name: Run example Python notebooks + name: Cookbook notebooks runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -20,7 +20,7 @@ jobs: run-quickstart-model: - name: Run quickstart model example + name: Quickstart model runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -31,7 +31,7 @@ jobs: up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" run-quickstart-application: - name: Run quickstart application example + name: Quickstart application runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -41,7 +41,7 @@ jobs: up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" run-quickstart-addon: - name: Run quickstart addon example + name: Quickstart addon runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -51,7 +51,7 @@ jobs: up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" run-quickstart-self-hosted-model: - name: Run quickstart self-hosted model example + name: Quickstart self-hosted model runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 From 11fa0ac4a7b51e6c299d1605bcd14dac59dce60b Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Wed, 18 Sep 2024 08:59:35 +0000 Subject: [PATCH 11/16] fix: migrated setup.py script for ollama to httpx --- .../ollama/ollama_setup/Dockerfile | 9 +++-- .../ollama/ollama_setup/requirements.txt | 1 + .../ollama/ollama_setup/setup.py | 36 ++++++++++--------- 3 files changed, 27 insertions(+), 19 deletions(-) create mode 100644 dial-docker-compose/ollama/ollama_setup/requirements.txt diff --git a/dial-docker-compose/ollama/ollama_setup/Dockerfile b/dial-docker-compose/ollama/ollama_setup/Dockerfile index 3afdb789..91b223bf 100644 --- a/dial-docker-compose/ollama/ollama_setup/Dockerfile +++ b/dial-docker-compose/ollama/ollama_setup/Dockerfile @@ -1,4 +1,7 @@ FROM python:3.11-alpine -RUN pip install requests -COPY setup.py /setup.py -CMD ["sh", "-c", "python /setup.py && tail -f /dev/null"] + +WORKDIR /app +COPY * /app +RUN pip install -r requirements.txt + +CMD ["sh", "-c", "python setup.py && tail -f /dev/null"] diff --git a/dial-docker-compose/ollama/ollama_setup/requirements.txt b/dial-docker-compose/ollama/ollama_setup/requirements.txt new file mode 100644 index 00000000..0329bccd --- /dev/null +++ b/dial-docker-compose/ollama/ollama_setup/requirements.txt @@ -0,0 +1 @@ +httpx==0.27.2 \ No newline at end of file diff --git a/dial-docker-compose/ollama/ollama_setup/setup.py b/dial-docker-compose/ollama/ollama_setup/setup.py index 85e26858..cf5aa3b3 100755 --- a/dial-docker-compose/ollama/ollama_setup/setup.py +++ b/dial-docker-compose/ollama/ollama_setup/setup.py @@ -1,9 +1,8 @@ import asyncio from contextlib import 
asynccontextmanager -import logging import os import time -import requests +import httpx OLLAMA_URL = os.getenv("OLLAMA_URL") if OLLAMA_URL is None: @@ -14,46 +13,51 @@ HEALTH_FILE = "/healthy" -log = logging.getLogger(__name__) -log.setLevel(logging.INFO) -log.info(f"OLLAMA_URL = {OLLAMA_URL}") -log.info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}") -log.info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}") +def print_info(*args, **kwargs): + print(*args, **kwargs, flush=True) + + +print_info(f"OLLAMA_URL = {OLLAMA_URL}") +print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}") +print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}") @asynccontextmanager async def timer(name: str): - log.info(f"[{name}] Starting...") + print_info(f"[{name}] Starting...") start = time.perf_counter() yield elapsed = time.perf_counter() - start - log.info(f"[{name}] Executed in {elapsed:.2f} seconds") + print_info(f"[{name}] Executed in {elapsed:.2f} seconds") + + +ollama = httpx.AsyncClient(base_url=OLLAMA_URL, timeout=300) -async def wait_for_ollama(): +async def wait_for_startup(): while True: try: - if requests.get(OLLAMA_URL).ok: + if (await ollama.get("/")).is_success: break - except requests.RequestException: + except Exception: pass await asyncio.sleep(1) async def pull_model(model): data = {"name": model, "stream": False} - requests.post(f"{OLLAMA_URL}/api/pull", json=data).raise_for_status() + (await ollama.post("/api/pull", json=data)).raise_for_status() async def create_alias(source, dest): data = {"source": source, "destination": dest} - requests.post(f"{OLLAMA_URL}/api/copy", json=data).raise_for_status() + (await ollama.post(f"/api/copy", json=data)).raise_for_status() async def load_model(model): data = {"model": model} - requests.post(f"{OLLAMA_URL}/api/generate", json=data).raise_for_status() + (await ollama.post(f"/api/generate", json=data)).raise_for_status() async def mark_as_healthy(): @@ -62,7 +66,7 @@ async def mark_as_healthy(): async def main(): async with timer("Waiting for Ollama to start"): - await wait_for_ollama() + await wait_for_startup() for model, alias in [ (OLLAMA_CHAT_MODEL, "chat-model"), From 517b1fae7d2d5617342e2279b5b7d3f6e06ca234 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Wed, 18 Sep 2024 09:54:25 +0000 Subject: [PATCH 12/16] feat: added ci test for self-hosted embedding model --- dial-docker-compose/ci/ollama/.env | 3 +- dial-docker-compose/ci/ollama/test/app.py | 29 +++++++++++ dial-docker-compose/ollama/core/config.json | 7 ++- dial-docker-compose/ollama/docker-compose.yml | 1 + .../ollama/ollama_setup/setup.py | 7 ++- .../quick-start-with-self-hosted-model.md | 48 ++++++++++++++----- 6 files changed, 78 insertions(+), 17 deletions(-) diff --git a/dial-docker-compose/ci/ollama/.env b/dial-docker-compose/ci/ollama/.env index 58a1c316..15ace1c5 100644 --- a/dial-docker-compose/ci/ollama/.env +++ b/dial-docker-compose/ci/ollama/.env @@ -1,3 +1,4 @@ DIAL_DIR="./ollama" OLLAMA_CHAT_MODEL=llama3.1:8b-instruct-q4_0 -OLLAMA_VISION_MODEL=llava-phi3:3.8b-mini-q4_0 \ No newline at end of file +OLLAMA_VISION_MODEL=llava-phi3:3.8b-mini-q4_0 +OLLAMA_EMBEDDING_MODEL=nomic-embed-text:137m-v1.5-fp16 \ No newline at end of file diff --git a/dial-docker-compose/ci/ollama/test/app.py b/dial-docker-compose/ci/ollama/test/app.py index 6f805f43..66e49085 100644 --- a/dial-docker-compose/ci/ollama/test/app.py +++ b/dial-docker-compose/ci/ollama/test/app.py @@ -1,6 +1,7 @@ import base64 import os from pathlib import Path +from typing import Any import aiohttp import asyncio import 
backoff @@ -71,6 +72,25 @@ async def dial_chat_completion(deployment_id: str, messages: list) -> str: return content +async def dial_embeddings(deployment_id: str, input: Any) -> str: + api_url = f"{DIAL_URL}/openai/deployments/{deployment_id}/embeddings" + + payload = { + "model": deployment_id, + "input": input, + } + headers = {"api-key": DIAL_API_KEY} + params = {"api-version": DIAL_API_VERSION} + + body = await post_with_retry(api_url, payload, headers, params) + log.debug(f"Response: {body}") + + embedding = body.get("data", [])[0].get("embedding", []) + + log.debug(f"Len embedding vector: {len(embedding)}") + + return embedding + async def test_chat_model(deployment_id: str): message = "2 + 3 = ? Reply with a single number:" messages = [{"role": "user", "content": message}] @@ -99,6 +119,13 @@ async def test_vision_model(deployment_id: str): if "vision" not in content.lower(): raise ValueError(f"Test failed for {deployment_id!r}") +async def test_embedding_model(deployment_id: str): + embeddings = await dial_embeddings(deployment_id, "cat") + + if len(embeddings) == 0 or not isinstance(embeddings[0], float): + raise ValueError(f"Test failed for {deployment_id!r}") + + async def tests(): async with timer("Testing chat-model"): await test_chat_model("chat-model") @@ -106,6 +133,8 @@ async def tests(): async with timer("Testing vision-model"): await test_vision_model("vision-model") + async with timer("Testing embedding-model"): + await test_embedding_model("embedding-model") if __name__ == "__main__": loop = asyncio.get_event_loop() diff --git a/dial-docker-compose/ollama/core/config.json b/dial-docker-compose/ollama/core/config.json index 4a55bd1e..e4b370cc 100644 --- a/dial-docker-compose/ollama/core/config.json +++ b/dial-docker-compose/ollama/core/config.json @@ -20,6 +20,10 @@ "key": "dummy-key" } ] + }, + "embedding-model": { + "type": "embedding", + "endpoint": "http://ollama:11434/v1/embeddings" } }, "keys": { @@ -32,7 +36,8 @@ "default": { "limits": { "chat-model": {}, - "vision-model": {} + "vision-model": {}, + "embedding-model": {} } } } diff --git a/dial-docker-compose/ollama/docker-compose.yml b/dial-docker-compose/ollama/docker-compose.yml index 9712aa6a..68e316f0 100644 --- a/dial-docker-compose/ollama/docker-compose.yml +++ b/dial-docker-compose/ollama/docker-compose.yml @@ -19,6 +19,7 @@ services: - OLLAMA_URL=http://ollama:11434 - OLLAMA_CHAT_MODEL=${OLLAMA_CHAT_MODEL} - OLLAMA_VISION_MODEL=${OLLAMA_VISION_MODEL} + - OLLAMA_EMBEDDING_MODEL=${OLLAMA_EMBEDDING_MODEL} healthcheck: test: ["CMD", "test", "-f", "/healthy"] interval: 10s diff --git a/dial-docker-compose/ollama/ollama_setup/setup.py b/dial-docker-compose/ollama/ollama_setup/setup.py index cf5aa3b3..ccc4d358 100755 --- a/dial-docker-compose/ollama/ollama_setup/setup.py +++ b/dial-docker-compose/ollama/ollama_setup/setup.py @@ -10,6 +10,7 @@ OLLAMA_CHAT_MODEL = os.getenv("OLLAMA_CHAT_MODEL") OLLAMA_VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL") +OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL") HEALTH_FILE = "/healthy" @@ -21,6 +22,7 @@ def print_info(*args, **kwargs): print_info(f"OLLAMA_URL = {OLLAMA_URL}") print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}") print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}") +print_info(f"OLLAMA_EMBEDDING_MODEL = {OLLAMA_EMBEDDING_MODEL}") @asynccontextmanager @@ -29,7 +31,7 @@ async def timer(name: str): start = time.perf_counter() yield elapsed = time.perf_counter() - start - print_info(f"[{name}] Executed in {elapsed:.2f} seconds") + 
print_info(f"[{name}] Finished in {elapsed:.2f} seconds")
 
 
 ollama = httpx.AsyncClient(base_url=OLLAMA_URL, timeout=300)
@@ -71,6 +73,7 @@ async def main():
     for model, alias in [
         (OLLAMA_CHAT_MODEL, "chat-model"),
         (OLLAMA_VISION_MODEL, "vision-model"),
+        (OLLAMA_EMBEDDING_MODEL, "embedding-model"),
     ]:
         if model:
             async with timer(f"Pulling model {model}"):
@@ -79,7 +82,7 @@ async def main():
             async with timer(f"Creating alias for {model}: {alias}"):
                 await create_alias(model, alias)
 
-    if model_to_load := OLLAMA_CHAT_MODEL or OLLAMA_VISION_MODEL:
+    if model_to_load := (OLLAMA_CHAT_MODEL or OLLAMA_VISION_MODEL):
         async with timer(f"Loading model {model_to_load} into memory"):
             await load_model(model_to_load)
 
diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md
index c850eec3..54c96785 100644
--- a/docs/tutorials/quick-start-with-self-hosted-model.md
+++ b/docs/tutorials/quick-start-with-self-hosted-model.md
@@ -22,22 +22,32 @@ cd dial-docker-compose/ollama
 
 Ollama supports a wide range of popular open-source models.
 
-Consider first the modality your interested in - is a regular text-to-text chat model or a multi-modal vision model?
+Consider first the modality you are interested in - is it a regular text-to-text chat model, a multi-modal vision model or an embedding model?
 
-Follow the feature tags at https://ollama.com/search to find an appropriate model, e.g. one working with `Code`, supporting `Tools` or a `Vision` feature.
+Follow the feature tags _(`Embeddings`, `Code`, `Tools`, `Vision`)_ at https://ollama.com/search to find an appropriate model.
 
-We recommend to choose one of the following models which have been tested:
+We recommend to choose one of the following models which have been tested.
 
-|Model|Vision|Tools|Streaming|
-|----|----|----|----|
-|[llama3.1:8b-instruct-q4_0](https://ollama.com/library/llama3.1:8b-instruct-q4_0)|❌|✅ *(only in non-streaming mode)*|✅|
-|[mistral:7b-instruct-q4_0](https://ollama.com/library/mistral:7b-instruct-q4_0)|❌|❌|✅|
-|[phi3.5:3.8b-mini-instruct-q4_0](https://ollama.com/library/phi3.5:3.8b-mini-instruct-q4_0)|❌|❌|✅|
-|[gemma2:2b-instruct-q4_0](https://ollama.com/library/gemma2:2b-instruct-q4_0)|❌|❌|✅|
-|[llava:7b-v1.6-mistral-q4_0](https://ollama.com/library/llava:7b-v1.6-mistral-q4_0)|✅|❌|❌|
-|[llava-phi3:3.8b-mini-q4_0](https://ollama.com/library/llava-phi3:3.8b-mini-q4_0)|✅|❌|❌|
+### Chat models
 
-All the listed models support streaming.
+|Model|Tools|
+|----|----|
+|[llama3.1:8b-instruct-q4_0](https://ollama.com/library/llama3.1:8b-instruct-q4_0)|✅ *(only in non-streaming mode)*|
+|[mistral:7b-instruct-q4_0](https://ollama.com/library/mistral:7b-instruct-q4_0)|❌|
+|[phi3.5:3.8b-mini-instruct-q4_0](https://ollama.com/library/phi3.5:3.8b-mini-instruct-q4_0)|❌|
+|[gemma2:2b-instruct-q4_0](https://ollama.com/library/gemma2:2b-instruct-q4_0)|❌|
+
+All the models support streaming. 
+ +### Vision models + +* [llava:7b-v1.6-mistral-q4_0](https://ollama.com/library/llava:7b-v1.6-mistral-q4_0) +* [llava-phi3:3.8b-mini-q4_0](https://ollama.com/library/llava-phi3:3.8b-mini-q4_0) + +### Embedding models + +* [nomic-embed-text:137m-v1.5-fp16](https://ollama.com/library/nomic-embed-text:137m-v1.5-fp16) +* [bge-m3:567m-fp16](https://ollama.com/library/bge-m3:567m-fp16) ## Step 3: Launch AI DIAL Chat @@ -65,4 +75,16 @@ The model will be pulled and loaded into the memory of the Ollama server automat Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select `Self-hosted vision model` deployment to converse with the model. -> Note, that the vision models we tested, do not support streaming of response. Moreover, they are typically more computationally expensive than the chat models. So it may take minutes for a vision model to respond. \ No newline at end of file +> Note, that the vision models we tested, do not support streaming of response. Moreover, they are typically more computationally expensive than the chat models. So it may take minutes for a vision model to respond. + +### Embedding model + +If you have chosen an embedding model _(e.g. nomic-embed-text:137m-v1.5-fp16)_, then run the command: + +```sh +OLLAMA_EMBEDDING_MODEL=model_of_your_choice docker compose up --abort-on-container-exit +``` + +The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run. + +The embedding model will become available in DIAL under the deployment name `embedding-model` and could be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`. \ No newline at end of file From bdd3536e4911f39a1eed9a67241825f77d4c5724 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Thu, 19 Sep 2024 14:27:20 +0000 Subject: [PATCH 13/16] feat: used .env file instead of env vars in self-hosted model tutorial --- dial-docker-compose/ollama/.env | 5 ++- .../quick-start-with-self-hosted-model.md | 38 +++++++------------ 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/dial-docker-compose/ollama/.env b/dial-docker-compose/ollama/.env index 4daa1416..cdabc6b5 100644 --- a/dial-docker-compose/ollama/.env +++ b/dial-docker-compose/ollama/.env @@ -1 +1,4 @@ -DIAL_DIR="./ollama" \ No newline at end of file +DIAL_DIR="./ollama" +OLLAMA_CHAT_MODEL= +OLLAMA_VISION_MODEL= +OLLAMA_EMBEDDING_MODEL= \ No newline at end of file diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md index 54c96785..337e6297 100644 --- a/docs/tutorials/quick-start-with-self-hosted-model.md +++ b/docs/tutorials/quick-start-with-self-hosted-model.md @@ -51,40 +51,28 @@ All the models support streaming. ## Step 3: Launch AI DIAL Chat -### Chat model +Configure `.env` file in the current directory according to the type of model you've chosen: -If you have chosen a regular chat model _(e.g. llama3.1:8b-instruct-q4_0)_, then run the command: +* Set `OLLAMA_CHAT_MODEL` for the name of a text model. +* Set `OLLAMA_VISION_MODEL` for the name of a vision model. +* Set `OLLAMA_EMBEDDING_MODEL` for the name of an embedding model. -```sh -OLLAMA_CHAT_MODEL=model_of_your_choice docker compose up --abort-on-container-exit -``` - -The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run. 
- -Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select `Self-hosted chat model` deployment to converse with the model. - -### Vision model +It's not necessary to configure all the models. +If a model isn't set, then it won't be downloaded. -If you have chosen a vision model _(e.g. llava-phi3:3.8b-mini-q4_0)_, then run the command: +Then run the command: ```sh -OLLAMA_VISION_MODEL=model_of_your_choice docker compose up --abort-on-container-exit +docker compose up --abort-on-container-exit ``` -The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run. +It will pull and load into the memory of the Ollama server the specified models. -Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select `Self-hosted vision model` deployment to converse with the model. +> Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or dozens of minutes)_ to download them on the first run depending on your Internet bandwidth. -> Note, that the vision models we tested, do not support streaming of response. Moreover, they are typically more computationally expensive than the chat models. So it may take minutes for a vision model to respond. - -### Embedding model - -If you have chosen an embedding model _(e.g. nomic-embed-text:137m-v1.5-fp16)_, then run the command: - -```sh -OLLAMA_EMBEDDING_MODEL=model_of_your_choice docker compose up --abort-on-container-exit -``` +Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select an appropriate DIAL deployments to converse with: -The model will be pulled and loaded into the memory of the Ollama server automatically. This may take a minute on the first run. +* `Self-hosted chat model` deployment for the `OLLAMA_CHAT_MODEL`, +* `Self-hosted vision model` deployment for the `OLLAMA_VISION_MODEL`, The embedding model will become available in DIAL under the deployment name `embedding-model` and could be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`. 
\ No newline at end of file From 1125e215e8d17b5ecb2c7fa18d9ccb31f51b9ba2 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Thu, 19 Sep 2024 14:47:34 +0000 Subject: [PATCH 14/16] fix: increased timeout in the ollama setup script --- dial-docker-compose/ollama/ollama_setup/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dial-docker-compose/ollama/ollama_setup/setup.py b/dial-docker-compose/ollama/ollama_setup/setup.py index ccc4d358..fdaad8b5 100755 --- a/dial-docker-compose/ollama/ollama_setup/setup.py +++ b/dial-docker-compose/ollama/ollama_setup/setup.py @@ -34,7 +34,7 @@ async def timer(name: str): print_info(f"[{name}] Finished in {elapsed:.2f} seconds") -ollama = httpx.AsyncClient(base_url=OLLAMA_URL, timeout=300) +ollama = httpx.AsyncClient(base_url=OLLAMA_URL, timeout=300000) async def wait_for_startup(): From 036a8409b09c38db5196d510d052785b71be2c75 Mon Sep 17 00:00:00 2001 From: sr-remsha Date: Thu, 19 Sep 2024 18:49:56 +0200 Subject: [PATCH 15/16] review --- docs/README.md | 4 +- .../quick-start-with-self-hosted-model.md | 39 +++++++++---------- sidebars.js | 2 +- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/docs/README.md b/docs/README.md index edde7112..93b9a2de 100644 --- a/docs/README.md +++ b/docs/README.md @@ -34,8 +34,8 @@ * [Launch AI DIAL Chat with an Azure model](./tutorials/quick-start-model.md) * [Launch AI DIAL Chat with a self-hosted model](./tutorials/quick-start-with-self-hosted-model.md) -* [Launch AI DIAL Chat with a Sample Application](./tutorials/quick-start-with-application.md) -* [Launch AI DIAL Chat with a Sample Addon](./tutorials/quick-start-with-addon.md) +* [Launch AI DIAL Chat with a sample application](./tutorials/quick-start-with-application.md) +* [Launch AI DIAL Chat with a sample addon](./tutorials/quick-start-with-addon.md) ## AI DIAL Chat Application User Manual diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md index 337e6297..bebdfd37 100644 --- a/docs/tutorials/quick-start-with-self-hosted-model.md +++ b/docs/tutorials/quick-start-with-self-hosted-model.md @@ -1,4 +1,4 @@ -# Launch AI DIAL Chat with a self-hosted model +# Launch AI DIAL Chat with a Self-Hosted Model ## Introduction @@ -24,9 +24,9 @@ Ollama supports a wide range of popular open-source models. Consider first the modality your are interested in - is it a regular text-to-text chat model, a multi-modal vision model or an embedding model? -Follow the feature tags _(`Embeddings`, `Code`, `Tools`, `Vision`)_ at https://ollama.com/search to find an appropriate model. +Follow the feature tags _(`Embeddings`, `Code`, `Tools`, `Vision`)_ at [Ollama Search](https://ollama.com/search) to find the appropriate model. -We recommend to choose one of the following models which have been tested. +We recommend choosing one of the following models which have been tested. ### Chat models @@ -51,28 +51,25 @@ All the models support streaming. ## Step 3: Launch AI DIAL Chat -Configure `.env` file in the current directory according to the type of model you've chosen: +1. Configure `.env` file in the current directory according to the type of model you've chosen: -* Set `OLLAMA_CHAT_MODEL` for the name of a text model. -* Set `OLLAMA_VISION_MODEL` for the name of a vision model. -* Set `OLLAMA_EMBEDDING_MODEL` for the name of an embedding model. + * Set `OLLAMA_CHAT_MODEL` for the name of a text model. + * Set `OLLAMA_VISION_MODEL` for the name of a vision model. 
+ * Set `OLLAMA_EMBEDDING_MODEL` for the name of an embedding model. + + **Note**: It's not necessary to configure all the models. If a model isn't set, then it won't be downloaded. -It's not necessary to configure all the models. -If a model isn't set, then it won't be downloaded. +2. Then run the following command to pull and load into the memory of the Ollama server the specified models: -Then run the command: + ```sh + docker compose up --abort-on-container-exit + ``` -```sh -docker compose up --abort-on-container-exit -``` - -It will pull and load into the memory of the Ollama server the specified models. - -> Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or dozens of minutes)_ to download them on the first run depending on your Internet bandwidth. + > Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or dozens of minutes)_ to download them on the first run depending on your Internet bandwidth. -Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select an appropriate DIAL deployments to converse with: +3. Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select an appropriate AI DIAL deployments to converse with: -* `Self-hosted chat model` deployment for the `OLLAMA_CHAT_MODEL`, -* `Self-hosted vision model` deployment for the `OLLAMA_VISION_MODEL`, + * `Self-hosted chat model` deployment for the `OLLAMA_CHAT_MODEL` + * `Self-hosted vision model` deployment for the `OLLAMA_VISION_MODEL` -The embedding model will become available in DIAL under the deployment name `embedding-model` and could be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`. \ No newline at end of file +The embedding model will become available in AI DIAL under the deployment name `embedding-model` and could be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`. 
\ No newline at end of file diff --git a/sidebars.js b/sidebars.js index eba6d6dd..dc81800b 100644 --- a/sidebars.js +++ b/sidebars.js @@ -105,7 +105,7 @@ const sidebars = { { type: 'doc', id: 'tutorials/quick-start-with-self-hosted-model', - label: 'Chat with a self-hosted model', + label: 'Chat with a Self-Hosted Model', }, { type: 'doc', From 3c3e4c81bfc3d0ab9c2577a5287d84642970d9cf Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Thu, 19 Sep 2024 16:59:46 +0000 Subject: [PATCH 16/16] feat: added progress bar for model downloading --- dial-docker-compose/ollama/.env.example | 3 + .../ollama/ollama_setup/requirements.txt | 4 +- .../ollama/ollama_setup/setup.py | 81 +++++++++++++------ .../quick-start-with-self-hosted-model.md | 8 +- 4 files changed, 69 insertions(+), 27 deletions(-) create mode 100644 dial-docker-compose/ollama/.env.example diff --git a/dial-docker-compose/ollama/.env.example b/dial-docker-compose/ollama/.env.example new file mode 100644 index 00000000..06216195 --- /dev/null +++ b/dial-docker-compose/ollama/.env.example @@ -0,0 +1,3 @@ +OLLAMA_CHAT_MODEL=llama3.1:8b-instruct-q4_0 +OLLAMA_VISION_MODEL=llava-phi3:3.8b-mini-q4_0 +OLLAMA_EMBEDDING_MODEL=bge-m3:567m-fp16 \ No newline at end of file diff --git a/dial-docker-compose/ollama/ollama_setup/requirements.txt b/dial-docker-compose/ollama/ollama_setup/requirements.txt index 0329bccd..ac6a93f1 100644 --- a/dial-docker-compose/ollama/ollama_setup/requirements.txt +++ b/dial-docker-compose/ollama/ollama_setup/requirements.txt @@ -1 +1,3 @@ -httpx==0.27.2 \ No newline at end of file +httpx==0.27.2 +tqdm==4.66.5 +ollama==0.3.3 \ No newline at end of file diff --git a/dial-docker-compose/ollama/ollama_setup/setup.py b/dial-docker-compose/ollama/ollama_setup/setup.py index fdaad8b5..3e27663e 100755 --- a/dial-docker-compose/ollama/ollama_setup/setup.py +++ b/dial-docker-compose/ollama/ollama_setup/setup.py @@ -1,8 +1,11 @@ import asyncio from contextlib import asynccontextmanager import os +import sys import time -import httpx +import asyncio +from ollama import AsyncClient +from tqdm import tqdm OLLAMA_URL = os.getenv("OLLAMA_URL") if OLLAMA_URL is None: @@ -15,9 +18,19 @@ HEALTH_FILE = "/healthy" -def print_info(*args, **kwargs): - print(*args, **kwargs, flush=True) +class Writer: + @classmethod + def write(cls, s: str): + # NOTE: every tqdm progress bar update is deliberately ended with "\n", + # otherwise one wouldn't see the bar running in console upon running `docker compose up`. 
+ print(s, file=sys.stdout, flush=True, end="\n") + + @classmethod + def flush(cls): + sys.stdout.flush() + +print_info = Writer.write print_info(f"OLLAMA_URL = {OLLAMA_URL}") print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}") @@ -34,39 +47,57 @@ async def timer(name: str): print_info(f"[{name}] Finished in {elapsed:.2f} seconds") -ollama = httpx.AsyncClient(base_url=OLLAMA_URL, timeout=300000) - - async def wait_for_startup(): + attempt = 0 while True: + attempt += 1 try: - if (await ollama.get("/")).is_success: - break + await AsyncClient(host=OLLAMA_URL, timeout=5).ps() except Exception: - pass - await asyncio.sleep(1) + print_info(f"[{attempt:>3}] Waiting for Ollama to start...") + await asyncio.sleep(5) + else: + break -async def pull_model(model): - data = {"name": model, "stream": False} - (await ollama.post("/api/pull", json=data)).raise_for_status() +async def pull_model(client: AsyncClient, model: str): + response = await client.pull(model, stream=True) + progress_bar = None + prev_status = None -async def create_alias(source, dest): - data = {"source": source, "destination": dest} - (await ollama.post(f"/api/copy", json=data)).raise_for_status() + async for chunk in response: + status = chunk["status"] + total = chunk.get("total") + completed = chunk.get("completed") + if status != prev_status and total: + prev_status = status + if progress_bar: + progress_bar.close() + progress_bar = tqdm( + total=total, unit="B", unit_scale=True, desc=f"[{status}]", file=Writer + ) -async def load_model(model): - data = {"model": model} - (await ollama.post(f"/api/generate", json=data)).raise_for_status() + if completed and progress_bar and total: + progress_bar.n = completed + progress_bar.set_description(f"[{status}]") + progress_bar.refresh() + if total and total == completed and progress_bar: + progress_bar.close() -async def mark_as_healthy(): + if not completed and not total: + print_info(f"[{status}]") + + +async def create_health_mark(): open(HEALTH_FILE, "w").close() async def main(): + client = AsyncClient(host=OLLAMA_URL, timeout=300000) + async with timer("Waiting for Ollama to start"): await wait_for_startup() @@ -77,16 +108,18 @@ async def main(): ]: if model: async with timer(f"Pulling model {model}"): - await pull_model(model) + await pull_model(client, model) async with timer(f"Creating alias for {model}: {alias}"): - await create_alias(model, alias) + await client.copy(model, alias) if model_to_load := (OLLAMA_CHAT_MODEL or OLLAMA_VISION_MODEL): async with timer(f"Loading model {model_to_load} into memory"): - await load_model(model_to_load) + await client.generate(model_to_load) + + await create_health_mark() - await mark_as_healthy() + print_info("The Ollama server is up and running.") if __name__ == "__main__": diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md index bebdfd37..77b18828 100644 --- a/docs/tutorials/quick-start-with-self-hosted-model.md +++ b/docs/tutorials/quick-start-with-self-hosted-model.md @@ -56,7 +56,7 @@ All the models support streaming. * Set `OLLAMA_CHAT_MODEL` for the name of a text model. * Set `OLLAMA_VISION_MODEL` for the name of a vision model. * Set `OLLAMA_EMBEDDING_MODEL` for the name of an embedding model. - + **Note**: It's not necessary to configure all the models. If a model isn't set, then it won't be downloaded. 2. 
Then run the following command to pull and load into the memory of the Ollama server the specified models:
 
@@ -66,10 +66,14 @@ All the models support streaming.
    ```
 
    > Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or dozens of minutes)_ to download them on the first run depending on your Internet bandwidth.
+   >
+   > The model is fully loaded once the `ollama-setup` service prints `The Ollama server is up and running.`
 
 3. Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select an appropriate AI DIAL deployments to converse with:
 
    * `Self-hosted chat model` deployment for the `OLLAMA_CHAT_MODEL`
    * `Self-hosted vision model` deployment for the `OLLAMA_VISION_MODEL`
 
+> Note that the vision models we tested do not support streaming of responses. Moreover, they are typically more computationally expensive than the chat models, so it may take minutes for a vision model to respond.
+
 The embedding model will become available in AI DIAL under the deployment name `embedding-model` and could be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`.
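
For reference, below is a minimal sketch of calling that endpoint from Python once the stack is running, in the spirit of the CI test added earlier in this series. The `dial_api_key` value and the `api-version` parameter are assumptions for illustration and must match the key configured in `core/config.json` and the API version used by your client.

```python
# Illustrative sketch only: query the self-hosted embedding model through DIAL Core.
# Assumptions: DIAL Core listens on localhost:8080, "dial_api_key" is a key defined
# in core/config.json, and the api-version value is accepted by your deployment.
import requests

DIAL_URL = "http://localhost:8080"
DEPLOYMENT = "embedding-model"

response = requests.post(
    f"{DIAL_URL}/openai/deployments/{DEPLOYMENT}/embeddings",
    headers={"api-key": "dial_api_key"},
    params={"api-version": "2024-02-01"},
    json={"input": "cat", "model": DEPLOYMENT},
    timeout=60,
)
response.raise_for_status()

# The embedding vector is returned under data[0].embedding as a list of floats.
embedding = response.json()["data"][0]["embedding"]
print(f"Embedding vector length: {len(embedding)}")
```

A successful response carries the vector under `data[0].embedding` as a plain list of floats, which is exactly what the CI embedding test in this series asserts.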