Merge branch 'collaboration-section' of https://github.com/epam/ai-dial into collaboration-section
sr-remsha committed Sep 23, 2024
2 parents e3a9fc5 + 97f55a4 commit eade006
Showing 14 changed files with 196 additions and 64 deletions.
3 changes: 2 additions & 1 deletion dial-docker-compose/ci/ollama/test/app.py
@@ -58,7 +58,8 @@ async def dial_chat_completion(deployment_id: str, messages: list) -> str:
    payload = {
        "model": deployment_id,
        "messages": messages,
        "stream": False,
        "temperature": 0.0,
        "stream": False
    }
    headers = {"api-key": DIAL_API_KEY}
    params = {"api-version": DIAL_API_VERSION}
1 change: 1 addition & 0 deletions dial-docker-compose/common.yml
@@ -17,6 +17,7 @@ services:
      DIAL_API_HOST: "http://core:8080"
      DIAL_API_KEY: "dial_api_key"
      ENABLED_FEATURES: "conversations-section,prompts-section,top-settings,top-clear-conversation,top-chat-info,top-chat-model-settings,empty-chat-settings,header,footer,request-api-key,report-an-issue,likes,input-files,attachments-manager"
      KEEP_ALIVE_TIMEOUT: ${CHAT_KEEP_ALIVE_TIMEOUT}

  redis:
    image: redis:7.2.4-alpine3.19
2 changes: 2 additions & 0 deletions dial-docker-compose/ollama/.env
@@ -1,4 +1,6 @@
DIAL_DIR="./ollama"
CHAT_KEEP_ALIVE_TIMEOUT=600000

OLLAMA_CHAT_MODEL=
OLLAMA_VISION_MODEL=
OLLAMA_EMBEDDING_MODEL=
5 changes: 0 additions & 5 deletions dial-docker-compose/ollama/docker-compose.yml
@@ -20,11 +20,6 @@ services:
      - OLLAMA_CHAT_MODEL=${OLLAMA_CHAT_MODEL}
      - OLLAMA_VISION_MODEL=${OLLAMA_VISION_MODEL}
      - OLLAMA_EMBEDDING_MODEL=${OLLAMA_EMBEDDING_MODEL}
    healthcheck:
      test: ["CMD", "test", "-f", "/healthy"]
      interval: 10s
      start_period: 10s
      retries: 10

  adapter-openai:
    image: epam/ai-dial-adapter-openai:0.14.0
2 changes: 2 additions & 0 deletions dial-docker-compose/ollama/ollama_setup/.dockerignore
@@ -0,0 +1,2 @@
.dockerignore
.venv
9 changes: 8 additions & 1 deletion dial-docker-compose/ollama/ollama_setup/Dockerfile
@@ -1,7 +1,14 @@
FROM python:3.11-alpine

RUN apk --no-cache add curl

WORKDIR /app
COPY * /app
RUN pip install -r requirements.txt

CMD ["sh", "-c", "python setup.py && tail -f /dev/null"]
EXPOSE 5000

HEALTHCHECK --interval=10s --timeout=1s --start-period=10s --retries=10 \
    CMD curl --fail http://localhost:5000/health || exit 1

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "5000"]
dial-docker-compose/ollama/ollama_setup/setup.py → dial-docker-compose/ollama/ollama_setup/app.py
@@ -1,12 +1,13 @@
import asyncio
from contextlib import asynccontextmanager
import os
import sys
import time
import asyncio
from fastapi import FastAPI
from ollama import AsyncClient
from tqdm import tqdm

from utils import Writer, print_info, timer

OLLAMA_URL = os.getenv("OLLAMA_URL")
if OLLAMA_URL is None:
    raise RuntimeError("OLLAMA_URL env var isn't set")
@@ -15,46 +16,15 @@
OLLAMA_VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL")
OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL")

HEALTH_FILE = "/healthy"


class Writer:
    @classmethod
    def write(cls, s: str):
        # NOTE: every tqdm progress bar update is deliberately ended with "\n",
        # otherwise one wouldn't see the bar running in console upon running `docker compose up`.
        print(s, file=sys.stdout, flush=True, end="\n")

    @classmethod
    def flush(cls):
        sys.stdout.flush()


print_info = Writer.write

print_info(f"OLLAMA_URL = {OLLAMA_URL}")
print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")
print_info(f"OLLAMA_EMBEDDING_MODEL = {OLLAMA_EMBEDDING_MODEL}")


@asynccontextmanager
async def timer(name: str):
    print_info(f"[{name}] Starting...")
    start = time.perf_counter()
    yield
    elapsed = time.perf_counter() - start
    print_info(f"[{name}] Finished in {elapsed:.2f} seconds")


async def wait_for_startup():
    attempt = 0
    attempts = 0
    while True:
        attempt += 1
        attempts += 1
        try:
            await AsyncClient(host=OLLAMA_URL, timeout=5).ps()
        except Exception:
            print_info(f"[{attempt:>3}] Waiting for Ollama to start...")
            print_info(f"[{attempts:>3}] Waiting for Ollama to start...")
            await asyncio.sleep(5)
        else:
            break
@@ -73,30 +43,34 @@ async def pull_model(client: AsyncClient, model: str):

        if status != prev_status and total:
            prev_status = status
            if progress_bar:
                progress_bar.close()
            progress_bar = tqdm(
                total=total, unit="B", unit_scale=True, desc=f"[{status}]", file=Writer
                total=total,
                unit="B",
                unit_scale=True,
                desc=f"[{status}]",
                mininterval=1,
                file=Writer,
            )

        if completed and progress_bar and total:
        if completed and total and progress_bar:
            progress_bar.n = completed
            progress_bar.set_description(f"[{status}]")
            progress_bar.refresh()
            progress_bar.update(n=0)

        if total and total == completed and progress_bar:
            progress_bar.close()
            progress_bar = None

        if not completed and not total:
            print_info(f"[{status}]")


async def create_health_mark():
    open(HEALTH_FILE, "w").close()


async def startup():
    print_info(f"OLLAMA_URL = {OLLAMA_URL}")
    print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
    print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")
    print_info(f"OLLAMA_EMBEDDING_MODEL = {OLLAMA_EMBEDDING_MODEL}")


async def main():
    client = AsyncClient(host=OLLAMA_URL, timeout=300000)
    client = AsyncClient(host=OLLAMA_URL, timeout=300)

    async with timer("Waiting for Ollama to start"):
        await wait_for_startup()
@@ -117,11 +91,18 @@ async def main():
async with timer(f"Loading model {model_to_load} into memory"):
await client.generate(model_to_load)

await create_health_mark()

print_info("The Ollama server is up and running.")


if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
@asynccontextmanager
async def lifespan(app):
await startup()
yield


app = FastAPI(lifespan=lifespan)


@app.get("/health")
def health_check():
return {"status": "ok"}
4 changes: 3 additions & 1 deletion dial-docker-compose/ollama/ollama_setup/requirements.txt
@@ -1,3 +1,5 @@
httpx==0.27.2
tqdm==4.66.5
ollama==0.3.3
ollama==0.3.3
fastapi==0.115.0
uvicorn==0.30.6
38 changes: 38 additions & 0 deletions dial-docker-compose/ollama/ollama_setup/utils.py
@@ -0,0 +1,38 @@
import logging
import sys
import time
from contextlib import asynccontextmanager


class Writer:
    @classmethod
    def write(cls, s: str):
        # NOTE: every tqdm progress bar update is deliberately ended with "\n",
        # otherwise one wouldn't see the bar running in console upon running `docker compose up`.
        if s in ["\n", ""]:
            return
        print(s.strip(), file=sys.stderr, flush=True, end="\n")

    @classmethod
    def flush(cls):
        sys.stderr.flush()


print_info = Writer.write


@asynccontextmanager
async def timer(name: str):
    print_info(f"[{name}] Starting...")
    start = time.perf_counter()
    yield
    elapsed = time.perf_counter() - start
    print_info(f"[{name}] Finished in {elapsed:.2f} seconds")


class HealthFilter(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        return record.getMessage().find("/health") == -1


logging.getLogger("uvicorn.access").addFilter(HealthFilter())
2 changes: 2 additions & 0 deletions docs/architecture.md
@@ -189,6 +189,8 @@ Examples of the computed artifacts:
- Language of conversations.
- Any other calculated statistics based on conversations.

> Refer to [Tutorials](/tutorials/realtime-analytics) to learn more.

## Extensions

Extensions such as Applications, Addons, Assistants and Adapters can be additionally developed and deployed to communicate with the AI DIAL Core via the Unified Protocol.
Binary file added docs/tutorials/img/grafana.png
8 changes: 4 additions & 4 deletions docs/tutorials/quick-start-with-self-hosted-model.md
@@ -65,15 +65,15 @@ All the models support streaming.
docker compose up --abort-on-container-exit
```

> Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or dozens of minutes)_ to download them on the first run depending on your Internet bandwidth.
> Keep in mind that the typical size of a lightweight Ollama model is a few gigabytes, so the first download may take a few minutes _(or more)_, depending on your internet bandwidth and the size of the model you choose.
>
> The model is fully loaded once `ollama-setup` service prints `The Ollama server is up and running.`
> The models are fully loaded once the `ollama-setup` service prints `The Ollama server is up and running.`

3. Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select one of the AI DIAL deployments to converse with:

* `Self-hosted chat model` deployment for the `OLLAMA_CHAT_MODEL`
* `Self-hosted vision model` deployment for the `OLLAMA_VISION_MODEL`

> Note that the vision models we tested do not support response streaming. Moreover, they are typically more computationally expensive than the chat models, so it may take minutes for a vision model to respond.

The embedding model will become available in AI DIAL under the deployment name `embedding-model` and can be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`.
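
As a quick check, you can call this endpoint directly. The sketch below assumes the default `dial_api_key` from the Docker Compose files and an OpenAI-style embeddings request body; adjust both to your setup.

```bash
# Hypothetical smoke test for the embeddings deployment.
# The api-key value comes from the default docker-compose configuration;
# the request body shape is an assumption based on the OpenAI embeddings API.
curl "http://localhost:8080/openai/deployments/embedding-model/embeddings" \
  -H "api-key: dial_api_key" \
  -H "Content-Type: application/json" \
  -d '{"input": "Hello, DIAL!"}'
```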
96 changes: 96 additions & 0 deletions docs/tutorials/realtime-analytics.md
@@ -0,0 +1,96 @@
# Analytics

## Introduction

[DIAL Core](https://github.com/epam/ai-dial-core) collects **system logs** and **chat completion logs**:

* System logs contain records of all requests from system components to AI DIAL Core and do not include any user data. They can be collected with the ELK stack (Elasticsearch, Logstash, Kibana) or another log collection system. **Note**: this document does not cover system logs.
* [Chat completion request](https://epam-rail.com/dial_api#/paths/~1openai~1deployments~1%7BDeployment%20Name%7D~1chat~1completions/post) logs include the information that users send in their requests to LLMs and the information they receive in responses.

> **Important**: AI DIAL Core **will log** user requests and deployment responses unless the requests are authorized with a **secured** API key. This means that user requests and deployment responses will be recorded in logs for all requests authorized with an access token (such as a JWT) or an unsecured API key. Refer to [configuration](https://github.com/epam/ai-dial-core?tab=readme-ov-file#dynamic-settings) to learn how to create secured and other types of API keys.

An [AI DIAL setup](../architecture#full-platform-landscape) can include a dedicated service called DIAL Analytics Realtime, which uses techniques such as embedding algorithms, clustering algorithms, and lightweight self-hosted language models to analyze **chat completion logs** and extract the needed information, which can then be presented in tools such as Grafana for visualization and analytics.

> Refer to the [Analytics Realtime](https://github.com/epam/ai-dial-analytics-realtime) repository to learn more and view the project source code.

Analytics Realtime does not retain any private information, such as user prompts or conversations, outside the system. Only the computed artifacts are collected and stored in time-series databases like InfluxDB or any scalable database capable of handling voluminous, constantly changing information.

Examples of the computed artifacts:

* Who has used the AI? – user hash, title, and never personal data such as names.
* What areas have people asked questions about?
* Are there any recurring patterns?
* Topics of conversations.
* Unique users.
* Sentiments.
* Cost analysis of the communication.
* Language of conversations.
* Any other calculated statistics based on conversations.

## Configuration

This section outlines the steps required to configure the Analytics Realtime service and other necessary components:

- Step 1: Configure [DIAL Core](https://github.com/epam/ai-dial-core)
- Step 2: Install [InfluxDB](https://github.com/influxdata/influxdb)
- Step 3: Configure [DIAL Analytics Realtime](https://github.com/epam/ai-dial-analytics-realtime)
- Step 4: Configure [Log Collector](https://github.com/vectordotdev/vector)
- Step 5: Configure [Grafana](https://github.com/grafana/grafana)

**Flow:**

AI DIAL Core generates a `.log` file containing chat completion logs. A log collector tool then transfers this file to AI DIAL Analytics Realtime for analysis. The insights derived from the analysis are stored in InfluxDB and can subsequently be visualized using Grafana.

### Step 1: AI DIAL Core

AI DIAL Core can be configured to write chat completion logs into a specific `.log` file.

Use the default AI DIAL Core [Configuration](https://github.com/epam/ai-dial-core/blob/development/src/main/resources/gflog.xml) as reference. Refer to [GFLog](https://github.com/epam/gflog/blob/main/README.md) to learn more about the logging framework used in DIAL Core.

### Step 2: InfluxDB

Analytics Realtime uses InfluxDB to store the analytics of chat completion logs. Refer to InfluxDB documentation to learn how to [install](https://docs.influxdata.com/influxdb/v2/install/) it and how to [create tokens](https://docs.influxdata.com/influxdb/v2/admin/tokens/create-token/) to read from a bucket.

> Refer to [Configuration](https://github.com/epam/ai-dial-analytics-realtime?tab=readme-ov-file#configuration) to learn how to configure InfluxDB for the Analytics Realtime service.

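For a quick local trial, the sketch below runs InfluxDB 2.x in Docker with first-boot initialization. The organization, bucket, and token values are illustrative assumptions; pick your own and reuse them when configuring Analytics Realtime.

```bash
# A minimal local InfluxDB 2.x instance (org/bucket/token values are assumptions).
# The DOCKER_INFLUXDB_INIT_* variables are the official influxdb:2 image's
# first-boot setup mechanism.
docker run -d --name influxdb -p 8086:8086 \
  -e DOCKER_INFLUXDB_INIT_MODE=setup \
  -e DOCKER_INFLUXDB_INIT_USERNAME=admin \
  -e DOCKER_INFLUXDB_INIT_PASSWORD=admin-password \
  -e DOCKER_INFLUXDB_INIT_ORG=dial \
  -e DOCKER_INFLUXDB_INIT_BUCKET=dial_analytics \
  -e DOCKER_INFLUXDB_INIT_ADMIN_TOKEN=dial-analytics-token \
  influxdb:2
```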
### Step 3: AI DIAL Analytics Realtime

Follow the [instructions](https://github.com/epam/ai-dial-analytics-realtime/blob/development/README.md) to set up the AI DIAL Analytics Realtime service.
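
For illustration only, a containerized run might look like the sketch below. The image tag, port, and environment variable names here are assumptions rather than the service's documented interface; the README linked above is the source of truth.

```bash
# Hypothetical container run of Analytics Realtime. The variable names are
# placeholders; take the actual names from the project README.
docker run -d --name dial-analytics -p 5001:5001 \
  -e INFLUX_URL=http://influxdb:8086 \
  -e INFLUX_ORG=dial \
  -e INFLUX_BUCKET=dial_analytics \
  -e INFLUX_API_TOKEN=dial-analytics-token \
  epam/ai-dial-analytics-realtime
```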

### Step 4: Log Collector

AI DIAL uses the external open-source solution [Vector](https://github.com/vectordotdev/vector) as a log collector to transfer the `.log` file with **chat completion logs** to the AI DIAL Analytics Realtime service via HTTP. Analytics Realtime functions as a "sink" for Vector, providing an endpoint designed to receive logs from it. To connect, you only need to know the hostname and port, such as `http://localhost:5001/data` (see the example below).

> You can find additional information on delivering observability data to an HTTP server in the Vector [documentation](https://vector.dev/docs/reference/configuration/sinks/http).

This is an example of a Vector configuration:

```yaml
sources:
  aidial_logs:
    type: "file"
    max_line_bytes: 100000000
    oldest_first: true
    include:
      - /app/log/*.log # file with chat completion logs

sinks:
  http_analytics_opensource:
    inputs:
      - aidial_logs
    type: http
    uri: http://dial-analytics.dial:80/data # Analytics Realtime URI
    request:
      timeout_secs: 300
    batch:
      max_bytes: 1049000
      timeout_secs: 60
    encoding:
      codec: "json"
```
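
Before pointing Vector at the service, you can verify that the Analytics Realtime endpoint accepts HTTP POSTs. The JSON body below is only a placeholder to confirm reachability; it is not the actual chat completion log format.

```bash
# Reachability check for the Analytics Realtime sink endpoint.
# The payload is a placeholder; real input is produced by DIAL Core and Vector.
curl -X POST "http://localhost:5001/data" \
  -H "Content-Type: application/json" \
  -d '[{"message": "ping"}]'
```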
### Step 5: Grafana

Grafana can be [configured](https://grafana.com/docs/grafana/latest/datasources/influxdb/#influxdb-data-source) to use InfluxDB with the analytics of DIAL logs as a data source. You can use the pre-configured sample [dashboards](https://github.com/epam/ai-dial-analytics-realtime/blob/development/dashboards/README.md) to visualize data in Grafana.

> Refer to the Grafana documentation to learn how to [install](https://grafana.com/docs/grafana/latest/setup-grafana/installation/) it and [import dashboards](https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/import-dashboards/).

![](img/grafana.png)
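
If the Grafana panels stay empty, it can help to confirm that analytics points actually reached InfluxDB. Below is a sketch using the `influx` CLI; the bucket, org, and token values are the same assumptions as in Step 2.

```bash
# Sanity check: print a few recent points from the analytics bucket.
# Bucket, org, and token are assumptions carried over from the InfluxDB step.
influx query --org dial --token dial-analytics-token \
  'from(bucket: "dial_analytics") |> range(start: -1h) |> limit(n: 5)'
```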
5 changes: 5 additions & 0 deletions sidebars.js
@@ -177,6 +177,11 @@ const sidebars = {
          label: 'Notifications'
        }
      ]
    },
    {
      type: 'doc',
      id: 'tutorials/realtime-analytics',
      label: 'Analytics Realtime',
    }
  ],
},
