diff --git a/docs/api/client.md b/docs/api/client.md new file mode 100644 index 00000000..f79796a2 --- /dev/null +++ b/docs/api/client.md @@ -0,0 +1,135 @@ +# API Client + +## Python + +=== "Synchronous" + + ```python linenums="1" + import os + import requests + + LLM_GUARD_API_KEY = os.environ.get("LLM_GUARD_API_KEY") + LLM_GUARD_BASE_URL = os.environ.get("LLM_GUARD_URL") + + class LLMGuardMaliciousPromptException(Exception): + scores = {} + + def __init__(self, *args, **kwargs): + super().__init__(*args) + self.scores = kwargs.get("scores", {}) + + def __str__(self): + scanners = [scanner for scanner, score in self.scores.items() if score > 0] + + return f"LLM Guard detected a malicious prompt. Scanners triggered: {', '.join(scanners)}; scores: {self.scores}" + + + class LLMGuardRequestException(Exception): + pass + + def request_llm_guard_prompt(prompt: str): + try: + response = requests.post( + url=f"{LLM_GUARD_BASE_URL}/analyze/prompt", + json={"prompt": prompt}, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {LLM_GUARD_API_KEY}", + }, + verify=False, + ) + + response_json = response.json() + except requests.RequestException as err: + raise LLMGuardRequestException(err) + + if not response_json["is_valid"]: + raise LLMGuardMaliciousPromptException(scores=response_json["scanners"]) + + return response_json["sanitized_prompt"] + + prompt = "Write a Python function to calculate the factorial of a number." + sanitized_prompt = request_llm_guard_prompt(prompt) + print(sanitized_prompt) + ``` + +=== "Call LLM provider and LLM Guard API in parallel" + + ```python linenums="1" + import os + import asyncio + import aiohttp + from openai import AsyncOpenAI + + LLM_GUARD_API_KEY = os.environ.get("LLM_GUARD_API_KEY") + LLM_GUARD_BASE_URL = os.environ.get("LLM_GUARD_URL") + openai_client = AsyncOpenAI( + api_key=os.environ.get("OPENAI_API_KEY"), + ) + system_prompt = "You are a Python tutor." 
+ + class LLMGuardMaliciousPromptException(Exception): + scores = {} + + def __init__(self, *args, **kwargs): + super().__init__(*args) + self.scores = kwargs.get("scores", {}) + + def __str__(self): + scanners = [scanner for scanner, score in self.scores.items() if score > 0] + + return f"LLM Guard detected a malicious prompt. Scanners triggered: {', '.join(scanners)}; scores: {self.scores}" + + + class LLMGuardRequestException(Exception): + pass + + async def request_openai(prompt: str) -> str: + chat_completion = await openai_client.chat.completions.create( + messages=[ + { + "role": "system", + "content": system_prompt, + }, + {"role": "user", "content": prompt}, + ], + model="gpt-3.5-turbo", + ) + + return chat_completion.choices[0].message.content + + + async def request_llm_guard_prompt(prompt: str): + async with aiohttp.ClientSession() as session: + try: + response = await session.post( + url=f"{LLM_GUARD_BASE_URL}/analyze/prompt", + json={"prompt": prompt}, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {LLM_GUARD_API_KEY}", + }, + ssl=False, + raise_for_status=True, + ) + + response_json = await response.json() + except Exception as e: + raise LLMGuardRequestException(e) + + if not response_json["is_valid"]: + raise LLMGuardMaliciousPromptException(scores=response_json["scanners"]) + + async def generate_completion(prompt: str) -> str: + result = await asyncio.gather( + request_llm_guard_prompt(prompt), + request_openai(prompt), + ) + + return result[1] + + prompt = "Write a Python function to calculate the factorial of a number." + message = asyncio.run( + generate_completion(prompt) + ) + ``` diff --git a/docs/api/deployment.md b/docs/api/deployment.md new file mode 100644 index 00000000..a0ed2c15 --- /dev/null +++ b/docs/api/deployment.md @@ -0,0 +1,63 @@ +# API Deployment + +## From source + +1. Copy the code from [llm_guard_api](https://github.com/protectai/llm-guard/tree/main/llm_guard_api) + +2. 
Install dependencies (preferably in a virtual environment) +```bash +python -m pip install ".[cpu]" +python -m pip install ".[gpu]" # If you have a GPU +``` + +3. Alternatively, you can use the Makefile: +```bash +make install +``` + +4. Run the API locally: +```bash +make run +``` + +## From Docker + +Either build the Docker image or pull our official image from [Docker Hub](https://hub.docker.com/r/laiyer/llm-guard-api). + +In order to build the Docker image, run the following command: + +```bash +make build-docker-multi +make build-docker-cuda-multi # If you have a GPU +``` + +Or pull the official image: + +```bash +docker pull laiyer/llm-guard-api:latest +``` + +Now, you can run the Docker container: + +```bash +docker run -d -p 8000:8000 -e LOG_LEVEL='DEBUG' -e API_TOKEN='my-token' laiyer/llm-guard-api:latest +``` + +This will start the API on port 8000. You can now access the API at `http://localhost:8000/swagger.json`. + +If you want to use a custom configuration, you can mount a volume to `/home/user/app/config`: + +```bash +docker run -d -p 8000:8000 -e LOG_LEVEL='INFO' -v ./config:/home/user/app/config laiyer/llm-guard-api:latest +``` + +!!! warning + + We recommend at least 16GB of RAM allocated to Docker. We are working on optimizing the memory usage when the container starts. + +## Troubleshooting + +### Out-of-memory error + +If you get an out-of-memory error, you can change the `config.yml` file to use fewer scanners. +Alternatively, you can enable `low_cpu_mem_usage` in scanners that rely on HuggingFace models. diff --git a/docs/api/overview.md b/docs/api/overview.md new file mode 100644 index 00000000..61795a57 --- /dev/null +++ b/docs/api/overview.md @@ -0,0 +1,54 @@ +# API + +LLM Guard can be deployed as an API. We rely on [FastAPI](https://fastapi.tiangolo.com/) and [Uvicorn](https://www.uvicorn.org/) to serve the API. + +## Configuration + +All configurations are stored in `config/scanners.yml`. It supports configuring via environment variables. + +!!! 
note + + Scanners will be executed in the order of configuration. + +### Default environment variables + +- `LOG_LEVEL` (str): Log level. Default is `INFO`. If set to `DEBUG`, debug mode will be enabled, which makes Swagger UI available. +- `CACHE_MAX_SIZE` (int): Maximum number of items in the cache. Default is unlimited. +- `CACHE_TTL` (int): Time in seconds after which a cached item expires. Default is 1 hour. +- `SCAN_FAIL_FAST` (bool): Stop scanning after the first failed check. Default is `False`. +- `SCAN_PROMPT_TIMEOUT` (int): Time in seconds after which a prompt scan will time out. Default is 10 seconds. +- `SCAN_OUTPUT_TIMEOUT` (int): Time in seconds after which an output scan will time out. Default is 30 seconds. +- `APP_PORT` (int): Port to run the API. Default is `8000`. + +### Best practices + +1. Enable `SCAN_FAIL_FAST` to avoid unnecessary scans. +2. Enable `CACHE_MAX_SIZE` and `CACHE_TTL` to cache results and avoid unnecessary scans. +3. Enable authentication and rate limiting to avoid abuse. + +## Observability + +There are built-in environment variables to configure observability: + +- [FastAPI Instrumentation](https://opentelemetry-python-contrib.readthedocs.io/en/latest/instrumentation/fastapi/fastapi.html) +- [OpenTelemetry](https://opentelemetry.io/) + +### Logging + +Logs are written to `stdout` in a structured format, which can be easily parsed by log management systems. + +### Metrics + +The following exporters are available for metrics: + +- **Console (console)**: Logs metrics to `stdout`. +- **Prometheus (prometheus)**: Exposes metrics on the `/metrics` endpoint. +- **OpenTelemetry (otel_http)**: Sends metrics to an OpenTelemetry collector via HTTP endpoint. + +### Tracing + +The following exporters are available for tracing: + +- **Console (console)**: Logs traces to `stdout`. +- **OpenTelemetry (otel_http)**: Sends traces to an OpenTelemetry collector via HTTP endpoint. 
+- **AWS X-Ray (xray)**: Sends traces to OpenTelemetry collector in the AWS X-Ray format. diff --git a/docs/api/reference.md b/docs/api/reference.md new file mode 100644 index 00000000..011e1b8e --- /dev/null +++ b/docs/api/reference.md @@ -0,0 +1,3 @@ +# API Reference + + diff --git a/mkdocs.yml b/mkdocs.yml index 089eaf20..6e99a261 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -100,12 +100,11 @@ nav: - Index: index.md - Installation: get_started/installation.md - Quickstart: get_started/quickstart.md - - Playground: usage/playground.md + - Playground: get_started/playground.md - Attacks: get_started/attacks.md - - Best Practices: usage/best_practices.md - - Optimization: usage/optimization.md + - Best Practices: get_started/best_practices.md + - Optimization: get_started/optimization.md - Usage: - - Deployment as API: usage/api.md - OpenAI SDK: usage/openai.md - Langchain: usage/notebooks/langchain.ipynb - Retrieval-augmented Generation: usage/rag.md @@ -147,6 +146,11 @@ nav: - Sentiment: output_scanners/sentiment.md - Toxicity: output_scanners/toxicity.md - URL Reachability: output_scanners/url_reachability.md + - API: + - Overview: api/overview.md + - Deployment: api/deployment.md + - API Reference: api/reference.md + - Client: api/client.md - Changelog: changelog.md - Customization: - Add scanner: customization/add_scanner.md