
2024 OSPP: Lightweight Deployment of AI Models Based on an AI Gateway #1

1 change: 1 addition & 0 deletions api/BUILD
@@ -78,6 +78,7 @@ proto_library(
"//contrib/envoy/extensions/filters/http/language/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/squash/v3:pkg",
"//contrib/envoy/extensions/filters/http/sxg/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg",
"//contrib/envoy/extensions/filters/network/client_ssl_auth/v3:pkg",
"//contrib/envoy/extensions/filters/network/generic_proxy/action/v3:pkg",
"//contrib/envoy/extensions/filters/network/generic_proxy/codecs/dubbo/v3:pkg",
@@ -0,0 +1,9 @@
# DO NOT EDIT. This file is generated by tools/proto_format/proto_sync.py.

load("@envoy_api//bazel:api_build_system.bzl", "api_proto_package")

licenses(["notice"]) # Apache 2

api_proto_package(
deps = ["@com_github_cncf_udpa//udpa/annotations:pkg"],
)
@@ -0,0 +1,30 @@
syntax = "proto3";

package envoy.extensions.filters.http.llm_inference.v3;

import "udpa/annotations/status.proto";
import "validate/validate.proto";

option java_package = "io.envoyproxy.envoy.extensions.filters.http.llm_inference.v3";
option java_outer_classname = "LlmInferenceProto";
option java_multiple_files = true;
option go_package = "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/http/llm_inference/v3;llm_inferencev3";
option (udpa.annotations.file_status).package_version_status = ACTIVE;

// Filter-level configuration: global inference settings and the models that
// can be loaded.
message modelParameter {
  // Maximum number of threads available to the inference threads.
  int32 n_threads = 1;

  // Maximum number of requests served in parallel.
  int32 n_parallel = 2;

  // Map from chat model name to its local file path.
  map<string, string> chat_modelpath = 3;

  // Map from embedding model name to its local file path.
  map<string, string> embedding_modelpath = 4;
}

// Route-level configuration: the model a route uses and its timeouts.
message modelChosen {
  // Name of the model to use; must match a key in one of the model path maps.
  string usemodel = 1;

  // Timeout for receiving the first byte of the inference response.
  int32 first_byte_timeout = 2;

  // Timeout for the overall inference.
  int32 inference_timeout = 3;
}
1 change: 1 addition & 0 deletions api/versioning/BUILD
@@ -15,6 +15,7 @@ proto_library(
"//contrib/envoy/extensions/config/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/golang/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/language/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg",
"//contrib/envoy/extensions/filters/http/squash/v3:pkg",
"//contrib/envoy/extensions/filters/http/sxg/v3alpha:pkg",
"//contrib/envoy/extensions/filters/network/client_ssl_auth/v3:pkg",
21 changes: 21 additions & 0 deletions bazel/foreign_cc/BUILD
@@ -570,3 +570,24 @@ envoy_cmake(
}),
working_directory = "build/cmake",
)

envoy_cmake(
name = "llama",
cache_entries = {
"CMAKE_INSTALL_LIBDIR": "lib",
"BUILD_SHARED_LIBS": "off",
"CMAKE_BUILD_TYPE": "Release"
},
linkopts = ["-fopenmp"],
lib_source = "@com_github_ggerganov_llama//:all",
out_static_libs = select({
"//conditions:default": [
"libllama.a",
"libggml.a",
],
}),
tags = ["skip_on_windows"],
postfix_script = select({
"//conditions:default": "rm -rf $INSTALLDIR/include/common && mkdir $INSTALLDIR/include/common && cp -rL $EXT_BUILD_ROOT/external/com_github_ggerganov_llama/common/* $INSTALLDIR/include/common",
}),
)
12 changes: 12 additions & 0 deletions bazel/repositories.bzl
@@ -278,6 +278,7 @@ def envoy_dependencies(skip_targets = []):
_com_github_google_libprotobuf_mutator()
_com_github_google_libsxg()
_com_github_google_tcmalloc()
_com_github_ggerganov_llama()
_com_github_gperftools_gperftools()
_com_github_grpc_grpc()
_com_github_unicode_org_icu()
@@ -1238,6 +1239,17 @@ def _com_github_google_tcmalloc():
actual = "@com_github_google_tcmalloc//tcmalloc:malloc_extension",
)

def _com_github_ggerganov_llama():
external_http_archive(
name = "com_github_ggerganov_llama",
build_file_content = BUILD_ALL_CONTENT,
)

native.bind(
name = "llama",
actual = "@envoy//bazel/foreign_cc:llama",
)

def _com_github_gperftools_gperftools():
external_http_archive(
name = "com_github_gperftools_gperftools",
12 changes: 12 additions & 0 deletions bazel/repository_locations.bzl
@@ -358,6 +358,18 @@ REPOSITORY_LOCATIONS_SPEC = dict(
license = "Apache-2.0",
license_url = "https://github.com/google/tcmalloc/blob/{version}/LICENSE",
),
com_github_ggerganov_llama = dict(
project_name = "llama.cpp",
project_desc = "LLM inference in C/C++",
project_url = "https://github.com/ggerganov/llama.cpp",
version = "947538acb8617756a092042ff7e58db18dde05ec",
sha256 = "566ec06009584be8303d5d4b0070ccb0b531695fef3008019e1db97bb7c427c4",
strip_prefix = "llama.cpp-{version}",
urls = ["https://github.com/ggerganov/llama.cpp/archive/{version}.zip"],
use_category = ["dataplane_core"],
release_date = "2024-09-06",
cpe = "N/A",
),
com_github_gperftools_gperftools = dict(
project_name = "gperftools",
project_desc = "tcmalloc and profiling libraries",
1 change: 1 addition & 0 deletions contrib/contrib_build_config.bzl
@@ -10,6 +10,7 @@ CONTRIB_EXTENSIONS = {
"envoy.filters.http.language": "//contrib/language/filters/http/source:config_lib",
"envoy.filters.http.squash": "//contrib/squash/filters/http/source:config",
"envoy.filters.http.sxg": "//contrib/sxg/filters/http/source:config",
"envoy.filters.http.llm_inference": "//contrib/llm_inference/filters/http/source:config",

#
# Upstreams
5 changes: 5 additions & 0 deletions contrib/extensions_metadata.yaml
@@ -28,6 +28,11 @@ envoy.filters.http.sxg:
- envoy.filters.http
security_posture: robust_to_untrusted_downstream
status: alpha
envoy.filters.http.llm_inference:
categories:
- envoy.filters.http
security_posture: requires_trusted_downstream_and_upstream
status: wip
envoy.filters.network.client_ssl_auth:
categories:
- envoy.filters.network
146 changes: 146 additions & 0 deletions contrib/llm_inference/filters/http/README.md
@@ -0,0 +1,146 @@
# Filter Configuration and Usage Guide

## Overview

This project implements an HTTP filter that parses inference requests and hands them to an asynchronous inference thread, passing that thread a callback so that the large-model inference results can be streamed back to the client. This document explains how to configure and use the `filter` and compares its performance with Ollama.
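For intuition, here is a minimal, self-contained C++ sketch of the "asynchronous inference thread plus streaming callback" pattern described above. It is not the filter's actual implementation; every name in it is hypothetical, and real inference is replaced by a stand-in that simply echoes the prompt:

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

// Callback invoked for every generated chunk; end_stream marks the last one.
using StreamCallback = std::function<void(const std::string& chunk, bool end_stream)>;

// Stand-in for the asynchronous inference task: it "generates" chunks one by
// one and pushes each to the caller through the callback.
std::thread runInferenceAsync(std::string prompt, StreamCallback on_chunk) {
  return std::thread([prompt = std::move(prompt), on_chunk]() {
    const std::vector<std::string> chunks = {"Echoing the prompt: ", prompt, " [done]"};
    for (size_t i = 0; i < chunks.size(); ++i) {
      on_chunk(chunks[i], /*end_stream=*/i + 1 == chunks.size());
    }
  });
}

int main() {
  // A filter following this pattern parses the request body into a prompt,
  // then hands it to the inference thread together with a callback that
  // streams each chunk back as it is produced.
  std::thread worker = runInferenceAsync(
      "Hello! Building a website can be done in 10 simple steps:",
      [](const std::string& chunk, bool end_stream) {
        std::cout << chunk << std::flush;  // stream the chunk downstream
        if (end_stream) {
          std::cout << std::endl;
        }
      });
  worker.join();
  return 0;
}
```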

## Configuration

### Configuring the Filter

1. First, set the filter-level configuration in your config file, for example:

```yaml
- name: envoy.filters.http.llm_inference
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.llm_inference.v3.modelParameter
n_threads : 100
n_parallel : 5
chat_modelpath: {
"qwen2": "/home/yuanjq/model/qwen2-7b-instruct-q5_k_m.gguf",
"llama3": "/home/yuanjq/model/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
}
embedding_modelpath: {
"bge": "/home/yuanjq/model/bge-small-zh-v1.5-f32.gguf"
}
```
where:
- `n_threads`: the maximum number of threads available to the inference threads
- `n_parallel`: the maximum number of parallel requests the inference service accepts
- `chat_modelpath`: map of chat model names to their local file paths
- `embedding_modelpath`: map of embedding model names to their local file paths

2. In `route_config`, specify the router-level configuration, i.e. the model the route should use, for example:
```yaml
route_config:
name: route
virtual_hosts:
- name: llm_inference_service
domains: ["api.openai.com"]
routes:
- match:
prefix: "/v1/chat/completions"
typed_per_filter_config:
envoy.filters.http.llm_inference:
"@type": type.googleapis.com/envoy.extensions.filters.http.llm_inference.v3.modelChosen
usemodel: "qwen2"
first_byte_timeout : 4
inference_timeout : 90
direct_response:
status: 504
body:
inline_string: "inference timeout"
```
where:
- `usemodel`: the model to use; the name must match a key configured in the model path maps
- `first_byte_timeout`: the first-byte timeout
- `inference_timeout`: the overall inference timeout

### Updating the Filter

Models can be loaded and unloaded dynamically: add or remove model file paths in `chat_modelpath` / `embedding_modelpath` and push the updated configuration, and the corresponding models are loaded or unloaded on the fly; see the sketch below. Note that after unloading a model, make sure no router-level configuration still references it.
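As an illustration only (the added model name and its path are hypothetical), the filter-level configuration below unloads `llama3` by dropping its entry and loads an additional chat model by adding one; pushing this updated configuration triggers the unload and load:

```yaml
- name: envoy.filters.http.llm_inference
  typed_config:
    "@type": type.googleapis.com/envoy.extensions.filters.http.llm_inference.v3.modelParameter
    n_threads : 100
    n_parallel : 5
    # "llama3" has been removed, so it is unloaded; "qwen2.5" is new, so it is loaded.
    chat_modelpath: {
      "qwen2": "/home/yuanjq/model/qwen2-7b-instruct-q5_k_m.gguf",
      "qwen2.5": "/home/yuanjq/model/qwen2.5-7b-instruct-q3_k_m.gguf"
    }
    embedding_modelpath: {
      "bge": "/home/yuanjq/model/bge-small-zh-v1.5-f32.gguf"
    }
```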


## Usage Notes

1. **Parameter tuning**: adjust `n_threads` and `n_parallel` to your workload to achieve the best performance.
2. **Model selection**: make sure the local model paths are correct, otherwise loading the model will fail; also make sure each model is registered under the correct map depending on whether it is an embedding model.
3. **Concurrency**: make sure the server has enough memory and CPU resources; models are usually several GB in size, and inference is a compute-intensive task involving large amounts of matrix and tensor operations.

## Performance Comparison and Testing

To evaluate the `filter`'s performance, we compare it with Ollama as follows:

### 1. Same Model and Prompt

Run the tests with the same model and the same prompt:

- **Model**: use the same pre-trained model.
  Here we use Alibaba's **qwen2.5-7b-instruct-q3_k_m.gguf** model.
- **Prompt**: run inference on the same input.
  Here we send the same request to both, asking for at most 500 generated tokens:
```
curl http://localhost:10000/v1/chat/completions \
-H "host:api.openai.com" \
-d '{
"model": "qwen2.5",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Hello! Building a website can be done in 10 simple steps:"
}
],
"stream": true,
"n_predict": 500
}'

```
### 2. Concurrency Testing

Run the benchmark at different concurrency levels (1, 4, and 8 concurrent requests), driving the load as sketched below, and record the following metrics:

- **Resource usage**: memory consumption.
- **Response latency**: the response time of each request.
- **Inference latency**: the inference time of each request.

For the 4- and 8-request runs, memory usage and latencies are averaged across requests.
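A minimal sketch of how such a run could be driven with plain `curl` (the endpoint and request body are the ones from the example above; `N` is the concurrency level under test and is the only assumed parameter):

```bash
#!/usr/bin/env bash
# Fire N identical streaming requests in parallel and wait for all of them.
N=${1:-4}
for i in $(seq "$N"); do
  curl -s http://localhost:10000/v1/chat/completions \
    -H "host:api.openai.com" \
    -d '{
      "model": "qwen2.5",
      "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello! Building a website can be done in 10 simple steps:"}
      ],
      "stream": true,
      "n_predict": 500
    }' > "response_$i.txt" &
done
wait
```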

### 3. CPU Cores and Data Collection
- 8 CPU cores are used, i.e. `n_threads = 8`.
- Resource usage is recorded with a monitoring tool (htop).
- Timings are recorded and compared.

### 4. Results
- **Memory usage**

Concurrent requests | LLM Inference Filter | Ollama
-------- | -------- | -----
1 | 7.1 GB | 7.1 GB
4 | 7.2 GB | 7.2 GB
8 | 7.2 GB | 7.2 GB

- **Response latency**

Concurrent requests | LLM Inference Filter | Ollama
-------- | -------- | -----
1 | 2633.20 ms / 34 tokens | 1336.57 ms / 15 tokens
4 | 2873.74 ms / 34 tokens | 2196.26 ms / 15 tokens
8 | 2969.98 ms / 34 tokens | 2077.51 ms / 15 tokens

- **Inference latency**

Concurrent requests | LLM Inference Filter | Ollama
-------- | -------- | -----
1 | 55543.16 ms | 62373.26 ms
4 | 169539.01 ms | 231860.54 ms
8 | 316113.34 ms | 477764.59 ms


## Conclusion

With the steps above you can configure and use the `filter` effectively and compare its performance with Ollama. Feedback and suggestions that help us keep improving the project are welcome.

37 changes: 37 additions & 0 deletions contrib/llm_inference/filters/http/source/BUILD
@@ -0,0 +1,37 @@
load(
"//bazel:envoy_build_system.bzl",
"envoy_cc_contrib_extension",
"envoy_cc_library",
"envoy_contrib_package",
)

licenses(["notice"]) # Apache 2

envoy_contrib_package()

envoy_cc_library(
name = "llm_inference_filter_lib",
srcs = ["llm_inference_filter.cc"],
hdrs = ["llm_inference_filter.h"],
deps = [
"@envoy_api//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg_cc_proto",
"//source/extensions/filters/http/common:pass_through_filter_lib",
"//contrib/llm_inference/filters/http/source/inference:inference",
"//source/common/http:header_map_lib",
"//source/common/http:header_utility_lib",
"//source/common/http:headers_lib",
"//source/common/protobuf:utility_lib",
],
)

envoy_cc_contrib_extension(
name = "config",
srcs = ["config.cc"],
hdrs = ["config.h"],
deps = [
":llm_inference_filter_lib",
"//envoy/registry",
"//source/extensions/filters/http/common:factory_base_lib",
"@envoy_api//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg_cc_proto",
],
)