
2024 OSPP: Lightweight Deployment of AI Models Based on an AI Gateway #1

1 change: 1 addition & 0 deletions api/BUILD
@@ -78,6 +78,7 @@ proto_library(
"//contrib/envoy/extensions/filters/http/language/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/squash/v3:pkg",
"//contrib/envoy/extensions/filters/http/sxg/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg",
"//contrib/envoy/extensions/filters/network/client_ssl_auth/v3:pkg",
"//contrib/envoy/extensions/filters/network/generic_proxy/action/v3:pkg",
"//contrib/envoy/extensions/filters/network/generic_proxy/codecs/dubbo/v3:pkg",
@@ -0,0 +1,9 @@
# DO NOT EDIT. This file is generated by tools/proto_format/proto_sync.py.

load("@envoy_api//bazel:api_build_system.bzl", "api_proto_package")

licenses(["notice"]) # Apache 2

api_proto_package(
deps = ["@com_github_cncf_udpa//udpa/annotations:pkg"],
)
@@ -0,0 +1,30 @@
syntax = "proto3";

package envoy.extensions.filters.http.llm_inference.v3;

import "udpa/annotations/status.proto";
import "validate/validate.proto";

option java_package = "io.envoyproxy.envoy.extensions.filters.http.llm_inference.v3";
option java_outer_classname = "LlmInferenceProto";
option java_multiple_files = true;
option go_package = "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/http/llm_inference/v3;llm_inferencev3";
option (udpa.annotations.file_status).package_version_status = ACTIVE;

// Filter-level configuration: global inference settings and the models that
// can be loaded.
message modelParameter {
  // Maximum number of threads available to the inference threads.
  int32 n_threads = 1;

  // Maximum number of requests served in parallel.
  int32 n_parallel = 2;

  // Map from chat model name to its local file path.
  map<string, string> chat_modelpath = 3;

  // Map from embedding model name to its local file path.
  map<string, string> embedding_modelpath = 4;
}

// Route-level configuration: the model a route uses and its timeouts.
message modelChosen {
  // Name of the model to use; must match a key in one of the model path maps.
  string usemodel = 1;

  // Timeout for receiving the first byte of the inference response.
  int32 first_byte_timeout = 2;

  // Timeout for the overall inference.
  int32 inference_timeout = 3;
}
1 change: 1 addition & 0 deletions api/versioning/BUILD
@@ -15,6 +15,7 @@ proto_library(
"//contrib/envoy/extensions/config/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/golang/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/language/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg",
"//contrib/envoy/extensions/filters/http/squash/v3:pkg",
"//contrib/envoy/extensions/filters/http/sxg/v3alpha:pkg",
"//contrib/envoy/extensions/filters/network/client_ssl_auth/v3:pkg",
21 changes: 21 additions & 0 deletions bazel/foreign_cc/BUILD
@@ -570,3 +570,24 @@ envoy_cmake(
}),
working_directory = "build/cmake",
)

envoy_cmake(
name = "llama",
cache_entries = {
"CMAKE_INSTALL_LIBDIR": "lib",
"BUILD_SHARED_LIBS": "off",
"CMAKE_BUILD_TYPE": "Release"
},
linkopts = ["-fopenmp"],
lib_source = "@com_github_ggerganov_llama//:all",
out_static_libs = select({
"//conditions:default": [
"libllama.a",
"libggml.a",
],
}),
tags = ["skip_on_windows"],
postfix_script = select({
"//conditions:default": "rm -rf $INSTALLDIR/include/common && mkdir $INSTALLDIR/include/common && cp -rL $EXT_BUILD_ROOT/external/com_github_ggerganov_llama/common/* $INSTALLDIR/include/common",
}),
)
12 changes: 12 additions & 0 deletions bazel/repositories.bzl
@@ -278,6 +278,7 @@ def envoy_dependencies(skip_targets = []):
_com_github_google_libprotobuf_mutator()
_com_github_google_libsxg()
_com_github_google_tcmalloc()
_com_github_ggerganov_llama()
_com_github_gperftools_gperftools()
_com_github_grpc_grpc()
_com_github_unicode_org_icu()
@@ -1238,6 +1239,17 @@ def _com_github_google_tcmalloc():
actual = "@com_github_google_tcmalloc//tcmalloc:malloc_extension",
)

def _com_github_ggerganov_llama():
external_http_archive(
name = "com_github_ggerganov_llama",
build_file_content = BUILD_ALL_CONTENT,
)

native.bind(
name = "llama",
actual = "@envoy//bazel/foreign_cc:llama",
)

def _com_github_gperftools_gperftools():
external_http_archive(
name = "com_github_gperftools_gperftools",
12 changes: 12 additions & 0 deletions bazel/repository_locations.bzl
@@ -358,6 +358,18 @@ REPOSITORY_LOCATIONS_SPEC = dict(
license = "Apache-2.0",
license_url = "https://github.com/google/tcmalloc/blob/{version}/LICENSE",
),
com_github_ggerganov_llama = dict(
project_name = "llama.cpp",
project_desc = "LLM inference in C/C++",
project_url = "https://github.com/ggerganov/llama.cpp",
version = "947538acb8617756a092042ff7e58db18dde05ec",
sha256 = "566ec06009584be8303d5d4b0070ccb0b531695fef3008019e1db97bb7c427c4",
strip_prefix = "llama.cpp-{version}",
urls = ["https://github.com/ggerganov/llama.cpp/archive/{version}.zip"],
use_category = ["dataplane_core"],
release_date = "2024-09-06",
cpe = "N/A",
),
com_github_gperftools_gperftools = dict(
project_name = "gperftools",
project_desc = "tcmalloc and profiling libraries",
1 change: 1 addition & 0 deletions contrib/contrib_build_config.bzl
@@ -10,6 +10,7 @@ CONTRIB_EXTENSIONS = {
"envoy.filters.http.language": "//contrib/language/filters/http/source:config_lib",
"envoy.filters.http.squash": "//contrib/squash/filters/http/source:config",
"envoy.filters.http.sxg": "//contrib/sxg/filters/http/source:config",
"envoy.filters.http.llm_inference": "//contrib/llm_inference/filters/http/source:config",

#
# Upstreams
5 changes: 5 additions & 0 deletions contrib/extensions_metadata.yaml
@@ -28,6 +28,11 @@ envoy.filters.http.sxg:
- envoy.filters.http
security_posture: robust_to_untrusted_downstream
status: alpha
envoy.filters.http.llm_inference:
categories:
- envoy.filters.http
security_posture: requires_trusted_downstream_and_upstream
status: wip
envoy.filters.network.client_ssl_auth:
categories:
- envoy.filters.network
146 changes: 146 additions & 0 deletions contrib/llm_inference/filters/http/README.md
@@ -0,0 +1,146 @@
# Filter Configuration and Usage Guide

## Overview

This project implements an HTTP filter that parses inference requests and hands them to an asynchronous inference thread, passing that thread a callback so that the large-model inference results can be streamed back to the client. This document explains how to configure and use the `filter` and compares its performance with Ollama.
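For intuition, here is a minimal, self-contained C++ sketch of the "asynchronous inference thread plus streaming callback" pattern described above. It is not the filter's actual implementation; every name in it is hypothetical, and real inference is replaced by a stand-in that simply echoes the prompt:

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

// Callback invoked for every generated chunk; end_stream marks the last one.
using StreamCallback = std::function<void(const std::string& chunk, bool end_stream)>;

// Stand-in for the asynchronous inference task: it "generates" chunks one by
// one and pushes each to the caller through the callback.
std::thread runInferenceAsync(std::string prompt, StreamCallback on_chunk) {
  return std::thread([prompt = std::move(prompt), on_chunk]() {
    const std::vector<std::string> chunks = {"Echoing the prompt: ", prompt, " [done]"};
    for (size_t i = 0; i < chunks.size(); ++i) {
      on_chunk(chunks[i], /*end_stream=*/i + 1 == chunks.size());
    }
  });
}

int main() {
  // A filter following this pattern parses the request body into a prompt,
  // then hands it to the inference thread together with a callback that
  // streams each chunk back as it is produced.
  std::thread worker = runInferenceAsync(
      "Hello! Building a website can be done in 10 simple steps:",
      [](const std::string& chunk, bool end_stream) {
        std::cout << chunk << std::flush;  // stream the chunk downstream
        if (end_stream) {
          std::cout << std::endl;
        }
      });
  worker.join();
  return 0;
}
```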

## Configuration

### Configuring the Filter

1. First, set the filter-level configuration in your config file, for example:

```yaml
- name: envoy.filters.http.llm_inference
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.llm_inference.v3.modelParameter
n_threads : 100
n_parallel : 5
chat_modelpath: {
"qwen2": "/home/yuanjq/model/qwen2-7b-instruct-q5_k_m.gguf",
"llama3": "/home/yuanjq/model/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
}
embedding_modelpath: {
"bge": "/home/yuanjq/model/bge-small-zh-v1.5-f32.gguf"
}
```
where:
- `n_threads`: the maximum number of threads available to the inference threads
- `n_parallel`: the maximum number of parallel requests the inference service accepts
- `chat_modelpath`: map of chat model names to their local file paths
- `embedding_modelpath`: map of embedding model names to their local file paths

2. In `route_config`, specify the router-level configuration, i.e. the model the route should use, for example:
```yaml
route_config:
name: route
virtual_hosts:
- name: llm_inference_service
domains: ["api.openai.com"]
routes:
- match:
prefix: "/v1/chat/completions"
typed_per_filter_config:
envoy.filters.http.llm_inference:
"@type": type.googleapis.com/envoy.extensions.filters.http.llm_inference.v3.modelChosen
usemodel: "qwen2"
first_byte_timeout : 4
inference_timeout : 90
direct_response:
status: 504
body:
inline_string: "inference timeout"
```
where:
- `usemodel`: the model to use; the name must match a key configured in the model path maps
- `first_byte_timeout`: the first-byte timeout
- `inference_timeout`: the overall inference timeout

### Updating the Filter

Models can be loaded and unloaded dynamically: add or remove model file paths in `chat_modelpath` / `embedding_modelpath` and push the updated configuration, and the corresponding models are loaded or unloaded on the fly; see the sketch below. Note that after unloading a model, make sure no router-level configuration still references it.
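As an illustration only (the added model name and its path are hypothetical), the filter-level configuration below unloads `llama3` by dropping its entry and loads an additional chat model by adding one; pushing this updated configuration triggers the unload and load:

```yaml
- name: envoy.filters.http.llm_inference
  typed_config:
    "@type": type.googleapis.com/envoy.extensions.filters.http.llm_inference.v3.modelParameter
    n_threads : 100
    n_parallel : 5
    # "llama3" has been removed, so it is unloaded; "qwen2.5" is new, so it is loaded.
    chat_modelpath: {
      "qwen2": "/home/yuanjq/model/qwen2-7b-instruct-q5_k_m.gguf",
      "qwen2.5": "/home/yuanjq/model/qwen2.5-7b-instruct-q3_k_m.gguf"
    }
    embedding_modelpath: {
      "bge": "/home/yuanjq/model/bge-small-zh-v1.5-f32.gguf"
    }
```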


## Usage Notes

1. **Parameter tuning**: adjust `n_threads` and `n_parallel` to your workload to achieve the best performance.
2. **Model selection**: make sure the local model paths are correct, otherwise loading the model will fail; also make sure each model is registered under the correct map depending on whether it is an embedding model.
3. **Concurrency**: make sure the server has enough memory and CPU resources; models are usually several GB in size, and inference is a compute-intensive task involving large amounts of matrix and tensor operations.

## Performance Comparison and Testing

To evaluate the `filter`'s performance, we compare it with Ollama as follows:

### 1. Same Model and Prompt

Run the tests with the same model and the same prompt:

- **Model**: use the same pre-trained model.
  Here we use Alibaba's **qwen2.5-7b-instruct-q3_k_m.gguf** model.
- **Prompt**: run inference on the same input.
  Here we send the same request to both, asking for at most 500 generated tokens:
```
curl http://localhost:10000/v1/chat/completions \
-H "host:api.openai.com" \
-d '{
"model": "qwen2.5",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Hello! Building a website can be done in 10 simple steps:"
}
],
"stream": true,
"n_predict": 500
}'

```
### 2. Concurrency Testing

Run the benchmark at different concurrency levels (1, 4, and 8 concurrent requests), driving the load as sketched below, and record the following metrics:

- **Resource usage**: memory consumption.
- **Response latency**: the response time of each request.
- **Inference latency**: the inference time of each request.

For the 4- and 8-request runs, memory usage and latencies are averaged across requests.
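A minimal sketch of how such a run could be driven with plain `curl` (the endpoint and request body are the ones from the example above; `N` is the concurrency level under test and is the only assumed parameter):

```bash
#!/usr/bin/env bash
# Fire N identical streaming requests in parallel and wait for all of them.
N=${1:-4}
for i in $(seq "$N"); do
  curl -s http://localhost:10000/v1/chat/completions \
    -H "host:api.openai.com" \
    -d '{
      "model": "qwen2.5",
      "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello! Building a website can be done in 10 simple steps:"}
      ],
      "stream": true,
      "n_predict": 500
    }' > "response_$i.txt" &
done
wait
```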

### 3. CPU Cores and Data Collection
- 8 CPU cores are used, i.e. `n_threads = 8`.
- Resource usage is recorded with a monitoring tool (htop).
- Timings are recorded and compared.

### 4. Results
- **Memory usage**

Concurrent requests | LLM Inference Filter | Ollama
-------- | -------- | -----
1 | 7.1 GB | 7.1 GB
4 | 7.2 GB | 7.2 GB
8 | 7.2 GB | 7.2 GB

- **Response latency**

Concurrent requests | LLM Inference Filter | Ollama
-------- | -------- | -----
1 | 2633.20 ms / 34 tokens | 1336.57 ms / 15 tokens
4 | 2873.74 ms / 34 tokens | 2196.26 ms / 15 tokens
8 | 2969.98 ms / 34 tokens | 2077.51 ms / 15 tokens

- **Inference latency**

Concurrent requests | LLM Inference Filter | Ollama
-------- | -------- | -----
1 | 55543.16 ms | 62373.26 ms
4 | 169539.01 ms | 231860.54 ms
8 | 316113.34 ms | 477764.59 ms


## Conclusion

With the steps above you can configure and use the `filter` effectively and compare its performance with Ollama. Feedback and suggestions that help us keep improving the project are welcome.

37 changes: 37 additions & 0 deletions contrib/llm_inference/filters/http/source/BUILD
@@ -0,0 +1,37 @@
load(
"//bazel:envoy_build_system.bzl",
"envoy_cc_contrib_extension",
"envoy_cc_library",
"envoy_contrib_package",
)

licenses(["notice"]) # Apache 2

envoy_contrib_package()

envoy_cc_library(
name = "llm_inference_filter_lib",
srcs = ["llm_inference_filter.cc"],
hdrs = ["llm_inference_filter.h"],
deps = [
"@envoy_api//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg_cc_proto",
"//source/extensions/filters/http/common:pass_through_filter_lib",
"//contrib/llm_inference/filters/http/source/inference:inference",
"//source/common/http:header_map_lib",
"//source/common/http:header_utility_lib",
"//source/common/http:headers_lib",
"//source/common/protobuf:utility_lib",
],
)

envoy_cc_contrib_extension(
name = "config",
srcs = ["config.cc"],
hdrs = ["config.h"],
deps = [
":llm_inference_filter_lib",
"//envoy/registry",
"//source/extensions/filters/http/common:factory_base_lib",
"@envoy_api//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg_cc_proto",
],
)