2024 OSPP: Lightweight Deployment of AI Models Based on an AI Gateway #1

1 change: 1 addition & 0 deletions api/BUILD
@@ -78,6 +78,7 @@ proto_library(
"//contrib/envoy/extensions/filters/http/language/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/squash/v3:pkg",
"//contrib/envoy/extensions/filters/http/sxg/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg",
"//contrib/envoy/extensions/filters/network/client_ssl_auth/v3:pkg",
"//contrib/envoy/extensions/filters/network/generic_proxy/action/v3:pkg",
"//contrib/envoy/extensions/filters/network/generic_proxy/codecs/dubbo/v3:pkg",
9 changes: 9 additions & 0 deletions api/contrib/envoy/extensions/filters/http/llm_inference/v3/BUILD
@@ -0,0 +1,9 @@
# DO NOT EDIT. This file is generated by tools/proto_format/proto_sync.py.

load("@envoy_api//bazel:api_build_system.bzl", "api_proto_package")

licenses(["notice"]) # Apache 2

api_proto_package(
    deps = ["@com_github_cncf_udpa//udpa/annotations:pkg"],
)
30 changes: 30 additions & 0 deletions api/contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.proto
@@ -0,0 +1,30 @@
syntax = "proto3";

package envoy.extensions.filters.http.llm_inference.v3;

import "udpa/annotations/status.proto";
import "validate/validate.proto";

option java_package = "io.envoyproxy.envoy.extensions.filters.http.llm_inference.v3";
option java_outer_classname = "LlmInferenceProto";
option java_multiple_files = true;
option go_package = "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/http/llm_inference/v3;llm_inferencev3";
option (udpa.annotations.file_status).package_version_status = ACTIVE;

message modelParameter {
  int32 n_threads = 1;

  int32 n_parallel = 2;

  map<string, string> chat_modelpath = 3;

  map<string, string> embedding_modelpath = 4;
}

message modelChosen {
  string usemodel = 1;

  int32 first_byte_timeout = 2;

  int32 inference_timeout = 3;
}
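For orientation, here is a minimal sketch of how these generated messages could be populated from C++. The accessors follow standard protobuf codegen for the fields above; the model names and file paths are placeholders, and treating the timeouts as seconds is an assumption since the proto does not declare units.

#include "contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.pb.h"

namespace llm = envoy::extensions::filters::http::llm_inference::v3;

// Listener-level settings: inference thread count, parallel sequences, and
// model-name -> GGUF-path maps for chat and embedding models.
llm::modelParameter makeModelParameter() {
  llm::modelParameter params;
  params.set_n_threads(8);
  params.set_n_parallel(2);
  (*params.mutable_chat_modelpath())["qwen2"] = "/models/qwen2-7b-instruct.gguf"; // placeholder path
  (*params.mutable_embedding_modelpath())["bge"] = "/models/bge-small-en.gguf";   // placeholder path
  return params;
}

// Per-route settings: which loaded model a route should use, plus timeouts.
llm::modelChosen makeModelChosen() {
  llm::modelChosen chosen;
  chosen.set_usemodel("qwen2");
  chosen.set_first_byte_timeout(5);  // assumed to be seconds
  chosen.set_inference_timeout(90);  // assumed to be seconds
  return chosen;
}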
1 change: 1 addition & 0 deletions api/versioning/BUILD
@@ -15,6 +15,7 @@ proto_library(
"//contrib/envoy/extensions/config/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/golang/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/language/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg",
"//contrib/envoy/extensions/filters/http/squash/v3:pkg",
"//contrib/envoy/extensions/filters/http/sxg/v3alpha:pkg",
"//contrib/envoy/extensions/filters/network/client_ssl_auth/v3:pkg",
20 changes: 20 additions & 0 deletions bazel/foreign_cc/BUILD
@@ -570,3 +570,23 @@ envoy_cmake(
    }),
    working_directory = "build/cmake",
)

envoy_cmake(
    name = "llama",
    cache_entries = {
        "CMAKE_INSTALL_LIBDIR": "lib",
        "BUILD_SHARED_LIBS": "off",
        "GGML_OPENMP": "off",
    },
    lib_source = "@com_github_ggerganov_llama//:all",
    out_static_libs = select({
        "//conditions:default": [
            "libllama.a",
            "libggml.a",
        ],
    }),
    tags = ["skip_on_windows"],
    postfix_script = select({
        "//conditions:default": "rm -rf $INSTALLDIR/include/common && mkdir $INSTALLDIR/include/common && cp -rL $EXT_BUILD_ROOT/external/com_github_ggerganov_llama/common/* $INSTALLDIR/include/common",
    }),
)
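The rule above builds libllama.a and libggml.a from llama.cpp's sources. As a reference point for what the inference library ultimately links against, here is a minimal, self-contained sketch using llama.cpp's C API as it stood around the pinned commit; the model path is a placeholder, and exact signatures can differ between llama.cpp revisions.

#include "llama.h"

int main() {
  llama_backend_init();
  llama_model_params mparams = llama_model_default_params();
  llama_model* model = llama_load_model_from_file("/models/qwen2-7b-instruct.gguf", mparams);
  if (model == nullptr) {
    return 1; // model failed to load
  }
  llama_context_params cparams = llama_context_default_params();
  cparams.n_threads = 8; // mirrors the filter's n_threads knob
  llama_context* ctx = llama_new_context_with_model(model, cparams);
  // ... tokenize, decode, and sample here ...
  llama_free(ctx);
  llama_free_model(model);
  llama_backend_free();
  return 0;
}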
12 changes: 12 additions & 0 deletions bazel/repositories.bzl
@@ -278,6 +278,7 @@ def envoy_dependencies(skip_targets = []):
    _com_github_google_libprotobuf_mutator()
    _com_github_google_libsxg()
    _com_github_google_tcmalloc()
    _com_github_ggerganov_llama()
    _com_github_gperftools_gperftools()
    _com_github_grpc_grpc()
    _com_github_unicode_org_icu()
@@ -1238,6 +1239,17 @@ def _com_github_google_tcmalloc():
        actual = "@com_github_google_tcmalloc//tcmalloc:malloc_extension",
    )

def _com_github_ggerganov_llama():
    external_http_archive(
        name = "com_github_ggerganov_llama",
        build_file_content = BUILD_ALL_CONTENT,
    )

    native.bind(
        name = "llama",
        actual = "@envoy//bazel/foreign_cc:llama",
    )

def _com_github_gperftools_gperftools():
    external_http_archive(
        name = "com_github_gperftools_gperftools",
12 changes: 12 additions & 0 deletions bazel/repository_locations.bzl
@@ -358,6 +358,18 @@ REPOSITORY_LOCATIONS_SPEC = dict(
license = "Apache-2.0",
license_url = "https://github.com/google/tcmalloc/blob/{version}/LICENSE",
),
com_github_ggerganov_llama = dict(
project_name = "llama.cpp",
project_desc = "LLM inference in C/C++",
project_url = "https://github.com/ggerganov/llama.cpp",
version = "a07c32ea54850c989f0ef6989da5b955b77b7172",
sha256 = "4a5aaa9f4329dc5364ff6e4eea9ee977adce24051f5a6ba099faaaaa57a47149",
strip_prefix = "llama.cpp-{version}",
urls = ["https://github.com/ggerganov/llama.cpp/archive/{version}.zip"],
use_category = ["dataplane_core"],
release_date = "2024-08-23",
cpe = "N/A",
),
com_github_gperftools_gperftools = dict(
project_name = "gperftools",
project_desc = "tcmalloc and profiling libraries",
1 change: 1 addition & 0 deletions contrib/contrib_build_config.bzl
@@ -10,6 +10,7 @@ CONTRIB_EXTENSIONS = {
"envoy.filters.http.language": "//contrib/language/filters/http/source:config_lib",
"envoy.filters.http.squash": "//contrib/squash/filters/http/source:config",
"envoy.filters.http.sxg": "//contrib/sxg/filters/http/source:config",
"envoy.filters.http.llm_inference": "//contrib/llm_inference/filters/http/source:config",

#
# Upstreams
5 changes: 5 additions & 0 deletions contrib/extensions_metadata.yaml
@@ -28,6 +28,11 @@ envoy.filters.http.sxg:
  - envoy.filters.http
  security_posture: robust_to_untrusted_downstream
  status: alpha
envoy.filters.http.llm_inference:
  categories:
  - envoy.filters.http
  security_posture: requires_trusted_downstream_and_upstream
  status: wip
envoy.filters.network.client_ssl_auth:
  categories:
  - envoy.filters.network
37 changes: 37 additions & 0 deletions contrib/llm_inference/filters/http/source/BUILD
@@ -0,0 +1,37 @@
load(
    "//bazel:envoy_build_system.bzl",
    "envoy_cc_contrib_extension",
    "envoy_cc_library",
    "envoy_contrib_package",
)

licenses(["notice"])  # Apache 2

envoy_contrib_package()

envoy_cc_library(
    name = "llm_inference_filter_lib",
    srcs = ["llm_inference_filter.cc"],
    hdrs = ["llm_inference_filter.h"],
    deps = [
        "@envoy_api//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg_cc_proto",
        "//source/extensions/filters/http/common:pass_through_filter_lib",
        "//contrib/llm_inference/filters/http/source/inference:inference",
        "//source/common/http:header_map_lib",
        "//source/common/http:header_utility_lib",
        "//source/common/http:headers_lib",
        "//source/common/protobuf:utility_lib",
    ],
)

envoy_cc_contrib_extension(
    name = "config",
    srcs = ["config.cc"],
    hdrs = ["config.h"],
    deps = [
        ":llm_inference_filter_lib",
        "//envoy/registry",
        "//source/extensions/filters/http/common:factory_base_lib",
        "@envoy_api//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg_cc_proto",
    ],
)
89 changes: 89 additions & 0 deletions contrib/llm_inference/filters/http/source/config.cc
@@ -0,0 +1,89 @@
#include "contrib/llm_inference/filters/http/source/config.h"

#include "contrib/llm_inference/filters/http/source/llm_inference_filter.h"
#include <string>

namespace Envoy {
namespace Extensions {
namespace HttpFilters {
namespace LLMInference {

class InferenceSingleton : public Envoy::Singleton::Instance {
public:
  InferenceSingleton(Thread::ThreadFactory& thread_factory)
      : inference_thread_(thread_factory) {}

  // Returns a process-wide InferenceContext for the given model, keyed by model
  // name plus thread settings. Contexts are held as weak_ptrs so they are freed
  // once no filter config references them.
  std::shared_ptr<InferenceContext> load(std::shared_ptr<InferenceSingleton> singleton,
                                         const ModelParameter& model_parameter,
                                         const std::string& model_name,
                                         const std::string& model_path, bool embedding) {
    std::shared_ptr<InferenceContext> ctx;
    std::string model = model_name + " " + std::to_string(model_parameter.n_threads) + " " +
                        std::to_string(model_parameter.n_parallel);
    auto it = ctx_.find(model);
    if (it != ctx_.end()) {
      ctx = it->second.lock();
    }
    if (!ctx) {
      ctx = std::make_shared<InferenceContext>(singleton, inference_thread_, model_parameter,
                                               model_name, model_path, embedding);
      // Record the new context so later lookups share it while it is alive;
      // without this insertion the cache above can never hit.
      ctx_[model] = ctx;
    }
    return ctx;
  }

private:
  InferenceThread inference_thread_;
  absl::flat_hash_map<std::string, std::weak_ptr<InferenceContext>> ctx_;
};

SINGLETON_MANAGER_REGISTRATION(http_inference_singleton);

Http::FilterFactoryCb LLMInferenceFilterConfigFactory::createFilterFactoryFromProtoTyped(
    const envoy::extensions::filters::http::llm_inference::v3::modelParameter& proto_config,
    const std::string&, Server::Configuration::FactoryContext& context) {

  LLMInferenceFilterConfigSharedPtr config =
      std::make_shared<LLMInferenceFilterConfig>(LLMInferenceFilterConfig(proto_config));

  // Fetch the process-wide inference singleton, creating it on first use.
  std::shared_ptr<InferenceSingleton> inference =
      context.singletonManager().getTyped<InferenceSingleton>(
          SINGLETON_MANAGER_REGISTERED_NAME(http_inference_singleton), [&context] {
            return std::make_shared<InferenceSingleton>(context.api().threadFactory());
          });

  // Load every configured chat and embedding model at config time so requests
  // never pay the model-load cost.
  absl::flat_hash_map<std::string, InferenceContextSharedPtr> ctx;

  auto chat_modelpath = config->chatModelPath();
  for (auto& model : chat_modelpath) {
    ctx[model.first] = inference->load(inference, config->modelParameter(), model.first, model.second, false);
  }

  auto embedding_modelpath = config->embeddingModelPath();
  for (auto& model : embedding_modelpath) {
    ctx[model.first] = inference->load(inference, config->modelParameter(), model.first, model.second, true);
  }

  InferenceContextHashMapSharedPtr ctx_map =
      std::make_shared<absl::flat_hash_map<std::string, InferenceContextSharedPtr>>(ctx);

  return [config, ctx_map](Http::FilterChainFactoryCallbacks& callbacks) -> void {
    callbacks.addStreamDecoderFilter(std::make_shared<LLMInferenceFilter>(config, ctx_map));
  };
}

Router::RouteSpecificFilterConfigConstSharedPtr
LLMInferenceFilterConfigFactory::createRouteSpecificFilterConfigTyped(
    const envoy::extensions::filters::http::llm_inference::v3::modelChosen& proto_config,
    Server::Configuration::ServerFactoryContext&, ProtobufMessage::ValidationVisitor&) {
  LLMInferenceFilterConfigPerRouteSharedPtr config =
      std::make_shared<LLMInferenceFilterConfigPerRoute>(LLMInferenceFilterConfigPerRoute(proto_config));

  return config;
}

/**
* Static registration for this llm inference filter. @see RegisterFactory.
*/
REGISTER_FACTORY(LLMInferenceFilterConfigFactory, Server::Configuration::NamedHttpFilterConfigFactory);

} // namespace LLMInference
} // namespace HttpFilters
} // namespace Extensions
} // namespace Envoy
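The per-route modelChosen registered above would typically be consumed from the filter's decode path. The filter source itself is not part of this hunk, so the following fragment is only a sketch of the conventional Envoy lookup: resolveMostSpecificPerFilterConfig is the standard helper, while useModel() is a hypothetical accessor on LLMInferenceFilterConfigPerRoute.

// Sketch only: inside LLMInferenceFilter::decodeHeaders(), resolve the
// route-level override, falling back to listener-level behavior if absent.
const auto* per_route =
    Http::Utility::resolveMostSpecificPerFilterConfig<LLMInferenceFilterConfigPerRoute>(
        decoder_callbacks_);
if (per_route != nullptr) {
  const std::string& model = per_route->useModel(); // hypothetical accessor
  // Look up the shared InferenceContext for `model` in ctx_map and hand the
  // request body to it for inference.
}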
37 changes: 37 additions & 0 deletions contrib/llm_inference/filters/http/source/config.h
@@ -0,0 +1,37 @@
#pragma once

#include "contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.pb.h"
#include "contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.pb.validate.h"

#include "source/extensions/filters/http/common/factory_base.h"

namespace Envoy {
namespace Extensions {
namespace HttpFilters {
namespace LLMInference {

/**
* Config registration for the inference filter. @see NamedHttpFilterConfigFactory.
*/
class LLMInferenceFilterConfigFactory
    : public Common::FactoryBase<
          envoy::extensions::filters::http::llm_inference::v3::modelParameter,
          envoy::extensions::filters::http::llm_inference::v3::modelChosen> {
public:
  LLMInferenceFilterConfigFactory() : FactoryBase("envoy.filters.http.llm_inference") {}

private:
  Http::FilterFactoryCb createFilterFactoryFromProtoTyped(
      const envoy::extensions::filters::http::llm_inference::v3::modelParameter& proto_config,
      const std::string&, Server::Configuration::FactoryContext&) override;

  Router::RouteSpecificFilterConfigConstSharedPtr createRouteSpecificFilterConfigTyped(
      const envoy::extensions::filters::http::llm_inference::v3::modelChosen& proto_config,
      Server::Configuration::ServerFactoryContext&, ProtobufMessage::ValidationVisitor&) override;
};

} // namespace LLMInference
} // namespace HttpFilters
} // namespace Extensions
} // namespace Envoy
27 changes: 27 additions & 0 deletions contrib/llm_inference/filters/http/source/inference/BUILD
@@ -0,0 +1,27 @@
load(
    "@envoy//bazel:envoy_build_system.bzl",
    "envoy_cc_library",
)

licenses(["notice"])  # Apache 2

envoy_cc_library(
    name = "inference",
    srcs = [
        "inference_context.cc",
        "inference_task.cc",
        "inference_thread.cc",
    ],
    hdrs = [
        "inference_context.h",
        "inference_task.h",
        "inference_thread.h",
        "utils.hpp",
    ],
    deps = [
        "//source/extensions/filters/http/common:factory_base_lib",
        "@com_google_absl//absl/base",
    ],
    visibility = ["//visibility:public"],
    external_deps = ["llama"],
)