2024 OSPP: Lightweight Deployment of AI Models Based on an AI Gateway #1

1 change: 1 addition & 0 deletions api/BUILD
@@ -78,6 +78,7 @@ proto_library(
"//contrib/envoy/extensions/filters/http/language/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/squash/v3:pkg",
"//contrib/envoy/extensions/filters/http/sxg/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg",
"//contrib/envoy/extensions/filters/network/client_ssl_auth/v3:pkg",
"//contrib/envoy/extensions/filters/network/generic_proxy/action/v3:pkg",
"//contrib/envoy/extensions/filters/network/generic_proxy/codecs/dubbo/v3:pkg",
9 changes: 9 additions & 0 deletions api/contrib/envoy/extensions/filters/http/llm_inference/v3/BUILD
@@ -0,0 +1,9 @@
# DO NOT EDIT. This file is generated by tools/proto_format/proto_sync.py.

load("@envoy_api//bazel:api_build_system.bzl", "api_proto_package")

licenses(["notice"]) # Apache 2

api_proto_package(
    deps = ["@com_github_cncf_udpa//udpa/annotations:pkg"],
)
30 changes: 30 additions & 0 deletions api/contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.proto
@@ -0,0 +1,30 @@
syntax = "proto3";

package envoy.extensions.filters.http.llm_inference.v3;

import "udpa/annotations/status.proto";
import "validate/validate.proto";

option java_package = "io.envoyproxy.envoy.extensions.filters.http.llm_inference.v3";
option java_outer_classname = "LlmInferenceProto";
option java_multiple_files = true;
option go_package = "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/http/llm_inference/v3;llm_inferencev3";
option (udpa.annotations.file_status).package_version_status = ACTIVE;

message modelParameter {
  int32 n_threads = 1;

  int32 n_parallel = 2;

  map<string, string> chat_modelpath = 3;

  map<string, string> embedding_modelpath = 4;
}

message modelChosen {
  string usemodel = 1;

  int32 first_byte_timeout = 2;

  int32 inference_timeout = 3;
}
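For orientation, here is a minimal sketch of how these generated messages could be populated from C++. The accessors follow standard protobuf codegen for the fields above; the model names and file paths are placeholders, and treating the timeouts as seconds is an assumption since the proto does not declare units.

#include "contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.pb.h"

namespace llm = envoy::extensions::filters::http::llm_inference::v3;

// Listener-level settings: inference thread count, parallel sequences, and
// model-name -> GGUF-path maps for chat and embedding models.
llm::modelParameter makeModelParameter() {
  llm::modelParameter params;
  params.set_n_threads(8);
  params.set_n_parallel(2);
  (*params.mutable_chat_modelpath())["qwen2"] = "/models/qwen2-7b-instruct.gguf"; // placeholder path
  (*params.mutable_embedding_modelpath())["bge"] = "/models/bge-small-en.gguf";   // placeholder path
  return params;
}

// Per-route settings: which loaded model a route should use, plus timeouts.
llm::modelChosen makeModelChosen() {
  llm::modelChosen chosen;
  chosen.set_usemodel("qwen2");
  chosen.set_first_byte_timeout(5);  // assumed to be seconds
  chosen.set_inference_timeout(90);  // assumed to be seconds
  return chosen;
}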
1 change: 1 addition & 0 deletions api/versioning/BUILD
@@ -15,6 +15,7 @@ proto_library(
"//contrib/envoy/extensions/config/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/golang/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/language/v3alpha:pkg",
"//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg",
"//contrib/envoy/extensions/filters/http/squash/v3:pkg",
"//contrib/envoy/extensions/filters/http/sxg/v3alpha:pkg",
"//contrib/envoy/extensions/filters/network/client_ssl_auth/v3:pkg",
20 changes: 20 additions & 0 deletions bazel/foreign_cc/BUILD
@@ -570,3 +570,23 @@ envoy_cmake(
    }),
    working_directory = "build/cmake",
)

envoy_cmake(
    name = "llama",
    cache_entries = {
        "CMAKE_INSTALL_LIBDIR": "lib",
        "BUILD_SHARED_LIBS": "off",
        "GGML_OPENMP": "off",
    },
    lib_source = "@com_github_ggerganov_llama//:all",
    out_static_libs = select({
        "//conditions:default": [
            "libllama.a",
            "libggml.a",
        ],
    }),
    tags = ["skip_on_windows"],
    postfix_script = select({
        "//conditions:default": "rm -rf $INSTALLDIR/include/common && mkdir $INSTALLDIR/include/common && cp -rL $EXT_BUILD_ROOT/external/com_github_ggerganov_llama/common/* $INSTALLDIR/include/common",
    }),
)
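The rule above builds libllama.a and libggml.a from llama.cpp's sources. As a reference point for what the inference library ultimately links against, here is a minimal, self-contained sketch using llama.cpp's C API as it stood around the pinned commit; the model path is a placeholder, and exact signatures can differ between llama.cpp revisions.

#include "llama.h"

int main() {
  llama_backend_init();
  llama_model_params mparams = llama_model_default_params();
  llama_model* model = llama_load_model_from_file("/models/qwen2-7b-instruct.gguf", mparams);
  if (model == nullptr) {
    return 1; // model failed to load
  }
  llama_context_params cparams = llama_context_default_params();
  cparams.n_threads = 8; // mirrors the filter's n_threads knob
  llama_context* ctx = llama_new_context_with_model(model, cparams);
  // ... tokenize, decode, and sample here ...
  llama_free(ctx);
  llama_free_model(model);
  llama_backend_free();
  return 0;
}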
12 changes: 12 additions & 0 deletions bazel/repositories.bzl
@@ -278,6 +278,7 @@ def envoy_dependencies(skip_targets = []):
    _com_github_google_libprotobuf_mutator()
    _com_github_google_libsxg()
    _com_github_google_tcmalloc()
    _com_github_ggerganov_llama()
    _com_github_gperftools_gperftools()
    _com_github_grpc_grpc()
    _com_github_unicode_org_icu()
@@ -1238,6 +1239,17 @@ def _com_github_google_tcmalloc():
        actual = "@com_github_google_tcmalloc//tcmalloc:malloc_extension",
    )

def _com_github_ggerganov_llama():
    external_http_archive(
        name = "com_github_ggerganov_llama",
        build_file_content = BUILD_ALL_CONTENT,
    )

    native.bind(
        name = "llama",
        actual = "@envoy//bazel/foreign_cc:llama",
    )

def _com_github_gperftools_gperftools():
    external_http_archive(
        name = "com_github_gperftools_gperftools",
12 changes: 12 additions & 0 deletions bazel/repository_locations.bzl
@@ -358,6 +358,18 @@ REPOSITORY_LOCATIONS_SPEC = dict(
license = "Apache-2.0",
license_url = "https://github.com/google/tcmalloc/blob/{version}/LICENSE",
),
com_github_ggerganov_llama = dict(
project_name = "llama.cpp",
project_desc = "LLM inference in C/C++",
project_url = "https://github.com/ggerganov/llama.cpp",
version = "a07c32ea54850c989f0ef6989da5b955b77b7172",
sha256 = "4a5aaa9f4329dc5364ff6e4eea9ee977adce24051f5a6ba099faaaaa57a47149",
strip_prefix = "llama.cpp-{version}",
urls = ["https://github.com/ggerganov/llama.cpp/archive/{version}.zip"],
use_category = ["dataplane_core"],
release_date = "2024-08-23",
cpe = "N/A",
),
com_github_gperftools_gperftools = dict(
project_name = "gperftools",
project_desc = "tcmalloc and profiling libraries",
1 change: 1 addition & 0 deletions contrib/contrib_build_config.bzl
@@ -10,6 +10,7 @@ CONTRIB_EXTENSIONS = {
"envoy.filters.http.language": "//contrib/language/filters/http/source:config_lib",
"envoy.filters.http.squash": "//contrib/squash/filters/http/source:config",
"envoy.filters.http.sxg": "//contrib/sxg/filters/http/source:config",
"envoy.filters.http.llm_inference": "//contrib/llm_inference/filters/http/source:config",

#
# Upstreams
5 changes: 5 additions & 0 deletions contrib/extensions_metadata.yaml
@@ -28,6 +28,11 @@ envoy.filters.http.sxg:
  - envoy.filters.http
  security_posture: robust_to_untrusted_downstream
  status: alpha
envoy.filters.http.llm_inference:
  categories:
  - envoy.filters.http
  security_posture: requires_trusted_downstream_and_upstream
  status: wip
envoy.filters.network.client_ssl_auth:
  categories:
  - envoy.filters.network
37 changes: 37 additions & 0 deletions contrib/llm_inference/filters/http/source/BUILD
@@ -0,0 +1,37 @@
load(
    "//bazel:envoy_build_system.bzl",
    "envoy_cc_contrib_extension",
    "envoy_cc_library",
    "envoy_contrib_package",
)

licenses(["notice"])  # Apache 2

envoy_contrib_package()

envoy_cc_library(
    name = "llm_inference_filter_lib",
    srcs = ["llm_inference_filter.cc"],
    hdrs = ["llm_inference_filter.h"],
    deps = [
        "@envoy_api//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg_cc_proto",
        "//source/extensions/filters/http/common:pass_through_filter_lib",
        "//contrib/llm_inference/filters/http/source/inference:inference",
        "//source/common/http:header_map_lib",
        "//source/common/http:header_utility_lib",
        "//source/common/http:headers_lib",
        "//source/common/protobuf:utility_lib",
    ],
)

envoy_cc_contrib_extension(
    name = "config",
    srcs = ["config.cc"],
    hdrs = ["config.h"],
    deps = [
        ":llm_inference_filter_lib",
        "//envoy/registry",
        "//source/extensions/filters/http/common:factory_base_lib",
        "@envoy_api//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg_cc_proto",
    ],
)
89 changes: 89 additions & 0 deletions contrib/llm_inference/filters/http/source/config.cc
@@ -0,0 +1,89 @@
#include "contrib/llm_inference/filters/http/source/config.h"

#include "contrib/llm_inference/filters/http/source/llm_inference_filter.h"
#include <string>

namespace Envoy {
namespace Extensions {
namespace HttpFilters {
namespace LLMInference {

class InferenceSingleton : public Envoy::Singleton::Instance {
public:
  InferenceSingleton(Thread::ThreadFactory& thread_factory)
      : inference_thread_(thread_factory) {}

  // Returns a process-wide InferenceContext for the given model, keyed by model
  // name plus thread settings. Contexts are held as weak_ptrs so they are freed
  // once no filter config references them.
  std::shared_ptr<InferenceContext> load(std::shared_ptr<InferenceSingleton> singleton,
                                         const ModelParameter& model_parameter,
                                         const std::string& model_name,
                                         const std::string& model_path, bool embedding) {
    std::shared_ptr<InferenceContext> ctx;
    std::string model = model_name + " " + std::to_string(model_parameter.n_threads) + " " +
                        std::to_string(model_parameter.n_parallel);
    auto it = ctx_.find(model);
    if (it != ctx_.end()) {
      ctx = it->second.lock();
    }
    if (!ctx) {
      ctx = std::make_shared<InferenceContext>(singleton, inference_thread_, model_parameter,
                                               model_name, model_path, embedding);
      // Record the new context so later lookups share it while it is alive;
      // without this insertion the cache above can never hit.
      ctx_[model] = ctx;
    }
    return ctx;
  }

private:
  InferenceThread inference_thread_;
  absl::flat_hash_map<std::string, std::weak_ptr<InferenceContext>> ctx_;
};

SINGLETON_MANAGER_REGISTRATION(http_inference_singleton);

Http::FilterFactoryCb LLMInferenceFilterConfigFactory::createFilterFactoryFromProtoTyped(
    const envoy::extensions::filters::http::llm_inference::v3::modelParameter& proto_config,
    const std::string&, Server::Configuration::FactoryContext& context) {

  LLMInferenceFilterConfigSharedPtr config =
      std::make_shared<LLMInferenceFilterConfig>(LLMInferenceFilterConfig(proto_config));

  // Fetch the process-wide inference singleton, creating it on first use.
  std::shared_ptr<InferenceSingleton> inference =
      context.singletonManager().getTyped<InferenceSingleton>(
          SINGLETON_MANAGER_REGISTERED_NAME(http_inference_singleton), [&context] {
            return std::make_shared<InferenceSingleton>(context.api().threadFactory());
          });

  // Load every configured chat and embedding model at config time so requests
  // never pay the model-load cost.
  absl::flat_hash_map<std::string, InferenceContextSharedPtr> ctx;

  auto chat_modelpath = config->chatModelPath();
  for (auto& model : chat_modelpath) {
    ctx[model.first] = inference->load(inference, config->modelParameter(), model.first, model.second, false);
  }

  auto embedding_modelpath = config->embeddingModelPath();
  for (auto& model : embedding_modelpath) {
    ctx[model.first] = inference->load(inference, config->modelParameter(), model.first, model.second, true);
  }

  InferenceContextHashMapSharedPtr ctx_map =
      std::make_shared<absl::flat_hash_map<std::string, InferenceContextSharedPtr>>(ctx);

  return [config, ctx_map](Http::FilterChainFactoryCallbacks& callbacks) -> void {
    callbacks.addStreamDecoderFilter(std::make_shared<LLMInferenceFilter>(config, ctx_map));
  };
}

Router::RouteSpecificFilterConfigConstSharedPtr
LLMInferenceFilterConfigFactory::createRouteSpecificFilterConfigTyped(
    const envoy::extensions::filters::http::llm_inference::v3::modelChosen& proto_config,
    Server::Configuration::ServerFactoryContext&, ProtobufMessage::ValidationVisitor&) {
  LLMInferenceFilterConfigPerRouteSharedPtr config =
      std::make_shared<LLMInferenceFilterConfigPerRoute>(LLMInferenceFilterConfigPerRoute(proto_config));

  return config;
}

/**
* Static registration for this llm inference filter. @see RegisterFactory.
*/
REGISTER_FACTORY(LLMInferenceFilterConfigFactory, Server::Configuration::NamedHttpFilterConfigFactory);

} // namespace LLMInference
} // namespace HttpFilters
} // namespace Extensions
} // namespace Envoy
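The per-route modelChosen registered above would typically be consumed from the filter's decode path. The filter source itself is not part of this hunk, so the following fragment is only a sketch of the conventional Envoy lookup: resolveMostSpecificPerFilterConfig is the standard helper, while useModel() is a hypothetical accessor on LLMInferenceFilterConfigPerRoute.

// Sketch only: inside LLMInferenceFilter::decodeHeaders(), resolve the
// route-level override, falling back to listener-level behavior if absent.
const auto* per_route =
    Http::Utility::resolveMostSpecificPerFilterConfig<LLMInferenceFilterConfigPerRoute>(
        decoder_callbacks_);
if (per_route != nullptr) {
  const std::string& model = per_route->useModel(); // hypothetical accessor
  // Look up the shared InferenceContext for `model` in ctx_map and hand the
  // request body to it for inference.
}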
37 changes: 37 additions & 0 deletions contrib/llm_inference/filters/http/source/config.h
@@ -0,0 +1,37 @@
#pragma once

#include "contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.pb.h"
#include "contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.pb.validate.h"

#include "source/extensions/filters/http/common/factory_base.h"

namespace Envoy {
namespace Extensions {
namespace HttpFilters {
namespace LLMInference {

/**
* Config registration for the inference filter. @see NamedHttpFilterConfigFactory.
*/
class LLMInferenceFilterConfigFactory
    : public Common::FactoryBase<
          envoy::extensions::filters::http::llm_inference::v3::modelParameter,
          envoy::extensions::filters::http::llm_inference::v3::modelChosen> {
public:
  LLMInferenceFilterConfigFactory() : FactoryBase("envoy.filters.http.llm_inference") {}

private:
  Http::FilterFactoryCb createFilterFactoryFromProtoTyped(
      const envoy::extensions::filters::http::llm_inference::v3::modelParameter& proto_config,
      const std::string&, Server::Configuration::FactoryContext&) override;

  Router::RouteSpecificFilterConfigConstSharedPtr createRouteSpecificFilterConfigTyped(
      const envoy::extensions::filters::http::llm_inference::v3::modelChosen& proto_config,
      Server::Configuration::ServerFactoryContext&, ProtobufMessage::ValidationVisitor&) override;
};

} // namespace LLMInference
} // namespace HttpFilters
} // namespace Extensions
} // namespace Envoy
27 changes: 27 additions & 0 deletions contrib/llm_inference/filters/http/source/inference/BUILD
@@ -0,0 +1,27 @@
load(
    "@envoy//bazel:envoy_build_system.bzl",
    "envoy_cc_library",
)

licenses(["notice"])  # Apache 2

envoy_cc_library(
    name = "inference",
    srcs = [
        "inference_context.cc",
        "inference_task.cc",
        "inference_thread.cc",
    ],
    hdrs = [
        "inference_context.h",
        "inference_task.h",
        "inference_thread.h",
        "utils.hpp",
    ],
    deps = [
        "//source/extensions/filters/http/common:factory_base_lib",
        "@com_google_absl//absl/base",
    ],
    visibility = ["//visibility:public"],
    external_deps = ["llama"],
)