
2024 OSPP: Lightweight deployment of AI models via an AI gateway #1

1 change: 1 addition & 0 deletions api/BUILD
@@ -78,6 +78,7 @@ proto_library(
        "//contrib/envoy/extensions/filters/http/language/v3alpha:pkg",
        "//contrib/envoy/extensions/filters/http/squash/v3:pkg",
        "//contrib/envoy/extensions/filters/http/sxg/v3alpha:pkg",
        "//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg",
        "//contrib/envoy/extensions/filters/network/client_ssl_auth/v3:pkg",
        "//contrib/envoy/extensions/filters/network/generic_proxy/action/v3:pkg",
        "//contrib/envoy/extensions/filters/network/generic_proxy/codecs/dubbo/v3:pkg",
9 changes: 9 additions & 0 deletions api/contrib/envoy/extensions/filters/http/llm_inference/v3/BUILD
@@ -0,0 +1,9 @@
# DO NOT EDIT. This file is generated by tools/proto_format/proto_sync.py.

load("@envoy_api//bazel:api_build_system.bzl", "api_proto_package")

licenses(["notice"])  # Apache 2

api_proto_package(
    deps = ["@com_github_cncf_udpa//udpa/annotations:pkg"],
)
30 changes: 30 additions & 0 deletions api/contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.proto
@@ -0,0 +1,30 @@
syntax = "proto3";

package envoy.extensions.filters.http.llm_inference.v3;

import "udpa/annotations/status.proto";
import "validate/validate.proto";

option java_package = "io.envoyproxy.envoy.extensions.filters.http.llm_inference.v3";
option java_outer_classname = "LlmInferenceProto";
option java_multiple_files = true;
option go_package = "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/http/llm_inference/v3;llm_inferencev3";
option (udpa.annotations.file_status).package_version_status = ACTIVE;

message modelParameter {
  // Number of threads the inference backend may use.
  int32 n_threads = 1;

  // Number of requests processed in parallel.
  int32 n_parallel = 2;

  // Map from model name to model file path.
  map<string, string> modelpath = 3;
}

message modelChosen {
  // Name of the model to serve; expected to be a key of modelParameter.modelpath.
  string usemodel = 1;

  // Timeout until the first byte of the response.
  int32 first_byte_timeout = 2;

  // Timeout for the whole inference.
  int32 inference_timeout = 3;

  // If true, the model produces embeddings instead of completions.
  bool embedding = 4;
}
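
For orientation, a minimal sketch of how these two messages are populated through the generated protobuf C++ API. The model name and file path below are hypothetical, and the assumption that usemodel must match a key of modelpath is inferred from the factory code further down:

#include "contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.pb.h"

// Listener-level parameters: thread count, parallelism, and the
// model-name -> model-file map available to the filter.
envoy::extensions::filters::http::llm_inference::v3::modelParameter params;
params.set_n_threads(8);
params.set_n_parallel(2);
(*params.mutable_modelpath())["qwen2-7b"] = "/models/qwen2-7b-instruct.gguf";  // hypothetical path

// Per-route choice: which model to serve and how long to wait for it.
envoy::extensions::filters::http::llm_inference::v3::modelChosen chosen;
chosen.set_usemodel("qwen2-7b");  // assumed to match a key of params.modelpath
chosen.set_first_byte_timeout(10);
chosen.set_inference_timeout(120);
chosen.set_embedding(false);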
1 change: 1 addition & 0 deletions api/versioning/BUILD
@@ -15,6 +15,7 @@ proto_library(
        "//contrib/envoy/extensions/config/v3alpha:pkg",
        "//contrib/envoy/extensions/filters/http/golang/v3alpha:pkg",
        "//contrib/envoy/extensions/filters/http/language/v3alpha:pkg",
        "//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg",
        "//contrib/envoy/extensions/filters/http/squash/v3:pkg",
        "//contrib/envoy/extensions/filters/http/sxg/v3alpha:pkg",
        "//contrib/envoy/extensions/filters/network/client_ssl_auth/v3:pkg",
20 changes: 20 additions & 0 deletions bazel/foreign_cc/BUILD
@@ -570,3 +570,23 @@ envoy_cmake(
    }),
    working_directory = "build/cmake",
)

envoy_cmake(
    name = "llama",
    cache_entries = {
        "CMAKE_INSTALL_LIBDIR": "lib",
        "BUILD_SHARED_LIBS": "off",
        "GGML_OPENMP": "off",
    },
    lib_source = "@com_github_ggerganov_llama//:all",
    out_static_libs = select({
        "//conditions:default": [
            "libllama.a",
            "libggml.a",
        ],
    }),
    tags = ["skip_on_windows"],
    postfix_script = select({
        "//conditions:default": "rm -rf $INSTALLDIR/include/common && mkdir $INSTALLDIR/include/common && cp -rL $EXT_BUILD_ROOT/external/com_github_ggerganov_llama/common/* $INSTALLDIR/include/common",
    }),
)
12 changes: 12 additions & 0 deletions bazel/repositories.bzl
@@ -278,6 +278,7 @@ def envoy_dependencies(skip_targets = []):
    _com_github_google_libprotobuf_mutator()
    _com_github_google_libsxg()
    _com_github_google_tcmalloc()
    _com_github_ggerganov_llama()
    _com_github_gperftools_gperftools()
    _com_github_grpc_grpc()
    _com_github_unicode_org_icu()

@@ -1238,6 +1239,17 @@ def _com_github_google_tcmalloc():
        actual = "@com_github_google_tcmalloc//tcmalloc:malloc_extension",
    )

def _com_github_ggerganov_llama():
    external_http_archive(
        name = "com_github_ggerganov_llama",
        build_file_content = BUILD_ALL_CONTENT,
    )

    native.bind(
        name = "llama",
        actual = "@envoy//bazel/foreign_cc:llama",
    )

def _com_github_gperftools_gperftools():
    external_http_archive(
        name = "com_github_gperftools_gperftools",
12 changes: 12 additions & 0 deletions bazel/repository_locations.bzl
@@ -358,6 +358,18 @@ REPOSITORY_LOCATIONS_SPEC = dict(
        license = "Apache-2.0",
        license_url = "https://github.com/google/tcmalloc/blob/{version}/LICENSE",
    ),
    com_github_ggerganov_llama = dict(
        project_name = "llama.cpp",
        project_desc = "LLM inference in C/C++",
        project_url = "https://github.com/ggerganov/llama.cpp",
        version = "a07c32ea54850c989f0ef6989da5b955b77b7172",
        sha256 = "4a5aaa9f4329dc5364ff6e4eea9ee977adce24051f5a6ba099faaaaa57a47149",
        strip_prefix = "llama.cpp-{version}",
        urls = ["https://github.com/ggerganov/llama.cpp/archive/{version}.zip"],
        use_category = ["dataplane_core"],
        release_date = "2024-08-23",
        cpe = "N/A",
    ),
    com_github_gperftools_gperftools = dict(
        project_name = "gperftools",
        project_desc = "tcmalloc and profiling libraries",
1 change: 1 addition & 0 deletions contrib/contrib_build_config.bzl
@@ -10,6 +10,7 @@ CONTRIB_EXTENSIONS = {
    "envoy.filters.http.language": "//contrib/language/filters/http/source:config_lib",
    "envoy.filters.http.squash": "//contrib/squash/filters/http/source:config",
    "envoy.filters.http.sxg": "//contrib/sxg/filters/http/source:config",
    "envoy.filters.http.llm_inference": "//contrib/llm_inference/filters/http/source:config",

    #
    # Upstreams
5 changes: 5 additions & 0 deletions contrib/extensions_metadata.yaml
@@ -28,6 +28,11 @@ envoy.filters.http.sxg:
  - envoy.filters.http
  security_posture: robust_to_untrusted_downstream
  status: alpha
envoy.filters.http.llm_inference:
  categories:
  - envoy.filters.http
  security_posture: requires_trusted_downstream_and_upstream
  status: wip
envoy.filters.network.client_ssl_auth:
  categories:
  - envoy.filters.network
37 changes: 37 additions & 0 deletions contrib/llm_inference/filters/http/source/BUILD
@@ -0,0 +1,37 @@
load(
    "//bazel:envoy_build_system.bzl",
    "envoy_cc_contrib_extension",
    "envoy_cc_library",
    "envoy_contrib_package",
)

licenses(["notice"])  # Apache 2

envoy_contrib_package()

envoy_cc_library(
    name = "llm_inference_filter_lib",
    srcs = ["llm_inference_filter.cc"],
    hdrs = ["llm_inference_filter.h"],
    deps = [
        "@envoy_api//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg_cc_proto",
        "//source/extensions/filters/http/common:pass_through_filter_lib",
        "//contrib/llm_inference/filters/http/source/inference:inference",
        "//source/common/http:header_map_lib",
        "//source/common/http:header_utility_lib",
        "//source/common/http:headers_lib",
        "//source/common/protobuf:utility_lib",
    ],
)

envoy_cc_contrib_extension(
    name = "config",
    srcs = ["config.cc"],
    hdrs = ["config.h"],
    deps = [
        ":llm_inference_filter_lib",
        "//envoy/registry",
        "//source/extensions/filters/http/common:factory_base_lib",
        "@envoy_api//contrib/envoy/extensions/filters/http/llm_inference/v3:pkg_cc_proto",
    ],
)
82 changes: 82 additions & 0 deletions contrib/llm_inference/filters/http/source/config.cc
@@ -0,0 +1,82 @@
#include "contrib/llm_inference/filters/http/source/config.h"

#include "contrib/llm_inference/filters/http/source/llm_inference_filter.h"
#include <string>

namespace Envoy {
namespace Extensions {
namespace HttpFilters {
namespace LLMInference {

// Process-wide registry that shares one InferenceContext (model weights plus
// inference thread) per model across all filter configs that reference it.
class InferenceSingleton : public Envoy::Singleton::Instance {
public:
  InferenceSingleton(Thread::ThreadFactory& thread_factory) : inference_thread_(thread_factory) {}

  // Returns the context for the chosen model, creating it on first use. The
  // cache holds weak_ptrs, so a model stays loaded only while at least one
  // filter config still owns the shared_ptr; the singleton handle is passed
  // into each context so the registry (and its inference thread) outlives
  // every context created from it.
  std::shared_ptr<InferenceContext> load(std::shared_ptr<InferenceSingleton> singleton,
                                         const ModelParameter& model_parameter,
                                         const ModelChosen& model_chosen,
                                         const std::string& model_path) {
    std::shared_ptr<InferenceContext> ctx;
    absl::MutexLock lock(&mu_);
    auto it = ctx_.find(model_chosen.model_name);
    if (it != ctx_.end()) {
      ctx = it->second.lock(); // null if the last owner already released it
    }
    if (!ctx) {
      ctx = std::make_shared<InferenceContext>(singleton, inference_thread_, model_parameter,
                                               model_path, model_chosen);
      ctx_[model_chosen.model_name] = ctx;
    }
    return ctx;
  }

private:
  InferenceThread inference_thread_;
  absl::Mutex mu_;
  absl::flat_hash_map<std::string, std::weak_ptr<InferenceContext>> ctx_ ABSL_GUARDED_BY(mu_);
};

SINGLETON_MANAGER_REGISTRATION(http_inference_singleton);

Http::FilterFactoryCb LLMInferenceFilterConfigFactory::createFilterFactoryFromProtoTyped(
    const envoy::extensions::filters::http::llm_inference::v3::modelParameter& proto_config,
    const std::string&, Server::Configuration::FactoryContext& context) {
  LLMInferenceFilterConfigSharedPtr config =
      std::make_shared<LLMInferenceFilterConfig>(proto_config);

  std::shared_ptr<InferenceSingleton> inference =
      context.singletonManager().getTyped<InferenceSingleton>(
          SINGLETON_MANAGER_REGISTERED_NAME(http_inference_singleton), [&context] {
            return std::make_shared<InferenceSingleton>(context.api().threadFactory());
          });

  // Look up the model selected by the most recently parsed per-route config;
  // the context stays null if no path is configured for that model.
  InferenceContextSharedPtr ctx;
  auto modelpath = config->modelPath();
  if (modelpath.contains(model_chosen_.model_name)) {
    ctx = inference->load(inference, config->modelParameter(), model_chosen_,
                          modelpath[model_chosen_.model_name]);
  }

  return [config, ctx](Http::FilterChainFactoryCallbacks& callbacks) -> void {
    callbacks.addStreamDecoderFilter(std::make_shared<LLMInferenceFilter>(config, ctx));
  };
}

Router::RouteSpecificFilterConfigConstSharedPtr
LLMInferenceFilterConfigFactory::createRouteSpecificFilterConfigTyped(
    const envoy::extensions::filters::http::llm_inference::v3::modelChosen& proto_config,
    Server::Configuration::ServerFactoryContext&, ProtobufMessage::ValidationVisitor&) {
  LLMInferenceFilterConfigPerRouteSharedPtr config =
      std::make_shared<LLMInferenceFilterConfigPerRoute>(proto_config);

  // Remember the choice so the listener-level factory can preload the model.
  model_chosen_ = config->modelChosen();
  return config;
}

/**
* Static registration for this llm inference filter. @see RegisterFactory.
*/
REGISTER_FACTORY(LLMInferenceFilterConfigFactory, Server::Configuration::NamedHttpFilterConfigFactory);

} // namespace LLMInference
} // namespace HttpFilters
} // namespace Extensions
} // namespace Envoy
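
The filter implementation (llm_inference_filter.cc) is not part of this excerpt, so the request path is only implied by the factory above. The following is a rough sketch of how a decoder filter built on pass_through_filter_lib could hand a request to the shared InferenceContext; the inference() call and its callback are hypothetical names reconstructed from the wiring, not the PR's actual API:

// Hypothetical sketch, assuming LLMInferenceFilter derives from Http::PassThroughFilter
// and holds the InferenceContextSharedPtr created by the factory above.
Http::FilterHeadersStatus LLMInferenceFilter::decodeHeaders(Http::RequestHeaderMap&,
                                                            bool end_stream) {
  if (end_stream) {
    return Http::FilterHeadersStatus::Continue; // no body, nothing to infer on
  }
  return Http::FilterHeadersStatus::StopIteration; // wait for the prompt body
}

Http::FilterDataStatus LLMInferenceFilter::decodeData(Buffer::Instance& data, bool end_stream) {
  if (!end_stream) {
    return Http::FilterDataStatus::StopIterationAndBuffer; // accumulate the body
  }
  // Hand the prompt to the shared context; generated tokens arrive on the
  // inference thread and would be posted back to this worker's dispatcher
  // before being sent downstream as the response.
  ctx_->inference(data.toString(), [this](std::string chunk, bool done) {
    // hypothetical completion callback: stream chunk downstream, finish when done
  });
  return Http::FilterDataStatus::StopIterationNoBuffer;
}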
39 changes: 39 additions & 0 deletions contrib/llm_inference/filters/http/source/config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#pragma once

#include "contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.pb.h"
#include "contrib/envoy/extensions/filters/http/llm_inference/v3/llm_inference.pb.validate.h"
#include "contrib/llm_inference/filters/http/source/inference/inference_task.h"

#include "source/extensions/filters/http/common/factory_base.h"

namespace Envoy {
namespace Extensions {
namespace HttpFilters {
namespace LLMInference {

/**
* Config registration for the inference filter. @see NamedHttpFilterConfigFactory.
*/
class LLMInferenceFilterConfigFactory
    : public Common::FactoryBase<envoy::extensions::filters::http::llm_inference::v3::modelParameter,
                                 envoy::extensions::filters::http::llm_inference::v3::modelChosen> {
public:
  LLMInferenceFilterConfigFactory() : FactoryBase("envoy.filters.http.llm_inference") {}

private:
  Http::FilterFactoryCb createFilterFactoryFromProtoTyped(
      const envoy::extensions::filters::http::llm_inference::v3::modelParameter& proto_config,
      const std::string&,
      Server::Configuration::FactoryContext&) override;

  Router::RouteSpecificFilterConfigConstSharedPtr createRouteSpecificFilterConfigTyped(
      const envoy::extensions::filters::http::llm_inference::v3::modelChosen& proto_config,
      Server::Configuration::ServerFactoryContext&, ProtobufMessage::ValidationVisitor&) override;

  // Model selection captured from the last per-route config; read when the
  // listener-level filter factory is created.
  ModelChosen model_chosen_;
};

} // namespace LLMInference
} // namespace HttpFilters
} // namespace Extensions
} // namespace Envoy
27 changes: 27 additions & 0 deletions contrib/llm_inference/filters/http/source/inference/BUILD
@@ -0,0 +1,27 @@
load(
    "@envoy//bazel:envoy_build_system.bzl",
    "envoy_cc_library",
)

licenses(["notice"])  # Apache 2

envoy_cc_library(
    name = "inference",
    srcs = [
        "inference_context.cc",
        "inference_task.cc",
        "inference_thread.cc",
    ],
    hdrs = [
        "inference_context.h",
        "inference_task.h",
        "inference_thread.h",
        "utils.hpp",
    ],
    external_deps = ["llama"],
    visibility = ["//visibility:public"],
    deps = [
        "//source/extensions/filters/http/common:factory_base_lib",
        "@com_google_absl//absl/base",
    ],
)
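
inference_context.cc itself is beyond this excerpt; it links against the llama.cpp build defined in bazel/foreign_cc above. As rough orientation, a model/context lifecycle against the llama.cpp C API at the pinned revision would look like the sketch below. The mapping of n_threads, n_parallel, and embedding onto llama_context_params fields is an assumption, as is the model path:

#include "llama.h"

// Minimal lifecycle sketch, not the PR's actual inference_context.cc.
llama_backend_init();

llama_model_params mparams = llama_model_default_params();
llama_model* model =
    llama_load_model_from_file("/models/qwen2-7b-instruct.gguf", mparams); // hypothetical path

llama_context_params cparams = llama_context_default_params();
cparams.n_threads = 8;       // plausibly fed from modelParameter.n_threads
cparams.n_threads_batch = 8;
cparams.n_seq_max = 2;       // plausibly fed from modelParameter.n_parallel
cparams.embeddings = false;  // plausibly fed from modelChosen.embedding
llama_context* lctx = llama_new_context_with_model(model, cparams);

// ... per request: tokenize the prompt, call llama_decode() in a loop,
// sample, and detokenize the generated tokens ...

llama_free(lctx);
llama_free_model(model);
llama_backend_free();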