feat: Add benchmark mode (#73)
* Add benchmark route and test code

* Improvements

* Clean up code
KenyonY authored Sep 19, 2023
1 parent ea9579e commit 3ce781f
Showing 34 changed files with 697 additions and 120 deletions.
9 changes: 5 additions & 4 deletions .env
@@ -3,8 +3,7 @@
# `LOG_CHAT`: whether to log chat content
LOG_CHAT=false

-PRINT_CHAT=true
+PRINT_CHAT=false
# `OPENAI_BASE_URL`: base URL of any OpenAI-style service to forward; multiple values are allowed, separated by commas.
# If more than one is specified, then no OPENAI_ROUTE_PREFIX/EXTRA_ROUTE_PREFIX may be the root route /
OPENAI_BASE_URL=https://api.openai.com
@@ -15,6 +14,8 @@ OPENAI_ROUTE_PREFIX=
OPENAI_API_KEY=
FORWARD_KEY=

+CHAT_COMPLETION_ROUTE=/v1/chat/completions

# `EXTRA_BASE_URL`: any other service to forward
EXTRA_BASE_URL=
# `EXTRA_ROUTE_PREFIX`: route prefix matching EXTRA_BASE_URL
@@ -34,10 +35,10 @@ RATE_LIMIT_STRATEGY=moving-window


# Token rate limit for streamed responses
-TOKEN_RATE_LIMIT={"/v1/chat/completions":"40/second"}
+TOKEN_RATE_LIMIT={"/v1/chat/completions":"50/second"}


-TIMEOUT=300
+TIMEOUT=100

IP_BLACKLIST=

13 changes: 9 additions & 4 deletions .env.example
@@ -1,13 +1,19 @@
# LOG_CHAT: whether to enable logging
LOG_CHAT=true

+PRINT_CHAT=true

+BENCHMARK_MODE=true

# OPENAI_BASE_URL: base URL of any OpenAI-style service to forward; multiple values are allowed, separated by commas.
# If more than one is specified, then no OPENAI_ROUTE_PREFIX/EXTRA_ROUTE_PREFIX may be the root route /
OPENAI_BASE_URL='https://api.openai.com, http://localhost:8080'

# OPENAI_ROUTE_PREFIX: route prefixes for forwarding all OpenAI-style (logged) services
OPENAI_ROUTE_PREFIX='/openai, /localai'

+CHAT_COMPLETION_ROUTE=/openai/v1/chat/completions

# OPENAI_API_KEY: multiple API keys may be given, separated by commas, to form a round-robin pool
OPENAI_API_KEY='sk-xxx1, sk-xxx2, sk-xxx3'

@@ -30,7 +36,7 @@ REQ_RATE_LIMIT='{
}'

# `GLOBAL_RATE_LIMIT`: limits all routes not specified in `REQ_RATE_LIMIT`. Empty means no limit by default
-GLOBAL_RATE_LIMIT=2/5seconds
+GLOBAL_RATE_LIMIT=inf

# `RATE_LIMIT_STRATEGY` Options: (fixed-window, fixed-window-elastic-expiry, moving-window) ref: https://limits.readthedocs.io/en/latest/strategies.html
# `fixed-window`: most memory efficient strategy; `moving-window`: most effective for preventing bursts but higher memory cost.
@@ -40,11 +46,10 @@ RATE_LIMIT_STRATEGY=fixed-window
PROXY=http://localhost:7890

# `TOKEN_RATE_LIMIT`: token rate limit for each streamed response (note: a token here is not strictly a GPT token but an SSE chunk)
-TOKEN_RATE_LIMIT={"/v1/chat/completions":"20/second"}
+TOKEN_RATE_LIMIT={"/v1/chat/completions":"20/second", "/benchmark/v1/chat/completions":"500/second"}


-TIMEOUT=300
-CHAT_COMPLETION_ROUTE=/v1/chat/completions
+TIMEOUT=100

# Set the time zone
TZ=Asia/Shanghai
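
Since `TOKEN_RATE_LIMIT` caps SSE chunks per second, its effect can be observed by timing chunk arrivals on a streamed response. A minimal sketch using the same legacy `openai` 0.x client as `Examples/chat.py`, assuming the forward service is listening at http://localhost:8000:

```python
import time

import openai  # legacy 0.x client, as in Examples/chat.py

openai.api_base = "http://localhost:8000/v1"  # assumption: local forward service
openai.api_key = "sk-xxx"                     # placeholder key

t0 = time.perf_counter()
n_chunks = 0
resp = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    stream=True,
)
for _ in resp:  # each SSE chunk counts against TOKEN_RATE_LIMIT
    n_chunks += 1
elapsed = time.perf_counter() - t0
print(f"{n_chunks} chunks in {elapsed:.2f}s -> {n_chunks / elapsed:.1f} chunks/s")
```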
48 changes: 48 additions & 0 deletions Examples/benchmark/.env
@@ -0,0 +1,48 @@
# See .env.example for examples and explanations

# `LOG_CHAT`: whether to log chat content
LOG_CHAT=false
PRINT_CHAT=false

BENCHMARK_MODE=true

# `OPENAI_BASE_URL`: base URL of any OpenAI-style service to forward; multiple values are allowed, separated by commas.
# If more than one is specified, then no OPENAI_ROUTE_PREFIX/EXTRA_ROUTE_PREFIX may be the root route /
OPENAI_BASE_URL=https://xxx.xxx.xxx

# `OPENAI_ROUTE_PREFIX`: route prefixes for forwarding all OpenAI-style (logged) services
OPENAI_ROUTE_PREFIX=

OPENAI_API_KEY=
FORWARD_KEY=

CHAT_COMPLETION_ROUTE=/benchmark/v1/chat/completions

# `EXTRA_BASE_URL`: any other service to forward
EXTRA_BASE_URL=
# `EXTRA_ROUTE_PREFIX`: route prefix matching EXTRA_BASE_URL
EXTRA_ROUTE_PREFIX=

# `REQ_RATE_LIMIT`: per-user request rate limits for the specified routes
# format: {route: ratelimit-string}
# ratelimit-string format: [count] [per|/] [n (optional)] [second|minute|hour|day|month|year]; see https://limits.readthedocs.io/en/stable/quickstart.html#rate-limit-string-notation
REQ_RATE_LIMIT={"/healthz":"100/2minutes","/v1/chat/completions":"60/minute;600/hour"}

# `GLOBAL_RATE_LIMIT`: limits all routes not specified in `REQ_RATE_LIMIT`. Empty means no limit by default
GLOBAL_RATE_LIMIT=

# `RATE_LIMIT_STRATEGY` Options: (fixed-window, fixed-window-elastic-expiry, moving-window) ref: https://limits.readthedocs.io/en/latest/strategies.html
# `fixed-window`: most memory efficient strategy; `moving-window`: most effective for preventing bursts but higher memory cost.
RATE_LIMIT_STRATEGY=moving-window


# Token rate limit for streamed responses
TOKEN_RATE_LIMIT={"/v1/chat/completions":"50/second", "/benchmark/v1/chat/completions":"30/second"}


TIMEOUT=5

IP_BLACKLIST=

# Set the time zone
TZ=Asia/Shanghai
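
The rate-limit strings above follow the notation of the `limits` library that the comments reference. As a hedged standalone illustration (not how openai-forward wires it internally), checking a "60/minute" limit with the moving-window strategy:

```python
# pip install limits  (the library referenced in the comments above)
from limits import parse
from limits.storage import MemoryStorage
from limits.strategies import MovingWindowRateLimiter

limiter = MovingWindowRateLimiter(MemoryStorage())
item = parse("60/minute")  # same notation as the REQ_RATE_LIMIT values

# hit() registers one request for this identifier and returns False
# once the window is exhausted
for i in range(61):
    allowed = limiter.hit(item, "user-123")
print(allowed)  # -> False on the 61st hit within the same minute
```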
71 changes: 71 additions & 0 deletions Examples/benchmark/README.md
@@ -0,0 +1,71 @@


## Raw benchmark endpoint test
> http://localhost:8080
> This part evaluates the streaming and non-streaming response performance of `fastapi` itself, without any forwarding.

Launch command: `BENCHMARK_MODE=true aifd run --workers=n --port 8080` (n=1 or 4)



### stream == false:
```bash
wrk -t8 -c400 -d10s -s post.lua http://localhost:8080/benchmark/v1/chat/completions
```
Single core:

![img_7.png](img_7.png)


4 cores:

![img_8.png](img_8.png)


### stream == true:
```bash
wrk -t8 -c100 -d10s -s post.lua http://localhost:8080/benchmark/v1/chat/completions
```
The tests below show that FastAPI's requests/sec for streaming responses is not high. Tests with different streaming durations ought to be added as well; the text used for streamed responses currently lives in `cache/chat`. No `TOKEN_RATE_LIMIT` was set for the results below; different `TOKEN_RATE_LIMIT` values can be applied to simulate real-world conditions.
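
For such a test, one way to measure the effective chunk rate under a given `TOKEN_RATE_LIMIT` is to count SSE lines from the benchmark route directly. A minimal sketch (assuming `httpx` is installed and the raw service is at http://localhost:8080):

```python
import time

import httpx  # assumption: pip install httpx

t0 = time.perf_counter()
n_lines = 0
# Stream the benchmark endpoint's SSE response and count the lines received
with httpx.stream(
    "POST",
    "http://localhost:8080/benchmark/v1/chat/completions",
    json={"stream": True},
    timeout=30.0,
) as resp:
    for line in resp.iter_lines():
        if line:
            n_lines += 1
elapsed = time.perf_counter() - t0
print(f"{n_lines} SSE lines in {elapsed:.2f}s -> {n_lines / elapsed:.1f}/s")
```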

Single core:

![img_10.png](img_10.png)

4 cores:

![img.png](img.png)

## Forwarding benchmark endpoint
> http://localhost:8000
> This part evaluates streaming and non-streaming forwarding performance.

Launch command: `OPENAI_BASE_URL=http://localhost:8080 aifd run --workers=n --port 8000` (n=1 or 4)
```bash
wrk -t8 -c100 -d10s -s post.lua http://localhost:8000/benchmark/v1/chat/completions
```

### stream == false:

**Single core**

![img_5.png](img_5.png)

**4 cores**: (4 cores on both the raw and the forwarding side)

![img_2.png](img_2.png)

### stream == true:

**Single core**: (yes, the forwarded results are even better than the raw ones, which is puzzling)

![img_4.png](img_4.png)

**4 cores**: (4 cores on both the raw and the forwarding side)

![img_3.png](img_3.png)



Binary file added Examples/benchmark/img.png
Binary file added Examples/benchmark/img_1.png
Binary file added Examples/benchmark/img_10.png
Binary file added Examples/benchmark/img_2.png
Binary file added Examples/benchmark/img_3.png
Binary file added Examples/benchmark/img_4.png
Binary file added Examples/benchmark/img_5.png
Binary file added Examples/benchmark/img_6.png
Binary file added Examples/benchmark/img_7.png
Binary file added Examples/benchmark/img_8.png
Binary file added Examples/benchmark/img_9.png
10 changes: 10 additions & 0 deletions Examples/benchmark/post.lua
@@ -0,0 +1,10 @@
wrk.method = "POST"
wrk.headers["Content-Type"] = "application/json"
wrk.body = '{"stream": false}'
-- wrk.body = '{"stream":true}'

-- wrk.timeout = 20000 -- in milliseconds

-- request = function()
-- return wrk.format(nil, nil, nil, wrk.body)
-- end
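
As a quick single-request sanity check of the same POST that `post.lua` issues under wrk, a minimal sketch with `httpx` (an assumption; any HTTP client would do, service assumed at http://localhost:8080):

```python
import httpx  # assumption: pip install httpx

resp = httpx.post(
    "http://localhost:8080/benchmark/v1/chat/completions",
    json={"stream": False},  # same body as post.lua
    timeout=20.0,
)
print(resp.status_code)
print(resp.text)  # OpenAI-style completion payload when the call succeeds
```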
67 changes: 67 additions & 0 deletions Examples/benchmark/run.py
@@ -0,0 +1,67 @@
import asyncio

import openai
from rich import print
from sparrow import MeasureTime, yaml_load  # pip install sparrow-python

config = yaml_load("config.yaml", rel_path=True)
print(f"{config=}")
openai.api_base = config["api_base"]
openai.api_key = config["api_key"]

stream = True
# stream = False

is_print = False


async def run(n):
    resp = await openai.ChatCompletion.acreate(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": '.'},
        ],
        stream=stream,
        request_timeout=60,
    )

    if stream:
        first_chunk = await anext(resp)
        if first_chunk is not None:
            if is_print:
                chunk_message = first_chunk['choices'][0]['delta']
                print(f"{chunk_message['role']}: ")
        async for chunk in resp:
            if is_print:
                chunk_message = chunk['choices'][0]['delta']
                content = chunk_message.get("content", "")
                print(content, end="")
        if is_print:
            print()
    else:
        if is_print:
            assistant_content = resp.choices[0].message.content
            print(assistant_content)
            print(resp.usage)

    print(f"Task {n} completed")


async def main():
    mt = MeasureTime().start()
    mean = 0
    epochs = 5
    for epoch in range(epochs):
        tasks = []
        for i in range(10):  # create 10 concurrent tasks
            task = asyncio.create_task(run(i))
            tasks.append(task)

        mt.start()
        await asyncio.gather(*tasks)
        cost = mt.show_interval(f"{epoch=}")
        mean += cost
    print(f"mean: {mean / epochs} s")


asyncio.run(main())
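
`run.py` (like `Examples/chat.py`) reads its endpoint and key from a sibling `config.yaml`, of which only the `api_base` and `api_key` fields are accessed. A minimal example of that file, with placeholder values (`api_base` should point at the forward service), might look like:

```yaml
api_base: http://localhost:8000/v1
api_key: sk-xxx
```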
32 changes: 22 additions & 10 deletions Examples/chat.py
@@ -1,13 +1,17 @@
import openai
from rich import print
-from sparrow import yaml_load
+from sparrow import yaml_load  # pip install sparrow-python

config = yaml_load("config.yaml", rel_path=True)
print(f"{config=}")
openai.api_base = config["api_base"]
openai.api_key = config["api_key"]

stream = True
# stream = False
+# debug = True
+debug = False

user_content = """
用c实现目前已知最快平方根算法
"""
@@ -16,21 +20,29 @@
    model="gpt-3.5-turbo",
    # model="gpt-4",
    messages=[
-        {"role": "user", "content": user_content},
+        {"role": "user", "content": 'hi'},
    ],
    stream=stream,
    request_timeout=30,
)

if stream:
-    chunk_message = next(resp)['choices'][0]['delta']
-    print(f"{chunk_message['role']}: ")
-    for chunk in resp:
-        chunk_message = chunk['choices'][0]['delta']
-        content = chunk_message.get("content", "")
-        print(content, end="")
-    print()
+    if debug:
+        for chunk in resp:
+            print(chunk)
+    else:
+        chunk_message = next(resp)['choices'][0]['delta']
+        print(f"{chunk_message['role']}: ")
+        for chunk in resp:
+            chunk_message = chunk['choices'][0]['delta']
+            content = chunk_message.get("content", "")
+            print(content, end="")
+        print()
else:
-    print(resp.choices)
+    print(resp)
+    assistant_content = resp.choices[0].message.content
+    print(assistant_content)
+    print(resp.usage)

"""
gpt-4:
4 changes: 2 additions & 2 deletions openai_forward/__init__.py
@@ -1,5 +1,5 @@
-__version__ = "0.5.2"
+__version__ = "0.5.3"

from dotenv import load_dotenv

-load_dotenv(override=False)
+load_dotenv('.env', override=False)
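
Worth noting about `override=False`: variables already present in the process environment win over values in `.env`. A small sketch to confirm the behavior (assumes a `.env` containing `LOG_CHAT=false`):

```python
import os

from dotenv import load_dotenv

os.environ["LOG_CHAT"] = "true"       # pre-existing environment variable
load_dotenv('.env', override=False)   # .env's LOG_CHAT=false does not overwrite it
print(os.environ["LOG_CHAT"])         # -> "true"
```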
6 changes: 1 addition & 5 deletions openai_forward/__main__.py
@@ -7,22 +7,18 @@

class Cli:
    @staticmethod
-    def run(port=8000, workers=1, log_chat=None):
+    def run(port=8000, workers=1):
        """
        Runs the application using the Uvicorn server.
        Args:
            port (int): The port number on which to run the server. Default is 8000.
            workers (int): The number of worker processes to run. Default is 1.
-            log_chat (str): whether to log llm chat. Default is None.
        Returns:
            None
        """

-        if log_chat:
-            os.environ["LOG_CHAT"] = log_chat
-
        if platform.system() == "Windows":
            os.environ["TZ"] = ""
