feat: Add benchmark mode (#73)
* Add benchmark route and test code

* Improvements

* Clean up code
KenyonY authored Sep 19, 2023
1 parent ea9579e commit 3ce781f
Showing 34 changed files with 697 additions and 120 deletions.
9 changes: 5 additions & 4 deletions .env
@@ -3,8 +3,7 @@
# `LOG_CHAT`: whether to log chat content
LOG_CHAT=false

-PRINT_CHAT=true
+PRINT_CHAT=false
# `OPENAI_BASE_URL`: base URL of any OpenAI-style service to forward; multiple values are allowed, separated by commas.
# If more than one is specified, then no OPENAI_ROUTE_PREFIX/EXTRA_ROUTE_PREFIX may be the root route /
OPENAI_BASE_URL=https://api.openai.com
@@ -15,6 +14,8 @@ OPENAI_ROUTE_PREFIX=
OPENAI_API_KEY=
FORWARD_KEY=

+CHAT_COMPLETION_ROUTE=/v1/chat/completions

# `EXTRA_BASE_URL`: any other service to forward
EXTRA_BASE_URL=
# `EXTRA_ROUTE_PREFIX`: route prefix matching EXTRA_BASE_URL
@@ -34,10 +35,10 @@ RATE_LIMIT_STRATEGY=moving-window


# Token rate limit for streamed responses
-TOKEN_RATE_LIMIT={"/v1/chat/completions":"40/second"}
+TOKEN_RATE_LIMIT={"/v1/chat/completions":"50/second"}


-TIMEOUT=300
+TIMEOUT=100

IP_BLACKLIST=

13 changes: 9 additions & 4 deletions .env.example
@@ -1,13 +1,19 @@
# LOG_CHAT: whether to enable logging
LOG_CHAT=true

+PRINT_CHAT=true

+BENCHMARK_MODE=true

# OPENAI_BASE_URL: base URL of any OpenAI-style service to forward; multiple values are allowed, separated by commas.
# If more than one is specified, then no OPENAI_ROUTE_PREFIX/EXTRA_ROUTE_PREFIX may be the root route /
OPENAI_BASE_URL='https://api.openai.com, http://localhost:8080'

# OPENAI_ROUTE_PREFIX: route prefixes for forwarding all OpenAI-style (logged) services
OPENAI_ROUTE_PREFIX='/openai, /localai'

+CHAT_COMPLETION_ROUTE=/openai/v1/chat/completions

# OPENAI_API_KEY: multiple API keys may be given, separated by commas, to form a round-robin pool
OPENAI_API_KEY='sk-xxx1, sk-xxx2, sk-xxx3'

@@ -30,7 +36,7 @@ REQ_RATE_LIMIT='{
}'

# `GLOBAL_RATE_LIMIT`: limits all routes not specified in `REQ_RATE_LIMIT`. Empty means no limit by default
-GLOBAL_RATE_LIMIT=2/5seconds
+GLOBAL_RATE_LIMIT=inf

# `RATE_LIMIT_STRATEGY` Options: (fixed-window, fixed-window-elastic-expiry, moving-window) ref: https://limits.readthedocs.io/en/latest/strategies.html
# `fixed-window`: most memory efficient strategy; `moving-window`: most effective for preventing bursts but higher memory cost.
@@ -40,11 +46,10 @@ RATE_LIMIT_STRATEGY=fixed-window
PROXY=http://localhost:7890

# `TOKEN_RATE_LIMIT`: token rate limit for each streamed response (note: a token here is not strictly a GPT token but an SSE chunk)
-TOKEN_RATE_LIMIT={"/v1/chat/completions":"20/second"}
+TOKEN_RATE_LIMIT={"/v1/chat/completions":"20/second", "/benchmark/v1/chat/completions":"500/second"}


-TIMEOUT=300
-CHAT_COMPLETION_ROUTE=/v1/chat/completions
+TIMEOUT=100

# Set the time zone
TZ=Asia/Shanghai
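
Since `TOKEN_RATE_LIMIT` caps SSE chunks per second, its effect can be observed by timing chunk arrivals on a streamed response. A minimal sketch using the same legacy `openai` 0.x client as `Examples/chat.py`, assuming the forward service is listening at http://localhost:8000:

```python
import time

import openai  # legacy 0.x client, as in Examples/chat.py

openai.api_base = "http://localhost:8000/v1"  # assumption: local forward service
openai.api_key = "sk-xxx"                     # placeholder key

t0 = time.perf_counter()
n_chunks = 0
resp = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    stream=True,
)
for _ in resp:  # each SSE chunk counts against TOKEN_RATE_LIMIT
    n_chunks += 1
elapsed = time.perf_counter() - t0
print(f"{n_chunks} chunks in {elapsed:.2f}s -> {n_chunks / elapsed:.1f} chunks/s")
```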
48 changes: 48 additions & 0 deletions Examples/benchmark/.env
@@ -0,0 +1,48 @@
# See .env.example for examples and explanations

# `LOG_CHAT`: whether to log chat content
LOG_CHAT=false
PRINT_CHAT=false

BENCHMARK_MODE=true

# `OPENAI_BASE_URL`: base URL of any OpenAI-style service to forward; multiple values are allowed, separated by commas.
# If more than one is specified, then no OPENAI_ROUTE_PREFIX/EXTRA_ROUTE_PREFIX may be the root route /
OPENAI_BASE_URL=https://xxx.xxx.xxx

# `OPENAI_ROUTE_PREFIX`: route prefixes for forwarding all OpenAI-style (logged) services
OPENAI_ROUTE_PREFIX=

OPENAI_API_KEY=
FORWARD_KEY=

CHAT_COMPLETION_ROUTE=/benchmark/v1/chat/completions

# `EXTRA_BASE_URL`: any other service to forward
EXTRA_BASE_URL=
# `EXTRA_ROUTE_PREFIX`: route prefix matching EXTRA_BASE_URL
EXTRA_ROUTE_PREFIX=

# `REQ_RATE_LIMIT`: per-user request rate limits for the specified routes
# format: {route: ratelimit-string}
# ratelimit-string format: [count] [per|/] [n (optional)] [second|minute|hour|day|month|year]; see https://limits.readthedocs.io/en/stable/quickstart.html#rate-limit-string-notation
REQ_RATE_LIMIT={"/healthz":"100/2minutes","/v1/chat/completions":"60/minute;600/hour"}

# `GLOBAL_RATE_LIMIT`: limits all routes not specified in `REQ_RATE_LIMIT`. Empty means no limit by default
GLOBAL_RATE_LIMIT=

# `RATE_LIMIT_STRATEGY` Options: (fixed-window, fixed-window-elastic-expiry, moving-window) ref: https://limits.readthedocs.io/en/latest/strategies.html
# `fixed-window`: most memory efficient strategy; `moving-window`: most effective for preventing bursts but higher memory cost.
RATE_LIMIT_STRATEGY=moving-window


# Token rate limit for streamed responses
TOKEN_RATE_LIMIT={"/v1/chat/completions":"50/second", "/benchmark/v1/chat/completions":"30/second"}


TIMEOUT=5

IP_BLACKLIST=

# Set the time zone
TZ=Asia/Shanghai
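
The rate-limit strings above follow the notation of the `limits` library that the comments reference. As a hedged standalone illustration (not how openai-forward wires it internally), checking a "60/minute" limit with the moving-window strategy:

```python
# pip install limits  (the library referenced in the comments above)
from limits import parse
from limits.storage import MemoryStorage
from limits.strategies import MovingWindowRateLimiter

limiter = MovingWindowRateLimiter(MemoryStorage())
item = parse("60/minute")  # same notation as the REQ_RATE_LIMIT values

# hit() registers one request for this identifier and returns False
# once the window is exhausted
for i in range(61):
    allowed = limiter.hit(item, "user-123")
print(allowed)  # -> False on the 61st hit within the same minute
```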
71 changes: 71 additions & 0 deletions Examples/benchmark/README.md
@@ -0,0 +1,71 @@


## Raw benchmark endpoint test
> http://localhost:8080
> This part evaluates the streaming and non-streaming response performance of `fastapi` itself, without any forwarding.

Launch command: `BENCHMARK_MODE=true aifd run --workers=n --port 8080` (n=1 or 4)



### stream == false:
```bash
wrk -t8 -c400 -d10s -s post.lua http://localhost:8080/benchmark/v1/chat/completions
```
Single core:

![img_7.png](img_7.png)


4 cores:

![img_8.png](img_8.png)


### stream == true:
```bash
wrk -t8 -c100 -d10s -s post.lua http://localhost:8080/benchmark/v1/chat/completions
```
The tests below show that FastAPI's requests/sec for streaming responses is not high. Tests with different streaming durations ought to be added as well; the text used for streamed responses currently lives in `cache/chat`. No `TOKEN_RATE_LIMIT` was set for the results below; different `TOKEN_RATE_LIMIT` values can be applied to simulate real-world conditions.
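
For such a test, one way to measure the effective chunk rate under a given `TOKEN_RATE_LIMIT` is to count SSE lines from the benchmark route directly. A minimal sketch (assuming `httpx` is installed and the raw service is at http://localhost:8080):

```python
import time

import httpx  # assumption: pip install httpx

t0 = time.perf_counter()
n_lines = 0
# Stream the benchmark endpoint's SSE response and count the lines received
with httpx.stream(
    "POST",
    "http://localhost:8080/benchmark/v1/chat/completions",
    json={"stream": True},
    timeout=30.0,
) as resp:
    for line in resp.iter_lines():
        if line:
            n_lines += 1
elapsed = time.perf_counter() - t0
print(f"{n_lines} SSE lines in {elapsed:.2f}s -> {n_lines / elapsed:.1f}/s")
```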

Single core:

![img_10.png](img_10.png)

4 cores:

![img.png](img.png)

## Forwarding benchmark endpoint
> http://localhost:8000
> This part evaluates streaming and non-streaming forwarding performance.

Launch command: `OPENAI_BASE_URL=http://localhost:8080 aifd run --workers=n --port 8000` (n=1 or 4)
```bash
wrk -t8 -c100 -d10s -s post.lua http://localhost:8000/benchmark/v1/chat/completions
```

### stream == false:

**Single core**

![img_5.png](img_5.png)

**4 cores**: (4 cores on both the raw and the forwarding side)

![img_2.png](img_2.png)

### stream == true:

**Single core**: (yes, the forwarded results are even better than the raw ones, which is puzzling)

![img_4.png](img_4.png)

**4 cores**: (4 cores on both the raw and the forwarding side)

![img_3.png](img_3.png)



Binary file added Examples/benchmark/img.png
Binary file added Examples/benchmark/img_1.png
Binary file added Examples/benchmark/img_10.png
Binary file added Examples/benchmark/img_2.png
Binary file added Examples/benchmark/img_3.png
Binary file added Examples/benchmark/img_4.png
Binary file added Examples/benchmark/img_5.png
Binary file added Examples/benchmark/img_6.png
Binary file added Examples/benchmark/img_7.png
Binary file added Examples/benchmark/img_8.png
Binary file added Examples/benchmark/img_9.png
10 changes: 10 additions & 0 deletions Examples/benchmark/post.lua
@@ -0,0 +1,10 @@
wrk.method = "POST"
wrk.headers["Content-Type"] = "application/json"
wrk.body = '{"stream": false}'
-- wrk.body = '{"stream":true}'

-- wrk.timeout = 20000 -- in milliseconds

-- request = function()
-- return wrk.format(nil, nil, nil, wrk.body)
-- end
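
As a quick single-request sanity check of the same POST that `post.lua` issues under wrk, a minimal sketch with `httpx` (an assumption; any HTTP client would do, service assumed at http://localhost:8080):

```python
import httpx  # assumption: pip install httpx

resp = httpx.post(
    "http://localhost:8080/benchmark/v1/chat/completions",
    json={"stream": False},  # same body as post.lua
    timeout=20.0,
)
print(resp.status_code)
print(resp.text)  # OpenAI-style completion payload when the call succeeds
```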
67 changes: 67 additions & 0 deletions Examples/benchmark/run.py
@@ -0,0 +1,67 @@
import asyncio

import openai
from rich import print
from sparrow import MeasureTime, yaml_load  # pip install sparrow-python

config = yaml_load("config.yaml", rel_path=True)
print(f"{config=}")
openai.api_base = config["api_base"]
openai.api_key = config["api_key"]

stream = True
# stream = False

is_print = False


async def run(n):
    resp = await openai.ChatCompletion.acreate(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": '.'},
        ],
        stream=stream,
        request_timeout=60,
    )

    if stream:
        first_chunk = await anext(resp)
        if first_chunk is not None:
            if is_print:
                chunk_message = first_chunk['choices'][0]['delta']
                print(f"{chunk_message['role']}: ")
        async for chunk in resp:
            if is_print:
                chunk_message = chunk['choices'][0]['delta']
                content = chunk_message.get("content", "")
                print(content, end="")
        if is_print:
            print()
    else:
        if is_print:
            assistant_content = resp.choices[0].message.content
            print(assistant_content)
            print(resp.usage)

    print(f"Task {n} completed")


async def main():
    mt = MeasureTime().start()
    mean = 0
    epochs = 5
    for epoch in range(epochs):
        tasks = []
        for i in range(10):  # create 10 concurrent tasks
            task = asyncio.create_task(run(i))
            tasks.append(task)

        mt.start()
        await asyncio.gather(*tasks)
        cost = mt.show_interval(f"{epoch=}")
        mean += cost
    print(f"mean: {mean / epochs} s")


asyncio.run(main())
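
`run.py` (like `Examples/chat.py`) reads its endpoint and key from a sibling `config.yaml`, of which only the `api_base` and `api_key` fields are accessed. A minimal example of that file, with placeholder values (`api_base` should point at the forward service), might look like:

```yaml
api_base: http://localhost:8000/v1
api_key: sk-xxx
```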
32 changes: 22 additions & 10 deletions Examples/chat.py
@@ -1,13 +1,17 @@
import openai
from rich import print
-from sparrow import yaml_load
+from sparrow import yaml_load  # pip install sparrow-python

config = yaml_load("config.yaml", rel_path=True)
print(f"{config=}")
openai.api_base = config["api_base"]
openai.api_key = config["api_key"]

stream = True
# stream = False
+# debug = True
+debug = False

user_content = """
用c实现目前已知最快平方根算法
"""
@@ -16,21 +20,29 @@
    model="gpt-3.5-turbo",
    # model="gpt-4",
    messages=[
-        {"role": "user", "content": user_content},
+        {"role": "user", "content": 'hi'},
    ],
    stream=stream,
    request_timeout=30,
)

if stream:
-    chunk_message = next(resp)['choices'][0]['delta']
-    print(f"{chunk_message['role']}: ")
-    for chunk in resp:
-        chunk_message = chunk['choices'][0]['delta']
-        content = chunk_message.get("content", "")
-        print(content, end="")
-    print()
+    if debug:
+        for chunk in resp:
+            print(chunk)
+    else:
+        chunk_message = next(resp)['choices'][0]['delta']
+        print(f"{chunk_message['role']}: ")
+        for chunk in resp:
+            chunk_message = chunk['choices'][0]['delta']
+            content = chunk_message.get("content", "")
+            print(content, end="")
+        print()
else:
-    print(resp.choices)
+    print(resp)
+    assistant_content = resp.choices[0].message.content
+    print(assistant_content)
+    print(resp.usage)

"""
gpt-4:
4 changes: 2 additions & 2 deletions openai_forward/__init__.py
@@ -1,5 +1,5 @@
-__version__ = "0.5.2"
+__version__ = "0.5.3"

from dotenv import load_dotenv

-load_dotenv(override=False)
+load_dotenv('.env', override=False)
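
Worth noting about `override=False`: variables already present in the process environment win over values in `.env`. A small sketch to confirm the behavior (assumes a `.env` containing `LOG_CHAT=false`):

```python
import os

from dotenv import load_dotenv

os.environ["LOG_CHAT"] = "true"       # pre-existing environment variable
load_dotenv('.env', override=False)   # .env's LOG_CHAT=false does not overwrite it
print(os.environ["LOG_CHAT"])         # -> "true"
```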
6 changes: 1 addition & 5 deletions openai_forward/__main__.py
@@ -7,22 +7,18 @@

class Cli:
    @staticmethod
-    def run(port=8000, workers=1, log_chat=None):
+    def run(port=8000, workers=1):
        """
        Runs the application using the Uvicorn server.
        Args:
            port (int): The port number on which to run the server. Default is 8000.
            workers (int): The number of worker processes to run. Default is 1.
-            log_chat (str): whether to log llm chat. Default is None.
        Returns:
            None
        """

-        if log_chat:
-            os.environ["LOG_CHAT"] = log_chat
-
        if platform.system() == "Windows":
            os.environ["TZ"] = ""
