feat: Support more flexible configuration options (#129)
* feat: Support more flexible configuration options

* Add gpt4o model

* feat: In general mode, model-level control is now also supported
KenyonY committed Jul 20, 2024
1 parent 2d0c4e8 commit a628a7a
Showing 19 changed files with 425 additions and 181 deletions.
20 changes: 16 additions & 4 deletions .env
@@ -8,6 +8,8 @@ LOG_OPENAI=true
 CACHE_GENERAL=true
 CACHE_OPENAI=true
 
+CHAT_COMPLETION_ROUTE=/v1/chat/completions
+#CUSTOM_GENERAL_ROUTE=/v1/models/gemini-pro
 CACHE_ROUTES=["/v1/chat/completions","/v1/embeddings"]
 
 # `CACHE_BACKEND`: Options (MEMORY, LMDB, LevelDB)
@@ -16,18 +18,21 @@ CACHE_BACKEND=MEMORY
 
 DEFAULT_REQUEST_CACHING_VALUE=false
 
-#BENCHMARK_MODE=true
+BENCHMARK_MODE=true
 
 FORWARD_CONFIG=[{"base_url":"https://api.openai.com","route":"/","type":"openai"}]
 
 #LEVEL_MODELS={"1": ["gpt-4"], "2": ["gpt-3.5-turbo"]}
 #OPENAI_API_KEY={"sk-xxx": [0], "sk-xxx": [1], "sk-xxx": [1,2]}
-#FORWARD_KEY={"fk-0": 0, "fk-1": 1, "fk-2": 2}
+#FORWARD_KEY={"fk-0": 0, "fk-1": 1, "fk-2": 2, "default": 1}
 
 # `REQ_RATE_LIMIT`: i.e., Request rate limit for specified routes, user specific
 # format: {route: ratelimit-string}
 # ratelimit-string format [count] [per|/] [n (optional)] [second|minute|hour|day|month|year] :ref:`ratelimit-string`: https://limits.readthedocs.io/en/stable/quickstart.html#rate-limit-string-notation
-REQ_RATE_LIMIT={"/v1/chat/completions":"100/2minutes","/v1/completions":"60/minute;600/hour"}
+REQ_RATE_LIMIT='{
+"/v1/chat/completions":[{"level":0,"limit":"100/2minutes"}],
+"/v1/completions":[{"level":0,"limit":"60/minute;600/hour"}]
+}'
 
 # Backend for rate limiting: [memory, redis, memcached, ...] :ref: https://limits.readthedocs.io/en/stable/storage.html#
 #REQ_RATE_LIMIT_BACKEND=redis://localhost:6379
@@ -40,7 +45,11 @@ GLOBAL_RATE_LIMIT=200/minute
 RATE_LIMIT_STRATEGY=moving-window
 
 # Rate limit for returned tokens
-TOKEN_RATE_LIMIT={"/v1/chat/completions":"60/second","/v1/completions":"60/second"}
+TOKEN_RATE_LIMIT='{
+"/v1/chat/completions":[{"level":0,"limit":"60/second"}],
+"/v1/completions":[{"level":0,"limit":"60/second"}],
+"/benchmark/v1/chat/completions":[{"level":0,"limit":"20/second"}]
+}'
 
 # TCP connection timeout duration (in seconds)
 TIMEOUT=6
@@ -49,6 +58,9 @@ ITER_CHUNK_TYPE=one-by-one
 #ITER_CHUNK_TYPE=efficiency
 
 #IP_BLACKLIST=
+WEBUI_RESTART_PORT=15555
+WEBUI_LOG_PORT=15556
+
 DEFAULT_STREAM_RESPONSE=true
 # Set timezone
 TZ=Asia/Shanghai
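
The new leveled rate-limit format ties in with `FORWARD_KEY` and `LEVEL_MODELS`: each forward key maps to a level, and each level maps to its own model set and limits. A minimal sketch of how such a configuration could be resolved (the parsing below is illustrative, not the project's actual loader):

```python
import json

# Values as they would appear in .env; parsing shown here is an assumption.
FORWARD_KEY = json.loads('{"fk-0": 0, "fk-1": 1, "fk-2": 2, "default": 1}')
LEVEL_MODELS = json.loads('{"1": ["gpt-4"], "2": ["gpt-3.5-turbo"]}')
TOKEN_RATE_LIMIT = json.loads(
    '{"/v1/chat/completions":[{"level":0,"limit":"60/second"}]}'
)

def limits_for(forward_key: str, route: str):
    """Resolve a forward key's level, then pick that level's models and limit."""
    level = FORWARD_KEY.get(forward_key, FORWARD_KEY.get("default", 0))
    allowed_models = LEVEL_MODELS.get(str(level))  # None means unrestricted
    rules = TOKEN_RATE_LIMIT.get(route, [])
    limit = next((r["limit"] for r in rules if r["level"] == level), None)
    return level, allowed_models, limit

print(limits_for("fk-0", "/v1/chat/completions"))
# -> (0, None, '60/second')
```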
6 changes: 5 additions & 1 deletion .env.example
@@ -6,7 +6,7 @@ LOG_OPENAI=true
 CACHE_GENERAL=true
 CACHE_OPENAI=true
 
-
+CHAT_COMPLETION_ROUTE=/v1/chat/completions
 # `CACHE_BACKEND`: Options (MEMORY, LMDB, LevelDB)
 CACHE_BACKEND=LMDB
 CACHE_ROOT_PATH_OR_URL=./FLAXKV_DB
@@ -55,5 +55,9 @@ TIMEOUT=10
 ITER_CHUNK_TYPE=efficiency
 #ITER_CHUNK_TYPE=one-by-one
 
+WEBUI_RESTART_PORT=15555
+WEBUI_LOG_PORT=15556
+
+DEFAULT_STREAM_RESPONSE=true
 # Set timezone
 TZ=Asia/Shanghai
49 changes: 49 additions & 0 deletions .github/workflows/docs.yml
@@ -0,0 +1,49 @@
+name: website
+
+# build the documentation whenever there are new commits on main
+on:
+  push:
+    branches:
+      - main
+  # Alternative: only build for tags.
+  # tags:
+  #   - '*'
+
+# security: restrict permissions for CI jobs.
+permissions:
+  contents: read
+
+jobs:
+  # Build the documentation and upload the static HTML files as an artifact.
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      # ADJUST THIS: install all dependencies (including pdoc)
+      - run: pip install -e .
+      # ADJUST THIS: build your documentation into docs/.
+      # We use a custom build script for pdoc itself, ideally you just run `pdoc -o docs/ ...` here.
+      - run: python docs/make.py
+
+      - uses: actions/upload-pages-artifact@v3
+        with:
+          path: docs/
+
+  # Deploy the artifact to GitHub pages.
+  # This is a separate job so that only actions/deploy-pages has the necessary permissions.
+  deploy:
+    needs: build
+    runs-on: ubuntu-latest
+    permissions:
+      pages: write
+      id-token: write
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - id: deployment
+        uses: actions/deploy-pages@v4
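
The workflow runs `python docs/make.py` to build the site, but that script is not part of this diff. A plausible minimal version using pdoc's Python API might look like this (the module name and output path are assumptions based on the workflow above):

```python
# Hypothetical docs/make.py: render the package's API reference into docs/
# so the workflow's upload-pages-artifact step can publish it.
from pathlib import Path

import pdoc

if __name__ == "__main__":
    out_dir = Path(__file__).parent  # docs/, matching the artifact path above
    pdoc.pdoc("openai_forward", output_directory=out_dir)
```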
1 change: 0 additions & 1 deletion .gitignore
@@ -1,7 +1,6 @@
 .github/release-template.ejs
-.github/workflows/doc.yml
 
 scripts/release.sh
 node_modules
 package-lock.json
 package.json
41 changes: 9 additions & 32 deletions README.md
@@ -163,37 +163,6 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
 )
 ```
 
-<details>
-<summary>More</summary>
-
-#### Use in third-party applications
-
-Integration based on the open-source project [ChatGPT-Next-Web](https://github.com/Yidadaa/ChatGPT-Next-Web):
-Replace `BASE_URL` in the docker startup command with the address of your self-hosted proxy service
-
-```bash
-docker run -d \
-    -p 3000:3000 \
-    -e OPENAI_API_KEY="sk-******" \
-    -e BASE_URL="https://api.openai-forward.com" \
-    -e CODE="******" \
-    yidadaa/chatgpt-next-web
-```
-
-**Image Generation (DALL-E)**
-
-```bash
-curl --location 'https://api.openai-forward.com/v1/images/generations' \
---header 'Authorization: Bearer sk-******' \
---header 'Content-Type: application/json' \
---data '{
-    "prompt": "A photo of a cat",
-    "n": 1,
-    "size": "512x512"
-}'
-```
-
-</details>
 
 ### Proxying local models
 
@@ -207,7 +176,15 @@ curl --location 'https://api.openai-forward.com/v1/images/generations' \
 
 (More)
 
-### Proxying other cloud models
+### Proxying any cloud model
+
+#### Proxying [gemini pro](https://ai.google.dev/)
+Configure environment variables or the .env file as follows:
+```env
+FORWARD_CONFIG=[{"base_url":"https://generativelanguage.googleapis.com","route":"/gemini","type":"general"}]
+```
+Note: after starting with `aifd run`, gemini pro can be used by visiting http://localhost:8000/gemini.
+
 
 - **Scenario 1:**
   With general forwarding, services from any source can be forwarded,
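
With the general-mode route above, requests are passed through to Google's API. A hypothetical call through the proxy (the endpoint path and payload follow Google's public `generateContent` API; that the query-string key is forwarded unchanged is an assumption):

```python
import os

import requests

# Call gemini pro via the forwarded /gemini route from the FORWARD_CONFIG above.
url = "http://localhost:8000/gemini/v1beta/models/gemini-pro:generateContent"
params = {"key": os.environ["GEMINI_API_KEY"]}  # placeholder; forwarded to Google
payload = {"contents": [{"parts": [{"text": "Hello, who are you?"}]}]}

resp = requests.post(url, params=params, json=payload, timeout=30)
resp.raise_for_status()
print(resp.json()["candidates"][0]["content"]["parts"][0]["text"])
```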
4 changes: 4 additions & 0 deletions deploy.md
@@ -46,6 +46,10 @@ pip install openai-forward
 ```bash
 aifd run
 ```
+Or run the web UI:
+```bash
+aifd run --webui
+```
 The service is now set up.
 For configuration, see [Configuration](README.md#配置).
17 changes: 16 additions & 1 deletion openai_forward/__init__.py
@@ -1,5 +1,20 @@
-__version__ = "0.7.2"
+__version__ = "0.8.0-alpha"
 
 from dotenv import load_dotenv
+from yaml import load
 
+
+def yaml_load(filepath):
+
+    try:
+        from yaml import CLoader as Loader
+    except ImportError:
+        from yaml import Loader
+    with open(filepath, mode='r', encoding="utf-8") as stream:
+        # stream = stream.read()
+        content = load(stream, Loader=Loader)
+    return content
+
+
+# yaml_load()
 load_dotenv('.env', override=False)
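
The new `yaml_load` helper prefers libyaml's C-accelerated loader and falls back to the pure-Python loader when the extension is unavailable. A small usage sketch (the filename is only an example):

```python
from openai_forward import yaml_load

# Parse any YAML file into Python objects; the path here is hypothetical.
config = yaml_load("openai-forward-config.yaml")
print(type(config))  # typically dict for a mapping-style config
```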
27 changes: 23 additions & 4 deletions openai_forward/__main__.py
@@ -66,15 +66,15 @@ def run(self, port=8000, workers=1, webui=False, start_ui=True, ui_port=8001):
 
         import zmq
 
-        mq_port = 15555
-
         os.environ['OPENAI_FORWARD_WEBUI'] = 'true'
 
         context = zmq.Context()
         socket = context.socket(zmq.REP)
-        socket.bind(f"tcp://*:{mq_port}")
+        restart_port = int(os.environ.get('WEBUI_RESTART_PORT', 15555))
+        socket.bind(f"tcp://*:{restart_port}")
         log_socket = context.socket(zmq.ROUTER)
-        log_socket.bind(f"tcp://*:{15556}")
+        log_port = int(os.environ.get("WEBUI_LOG_PORT", 15556))
+        log_socket.bind(f"tcp://*:{log_port}")
         subscriber_info = {}
 
         def mq_worker(log_socket: zmq.Socket):
@@ -227,6 +227,25 @@ def convert(log_folder: str = None, target_path: str = None):
         convert_folder_to_jsonl(log_folder, target_path)
         print(60 * '-')
 
+    @staticmethod
+    def gen_config(dir: str = "."):
+        """
+        Generates a .env file in the specified directory.
+        """
+        from pathlib import Path
+
+        from openai_forward.config.interface import Config
+
+        config = Config()
+        env_dict = config.convert_to_env(set_env=False)
+        dir = Path(dir)
+
+        with open(dir / ".env", "w") as f:
+            env_content = "\n".join(
+                [f"{key}={value}" for key, value in env_dict.items()]
+            )
+            f.write(env_content)
+
 
 def main():
     fire.Fire(Cli)
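
With the ports now configurable, a web UI process can reach the restart socket (REP) and log socket (ROUTER) on whatever ports the environment sets. A hypothetical client for the restart socket might look like this (only the socket types and ports come from the diff; the message payload is an assumption):

```python
import os

import zmq

# Sketch of a client for the REP restart socket bound in `run` above.
context = zmq.Context()
socket = context.socket(zmq.REQ)
restart_port = int(os.environ.get("WEBUI_RESTART_PORT", 15555))
socket.connect(f"tcp://localhost:{restart_port}")

socket.send(b"restart")  # payload format is assumed, not shown in the diff
reply = socket.recv()    # REQ/REP requires waiting for the acknowledgement
print(reply)
```

The new `gen_config` command writes the current `Config` defaults out as a `.env` file; through the `fire.Fire(Cli)` entry point it should be invocable as something like `aifd gen-config` (command spelling assumed from fire's conventions).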
14 changes: 9 additions & 5 deletions openai_forward/cache/chat/chat_completions.py
@@ -9,9 +9,13 @@
 from fastapi import Request
 from fastapi.responses import Response, StreamingResponse
 
-from ...decorators import async_random_sleep, async_token_rate_limit
+from ...decorators import (
+    async_random_sleep,
+    async_token_rate_limit_auth_level,
+    random_sleep,
+)
 from ...helper import get_unique_id
-from ...settings import token_interval_conf
+from ...settings import FWD_KEY, token_interval_conf
 from .tokenizer import TIKTOKEN_VALID, count_tokens, encode_as_pieces
 
 
@@ -118,7 +122,7 @@ class ChatCompletionsResponse:
 sentences = cycle(corpus)
 
 
-@async_token_rate_limit(token_interval_conf)
+@async_token_rate_limit_auth_level(token_interval_conf, FWD_KEY)
 async def stream_generate(
     model: str, content: str | None, tool_calls: list | None, request: Request
 ):
@@ -206,7 +210,7 @@ def serialize_delta(
     yield b'data: [DONE]\n\n'
 
 
-@async_token_rate_limit(token_interval_conf)
+@async_token_rate_limit_auth_level(token_interval_conf, FWD_KEY)
 async def stream_generate_efficient(
     model: str, content: str | None, tool_calls: list | None, request: Request
 ):
@@ -292,6 +296,7 @@ def serialize_delta(
     yield b'data: [DONE]\n\n'
 
 
+@random_sleep(min_time=1, max_time=2)
 def generate(model: str, content: str | None, tool_calls: list | None, usage: dict):
     created = int(time.time())
     id = f"chatcmpl-{get_unique_id()}"
@@ -336,7 +341,6 @@ def model_inference(model: str, messages: List):
     return ModelInferResult(content=sentence, usage=usage)
 
 
-@async_random_sleep(min_time=0, max_time=1)
 async def chat_completions_benchmark(request: Request):
     payload = await request.json()
     model = payload.get("model", 'robot')
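
The switch from `async_token_rate_limit` to `async_token_rate_limit_auth_level` threads the forward-key table into the pacing logic, so the per-token interval can depend on the caller's level rather than being global. The decorator's body is not part of this excerpt; a simplified sketch of the idea (the lookup structures and header handling are assumptions):

```python
import asyncio
import functools
import time

def async_token_rate_limit_auth_level(interval_conf: dict, fwd_key: dict):
    """Sketch: pace an async token stream with an interval chosen by the
    caller's forward-key level (all lookup shapes here are assumed)."""
    def decorator(agen_func):
        @functools.wraps(agen_func)
        async def wrapper(*args, **kwargs):
            # Find the request to read the caller's forward key from.
            request = kwargs.get("request") or next(
                (a for a in args if hasattr(a, "headers")), None
            )
            key = ""
            if request is not None:
                auth = request.headers.get("Authorization", "")
                key = auth.removeprefix("Bearer ")
            level = fwd_key.get(key, fwd_key.get("default", 0))
            interval = interval_conf.get(level, 0.0)  # seconds between chunks

            last = time.monotonic()
            async for chunk in agen_func(*args, **kwargs):
                if interval > 0:
                    wait = interval - (time.monotonic() - last)
                    if wait > 0:
                        await asyncio.sleep(wait)
                    last = time.monotonic()
                yield chunk
        return wrapper
    return decorator
```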
6 changes: 3 additions & 3 deletions openai_forward/cache/chat/response.py
@@ -7,10 +7,10 @@
 from flaxkv.pack import encode
 from loguru import logger
 
-from ...settings import CACHE_OPENAI
+from ...settings import CACHE_OPENAI, FWD_KEY
 from ..database import db_dict
 from .chat_completions import (
-    async_token_rate_limit,
+    async_token_rate_limit_auth_level,
     generate,
     stream_generate_efficient,
     token_interval_conf,
@@ -116,7 +116,7 @@ def get_cached_chat_response(payload_info, valid_payload, request, **kwargs):
     return None, cache_key
 
 
-@async_token_rate_limit(token_interval_conf)
+@async_token_rate_limit_auth_level(token_interval_conf, FWD_KEY)
 async def stream_generate(buffer_list: List, request):
     for buffer in buffer_list:
         yield buffer