baidubce · MrChengmo · Oct 26, 2024 · Oct 16, 2024 · Oct 16, 2024 · Oct 16, 2024
diff --git a/appbuilder/__init__.py b/appbuilder/__init__.py
@@ -113,7 +113,7 @@ def get_default_header():
 from .core.components.handwrite_ocr.component import HandwriteOCR
 from .core.components.image_understand.component import ImageUnderstand
 from .core.components.mix_card_ocr.component import MixCardOCR
-
+from .core.components.document_understanding.component import DocumentUnderstanding
 __COMPONENTS__ = [
     "RagWithBaiduSearchPro",
     "RAGWithBaiduSearch",
@@ -163,6 +163,7 @@ def get_default_header():
     "HandwriteOCR",
     "ImageUnderstand",
     "MixCardOCR",
+    "DocumentUnderstanding",
 ] # NOQA
 
 from appbuilder.core.message import Message

diff --git a/appbuilder/core/components/document_understanding/README.md b/appbuilder/core/components/document_understanding/README.md
@@ -0,0 +1,96 @@
+# 长文档内容理解（DocumentUnderstanding）
+
+## 简介
+长文档内容理解组件（DocumentUnderstanding）支持对图片以及文档内容进行理解，并基于图片以及文档内容对用户的提问进行回答，
+包括但不限于文档内容问答、总结摘要、内容分析。
+### 功能介绍
+根据用户上传的文档（支持txt、docx、pdf、xlsx、png、jpg、jpeg等多种格式）、query、指令生成大模型答案
+### 特色优势
+处理长上下文的大模型内容理解任务
+### 应用场景
+长上下文的文档问答
+
+## 基本用法
+### 快速开始
+
+```python
+
+import os
+import appbuilder
+
+# 请前往千帆AppBuilder官网创建密钥，流程详见：https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5
+APPBUILDER_TOKEN = "YOUR-TOKEN"
+os.environ["APPBUILDER_TOKEN"] = APPBUILDER_TOKEN
+du = appbuilder.DocumentUnderstanding()
+query = appbuilder.Message("这篇文档讲了什么")
+instruction = "请根据文档内容回答问题，用一句话简短概括"
+addition_instruction = "用一句话简短概括" ##用户增强指令，可选填，该内容会进一步增强大模型的指令跟随能力，将你最需要增强效果的指令填于此，内容可以与上述的"instruction"基础指令有重复，注意：该字段内容过多会一定程度影响大模型内容严谨度，请注意控制该字段的指令字数
+app_id = "YOUR-APP-ID" ##你需要在系统上自己的账号下（https://qianfan.cloud.baidu.com/appbuilder）创建任意空Agent，并获取该Agent的app_id（即界面上的应用ID，在首页->个人空间->应用 里面即会显示应用ID），这里任意空Agent就可以，无需任何配置信息，这个agent的作用只是为了获取app_id信息
+file_path = "YOUR-FILE-PATH" ##填写你的本地待分析文件路径
+stream = False ##是否开启流式输出功能
+response_ = du.run(query, 
+                   file_path, 
+                   instruction=instruction, 
+                   addition_instruction=addition_instruction, 
+                   app_id=app_id,
+                   stream=stream)
+
+for result in response_:
+    print(result) ##打印输出的大模型答案
+```
+
+
+## 参数说明
+### 鉴权说明
+使用组件之前，请首先申请并设置鉴权参数，可参考[组件使用流程](https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5)。
+```python
+# 设置环境中的TOKEN，以下示例略
+import os
+os.environ['APPBUILDER_TOKEN'] = 'bce-YOURTOKEN'
+```
+
+
+### 初始化参数
+
+| 参数名称 | 参数类型 | 是否必须 | 描述 | 示例值 |
+| ------- | ------- | -------- | -------- | -------- |
+| `secret_key` | str | 否 | 用户鉴权token，默认从环境变量中获取: `os.getenv("APPBUILDER_TOKEN", "")` | bce-v3/XXX |
+| `gateway` | str | 否 | 后端网关服务地址，默认从环境变量中获取: `os.getenv("GATEWAY_URL", "")` | https://appbuilder.baidu.com |
+| `lazy_certification` | bool | 否 | 延迟认证，为True时在第一次运行时认证。默认为False。 | False |
+
+### 调用参数
+
+| 参数名称                   | 参数类型 | 是否必须 | 描述                                                                       | 示例值                         |
+|------------------------|------|------|--------------------------------------------------------------------------|-----------------------------|
+| `message`              | obj  | 是    | 输入消息，用户输入query。                                                          | Message(content=input_data) |
+| `file_path`            | str  | 是    | 用户需要分析的文档                                                                | "test.pdf"                  |
+| `app_id`               | str  | 是    | 你需要在系统上自己的账号下（https://qianfan.cloud.baidu.com/appbuilder）创建任意空Agent，并获取该Agent的app_id（即界面上的应用ID，在首页->个人空间->应用 里面即会显示应用ID），这里任意空Agent就可以，无需任何配置信息，这个agent的作用只是为了获取app_id信息 | "YOUR-APP-ID"               |
+| `instruction`          | str  | 否    | 用户指令                                                                     | "你的回答要严谨客观，且答案一定要分点阐述"      |
+| `addition_instruction` | str  | 否    | 用户增强指令，可选填，该内容会进一步增强大模型的指令跟随能力，将你最需要增强效果的指令填于此，注意：该字段内容过多会一定程度影响大模型内容严谨度 | "你的答案需要分点阐述"                |
+| `stream`               | bool | 否    | 是否开启流式输出，默认为False                                                        | False                       |
+
+### 响应参数
+| 参数名称 | 参数类型 | 描述 | 示例值 |
+| ------- |------| -------- | -------- |
+| `result` | str  | 模型运行后的输出结果 | "" |
+
+### 响应示例-流式输出
+```
+data: {"type": "text", "text": "文件解析完成, 耗时13485.63ms\n\n"} request_id: f99a7230-649f-4170-ade7-62d8368a18e6
+data: {"type": "text", "text": "**Human", "event_status": "running"} request_id: f99a7230-649f-4170-ade7-62d8368a18e6
+data: {"type": "text", "text": "-Timescale Adaptation in an Open-Ended Task Space** 文档详细介绍了DeepMind团队开发的自适应代理（Adaptive Agent，简称", "event_status": "running"} request_id: f99a7230-649f-4170-ade7-62d8368a18e6
+data: {"type": "text", "text": "AdA）在开放任务空间中的快速适应能力。", "event_status": "running"} request_id: f99a7230-649f-4170-ade7-62d8368a18e6
+data: {"type": "text", "text": "", "event_status": "done"} request_id: f99a7230-649f-4170-ade7-62d8368a18e6
+```
+
+### 响应示例-非流式输出
+```
+{'code': 0, 'message': '', 'result': {'text': '文件解析完成, 耗时14572.57ms\n\n**Human-Timescale Adaptation in an Open-Ended Task Space** 文档详细介绍了DeepMind团队开发的自适应代理（Adaptive Agent，简称AdA）在开放任务空间中的快速适应能力。以下是文档的主要内容和贡献点：\n\n1. **引言**：\n   - 强调了快速适应能力对于人工智能的重要性，特别是在现实世界中的应用和与人类互动的场景中。\n   - 提出了通过元强化学习（meta-RL）和自动课程学习（auto-curriculum learning）等方法，训练能够在未见过的环境中快速适应的代理。\n\n2. **自适应代理（AdA）**：\n   - 介绍了AdA的设计和训练方法，包括其在开放任务空间中的适应行为、记忆架构、以及如何通过自动课程学习来优化训练过程。\n   - 展示了AdA能够在几分钟内解决复杂的3D任务，且不需要进一步的代理训练，显示了其快速适应的能力。\n\n3. **实验与结果**：\n   - 在多个方面评估了AdA的性能，包括其在单代理和多代理设置下的适应能力、不同架构和课程学习方法的影响、以及模型大小和记忆长度对性能的影响。\n   - 通过与人类玩家的比较，证明了AdA在适应速度上与人类相当。\n\n4. **相关工作**：\n   - 回顾了与本工作相关的领域，包括程序化环境生成、开放任务学习、适应性和强化学习中的Transformer应用等。\n\n5. **结论**：\n   - 总结了AdA的贡献，强调了其在开放任务空间中快速适应的能力，以及通过元强化学习和自动课程学习等方法训练大型模型的可能性。\n\n6. **作者和贡献**：\n   - 列出了主要贡献者和部分贡献者，以及项目的赞助商和认可。\n\n**主要贡献点**：\n- 提出了AdA，一个能够在开放任务空间中快速适应的代理，其适应速度与人类相当。\n- 通过元强化学习和自动课程学习等方法，训练了大型Transformer模型，展示了其在开放任务空间中的快速适应能力。\n- 分析了不同架构、课程学习方法、模型大小和记忆长度对AdA性能的影响，提供了详细的实验结果和比较。\n- 通过与人类玩家的比较，证明了AdA在适应速度上的优势。'}, 'request_id': '687642b0-b877-49ed-9ad9-65d76de0ea58'}
+```
+
+## 高级用法
+
+## 更新记录和贡献
+### 2024.10.15
+#### [Added]
+- 第一版
diff --git a/appbuilder/core/components/document_understanding/__init__.py b/appbuilder/core/components/document_understanding/__init__.py
diff --git a/appbuilder/core/components/document_understanding/base.py b/appbuilder/core/components/document_understanding/base.py
@@ -0,0 +1,37 @@
+"""
+Copyright (c) 2023 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+from pydantic import Field
+from appbuilder.core.message import Message
+from appbuilder.core.component import ComponentArguments
+
+
+
+class DocumentUnderstandingArgs(ComponentArguments):
+    """Argument schema for the long-document understanding component.
+
+    `message` and `file_path` are required (declared with `...`); the two
+    instruction fields default to an empty string.
+    """
+    message: Message = Field(
+        ..., variable_name="query", description="用户输入query")
+    file_path: str = Field(
+        ..., variable_name="file_path", description="用户上传的文件路径")
+    instruction: str = Field(
+        default="", variable_name='instruction', description='用户指令')
+    addition_instruction: str = Field(
+        default="", variable_name='addition_instruction', description='用户增强指令')
diff --git a/appbuilder/core/components/document_understanding/component.py b/appbuilder/core/components/document_understanding/component.py
@@ -0,0 +1,215 @@
+"""
+Copyright (c) 2023 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import requests
+
+import json
+from typing import Optional
+from appbuilder.core.components.document_understanding.base import DocumentUnderstandingArgs
+
+from appbuilder.core.message import Message
+from appbuilder.core.component import Component
+import base64
+import uuid
+
+class DocumentUnderstanding(Component):
+    """Long-document understanding component.
+
+    Uploads a local document or image to a conversation of the given app and
+    asks the `document_understanding` service to answer a query about it.
+    """
+    name = "document_understanding"
+    version = "v1"
+    meta = DocumentUnderstandingArgs
+    manifests = [{
+        "name": "document_understanding",
+        "description": "该工具支持对图片以及文档内容进行理解，并基于图片以及文档内容对用户的提问进行回答，包括但不限于文档内容问答、"
+                       "总结摘要、内容分析。",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "query": {"type": "string", "description": "用户输入的query"},
+                "file_path": {"type": "string", "description": "用户上传的文档的文件路径"},
+                "instruction": {"type": "string", "description": "用户指令"},
+                "addition_instruction": {"type": "string", "description": "用户增强指令"},
+                "app_id": {"type": "string", "description": "系统应用ID"},
+            },
+            # NOTE(review): `run`/`tool_eval` default both instruction fields to "";
+            # confirm whether they really need to be listed as required here.
+            "required": ["query", "file_path", "instruction", "addition_instruction", "app_id"]
+        }
+    }]
+
+    def __init__(
+            self,
+            secret_key: Optional[str] = None,
+            gateway: str = "",
+            lazy_certification: bool = False,
+            instruction: Optional[Message] = None,
+            addition_instruction: Optional[Message] = None,
+            file_path: Optional[str] = None,
+            app_id: Optional[str] = None,
+    ):
+        """Initialize the DocumentUnderstanding component.
+
+        Args:
+            secret_key: Auth token, forwarded unchanged to the base `Component`.
+            gateway: Backend gateway address, forwarded to the base `Component`.
+            lazy_certification: Forwarded to the base `Component`; presumably
+                True defers authentication to the first run — confirm in the base.
+            instruction: Stored on the instance; not read by this class's methods.
+            addition_instruction: Stored on the instance; not read by this class.
+            file_path: Stored on the instance; not read by this class's methods.
+            app_id: Stored on the instance; not read by this class's methods.
+        """
+        super().__init__(DocumentUnderstandingArgs,
+                         secret_key=secret_key,
+                         gateway=gateway,
+                         lazy_certification=lazy_certification)
+        # Plain assignment: a trailing comma here would store a 1-tuple instead.
+        self.instruction = instruction
+        self.addition_instruction = addition_instruction
+        self.file_path = file_path
+        self.app_id = app_id
+
+    def get_addition_instruction(self, addition_instruction: str):
+        """Return `addition_instruction` prefixed with the literal "，指令：".
+
+        The prefix is added even when `addition_instruction` is empty.
+        """
+        return "，指令：" + addition_instruction
+
+    def get_conversation_id(self, app_id: str):
+        """Request a conversation for `app_id` and return its id.
+
+        Returns:
+            The `conversation_id` field of the JSON response, or None if absent.
+        """
+        url = "https://qianfan.baidubce.com/v2/app/conversation"
+        payload = json.dumps({"app_id": app_id})
+        headers = {
+            'Content-Type': 'application/json',
+            # NOTE(review): reads the token from the environment, not from the
+            # `secret_key` passed to __init__ — confirm this is intended.
+            'X-Appbuilder-Authorization': f"Bearer {os.getenv('APPBUILDER_TOKEN', '')}"
+        }
+        # Send the request exactly once and parse the response that was checked;
+        # a second POST would duplicate the request and skip the header check.
+        response = self.http_client.session.post(url, headers=headers, data=payload)
+        self.http_client.check_response_header(response)
+        return json.loads(response.text).get("conversation_id", None)
+
+    def get_file_id(self, conversation_id: str, app_id: str, file_path: str):
+        """Upload `file_path` to the conversation and return the uploaded file id.
+
+        Returns:
+            The `id` field of the JSON response, or None if absent.
+        """
+        url = "https://qianfan.baidubce.com/v2/app/conversation/file/upload"
+        payload = {
+            'app_id': app_id,
+            'conversation_id': conversation_id
+        }
+        headers = {
+            'X-Appbuilder-Authorization': f"Bearer {os.getenv('APPBUILDER_TOKEN', '')}"
+        }
+        file_name = os.path.basename(file_path)
+        # Index the split result (not the "." literal) to get the extension.
+        content_type = 'application/{}'.format(file_name.split(".")[-1].lower())
+        # The context manager closes the handle once the upload has finished.
+        with open(file_path, 'rb') as file_obj:
+            files = [('file', (file_name, file_obj, content_type))]
+            response = requests.request("POST", url, headers=headers, data=payload, files=files)
+        self.http_client.check_response_header(response)
+        return json.loads(response.text).get("id", None)
+
+    def run(self, message: Message, file_path, instruction="", addition_instruction="",
+            app_id="", stream=False, timeout=None):
+        """Answer `message` about the document at `file_path`.
+
+        This is a generator function: validation, upload and the request only
+        happen once the returned generator is iterated.
+
+        Args:
+            message: User query; its `content` is sent as the query.
+            file_path: Local path of the file to analyse.
+            instruction: Instruction text sent with the query.
+            addition_instruction: Extra instruction, sent with a "，指令：" prefix.
+            app_id: App id used to create the conversation and upload the file.
+            stream: Request and consume the response as a stream.
+            timeout: Timeout for the component request only (not the upload calls).
+
+        Yields:
+            With `stream=True`, each non-empty response line decoded as UTF-8 with
+            " request_id: <id>" appended. Otherwise a single item: the response
+            JSON dict with a `request_id` key added.
+
+        Raises:
+            Exception: Unsupported file extension, or a non-zero `code` in the
+                non-streaming response.
+        """
+        file_type = os.path.basename(file_path).split(".")[-1].lower()
+        support_file_type = ["pdf", "docx", "xlsx", "png", "jpg", "jpeg", "txt"]
+        if file_type not in support_file_type:
+            raise Exception(f"不支持解析{file_type}类型的文件，当前仅支持解析以下几种文件类型：{support_file_type}")
+        request_id = str(uuid.uuid4())
+        conversation_id = self.get_conversation_id(app_id=app_id)
+        file_id = self.get_file_id(conversation_id, app_id, file_path)
+        payload = json.dumps({
+            "stream": stream,
+            "batch": False,
+            "arguments": {
+                "query": message.content,
+                "file_ids": [file_id],
+                "files": [],
+                "file_urls": {},
+                "instruction": instruction,
+                "addition_instruction": self.get_addition_instruction(addition_instruction),
+            }
+        })
+        headers = self.http_client.auth_header()
+        headers['Content-Type'] = 'application/json'
+        headers['Authorization'] = f"Bearer {os.getenv('APPBUILDER_TOKEN', '')}"
+        headers['X-Appbuilder-Request-Id'] = request_id
+        url = self.http_client.service_url_v2("/components/document_understanding/version/preview")
+        response = self.http_client.session.post(url, headers=headers, data=payload, timeout=timeout, stream=stream)
+        self.http_client.check_response_header(response)
+        if response.status_code != 200:
+            response.raise_for_status()
+            return
+        if stream:
+            # Streaming: forward each non-empty line as it arrives.
+            for line in response.iter_lines():
+                if line:
+                    yield f"{line.decode('utf-8')} request_id: {request_id}"
+            return
+        result = response.json()
+        result["request_id"] = request_id
+        if result["code"] != 0:
+            raise Exception(f"服务请求失败: {result['message']}")
+        yield result
+
+    def tool_eval(self, message: Message, file_path: str, stream: bool = False, **kwargs):
+        """Function-call entry point: forward the arguments to `run`.
+
+        `instruction`, `addition_instruction` and `app_id` are read from `kwargs`
+        and default to "". Returns the generator produced by `run`.
+        """
+        return self.run(message, file_path,
+                        instruction=kwargs.get("instruction", ""),
+                        addition_instruction=kwargs.get("addition_instruction", ""),
+                        app_id=kwargs.get("app_id", ""), stream=stream)
diff --git a/appbuilder/tests/component_collector.py b/appbuilder/tests/component_collector.py
@@ -67,7 +67,8 @@
     "PlantRecognition",
     "HandwriteOCR",
     "ImageUnderstand",
-    "MixCardOCR", 
+    "MixCardOCR",
+    "DocumentUnderstanding",
 ]