add document understanding module

baidubce · Oct 16, 2024 · fe6781a · fe6781a
1 parent a348d88
commit fe6781a
Show file tree

Hide file tree

Showing 6 changed files with 450 additions and 1 deletion.
diff --git a/appbuilder/__init__.py b/appbuilder/__init__.py
@@ -113,7 +113,7 @@ def get_default_header():
 from .core.components.handwrite_ocr.component import HandwriteOCR
 from .core.components.image_understand.component import ImageUnderstand
 from .core.components.mix_card_ocr.component import MixCardOCR
-
+from .core.components.document_understanding.component import DocumentUnderstanding
 __COMPONENTS__ = [
     "RagWithBaiduSearchPro",
     "RAGWithBaiduSearch",
@@ -163,6 +163,7 @@ def get_default_header():
     "HandwriteOCR",
     "ImageUnderstand",
     "MixCardOCR",
+    "DocumentUnderstanding",
 ] # NOQA
 
 from appbuilder.core.message import Message

diff --git a/appbuilder/core/components/document_understanding/README.md b/appbuilder/core/components/document_understanding/README.md
@@ -0,0 +1,100 @@
+# 长文档内容理解（DocumentUnderstanding）
+
+## 简介
+长文档内容理解组件（DocumentUnderstanding）支持对图片以及文档内容进行理解，并基于图片以及文档内容对用户的提问进行回答，
+包括但不限于文档内容问答、总结摘要、内容分析。
+### 功能介绍
+根据用户上传的文档（支持txt、docx、pdf、xlsx、png、jpg、jpeg等多种格式）、query、指令生成大模型答案
+### 特色优势
+处理长上下文的大模型内容理解任务
+### 应用场景
+长上下文的文档问答
+
+## 基本用法
+### 快速开始
+
+```python
+
+import uuid
+import os
+import appbuilder
+
+# 请前往千帆AppBuilder官网创建密钥，流程详见：https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5
+APPBUILDER_TOKEN = "YOUR-TOKEN"
+os.environ["APPBUILDER_TOKEN"] = APPBUILDER_TOKEN
+uid = str(uuid.uuid4())
+trace_id = str(uuid.uuid4()) 
+conversation_id = str(uuid.uuid4()) ## 注意你的conversation_id不能和之前的请求重复，不然会直接返回之前已有的conversation_id的答案
+du = appbuilder.DocumentUnderstanding()
+query = appbuilder.Message("这篇文档讲了什么")
+instruction = "请根据文档内容回答问题"
+addition_instruction = "请你用一句话简短概括" ##用户增强指令，可选填，该内容会进一步增强大模型的指令跟随能力，将你最需要增强效果的指令填于此，内容可以与上述的"instruction"基础指令有重复，注意：该字段内容过多会一定程度影响大模型内容严谨度，请注意控制该字段的指令字数
+file_path = "YOUR-FILE-PATH" ##填写你的本地待分析文件路径
+stream = False ##是否开启流式输出功能
+response_ = du.run(query, 
+                   file_path, 
+                   instruction=instruction, 
+                   addition_instruction=addition_instruction, 
+                   uid=uid,
+                   trace_id=trace_id, 
+                   conversation_id=conversation_id, 
+                   stream=stream)
+
+for result in response_:
+    print(result) ##打印输出的大模型答案
+```
+
+
+## 参数说明
+### 鉴权说明
+使用组件之前，请首先申请并设置鉴权参数，可参考[组件使用流程](https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5)。
+```python
+# 设置环境中的TOKEN，以下示例略
+import os
+os.environ['APPBUILDER_TOKEN'] = 'bce-YOURTOKEN'
+```
+
+
+### 初始化参数
+
+| 参数名称 | 参数类型 | 是否必须 | 描述 | 示例值 |
+| ------- | ------- | -------- | -------- | -------- |
+| `secret_key` | str | 否 | 用户鉴权token，默认从环境变量中获取: `os.getenv("APPBUILDER_TOKEN", "")` | bce-v3/XXX |
+| `gateway` | str | 否 | 后端网关服务地址，默认从环境变量中获取: `os.getenv("GATEWAY_URL", "")` | https://appbuilder.baidu.com |
+| `lazy_certification` | bool | 否 | 延迟认证，为True时在第一次运行时认证。默认为False。 | False |
+
+
+### 调用参数
+
+| 参数名称                   | 参数类型 | 是否必须 | 描述                                                                       | 示例值             |
+|------------------------|------|------|--------------------------------------------------------------------------|-----------------|
+| `message`              | obj  | 是    | 输入消息，用户输入query。                                                          | Message(content=input_data) |
+| `file_path`            | str  | 是    | 用户需要分析的文档                                                                | "test.pdf"      |
+| `instruction`          | str  | 否    | 用户指令                                                                     | "你的回答要严谨客观"     |
+| `addition_instruction` | str  | 否    | 用户增强指令，可选填，该内容会进一步增强大模型的指令跟随能力，将你最需要增强效果的指令填于此，注意：该字段内容过多会一定程度影响大模型内容严谨度 | "你的答案需要分点阐述"    |
+
+
+### 响应参数
+| 参数名称 | 参数类型 | 描述 | 示例值 |
+| ------- |------| -------- | -------- |
+| `result` | str  | 模型运行后的输出结果 | "" |
+
+### 响应示例
+```
+您好，请问您是想询问关于残疾人办理什么证件的问题吗？如果是，我可以为您提供一些信息。
+
+首先，如果您是首次申请办理残疾人证，需要携带身份证、户口簿和三张两寸近期免冠白底彩色照片到县残联办证窗口提出申请。如果您因身体原因无法亲自前往，可以联系村（社区）工作人员代办申请。
+
+其次，如果您是指残疾类型等级证明，您需要携带相关材料到指定医院或医生进行评级，并由医生签名盖章。
+
+最后，如果您是指残疾人享受低保或残疾人贫困证的一级肢体、视力、智力、精神、多重及60周岁以上的一级听力、语言的重度残疾人可以享受重度残疾人生活补助，那么您需要携带身份证、户口本和残疾证申请表到县、市、区级残联进行办理。
+
+希望这些信息对您有所帮助。如果您还有其他问题，欢迎随时提问。
+```
+
+## 高级用法
+
+## 更新记录和贡献
+### 2024.10.15
+#### [Added]
+- 第一版
diff --git a/appbuilder/core/components/document_understanding/__init__.py b/appbuilder/core/components/document_understanding/__init__.py
diff --git a/appbuilder/core/components/document_understanding/base.py b/appbuilder/core/components/document_understanding/base.py
@@ -0,0 +1,60 @@
+"""
+Copyright (c) 2023 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+from pydantic import Field
+from appbuilder.core.message import Message
+import base64
+from appbuilder.core.component import ComponentArguments
+
+
+
+class DocumentUnderstandingArgs(ComponentArguments):
+    """Argument schema for the long-document question-answering component."""
+    # Required: the user's query message.
+    message: Message = Field(
+        ...,
+        variable_name="query",
+        description="用户输入query",
+    )
+    # Required: path of the file supplied by the user.
+    file_path: str = Field(
+        ...,
+        variable_name="file_path",
+        description="用户上传的文件路径",
+    )
+    # Optional: base instruction, empty by default.
+    instruction: str = Field(
+        default="",
+        variable_name='instruction',
+        description='用户指令',
+    )
+    # Optional: additional (reinforcing) instruction, empty by default.
+    addition_instruction: str = Field(
+        default="",
+        variable_name='addition_instruction',
+        description='用户增强指令',
+    )
+
+    def file_to_base64(self, file_path):
+        """
+        Read the file at ``file_path`` and return its content base64-encoded.
+
+        :param file_path: local path of the file to read
+        :return: the base64 text, or ``None`` when reading or encoding fails
+                 (the error is printed rather than raised)
+        """
+        try:
+            # Binary mode: the bytes are encoded as-is, whatever the format.
+            with open(file_path, "rb") as handle:
+                encoded = base64.b64encode(handle.read())
+            return encoded.decode('utf-8')
+        except Exception as e:
+            # Best effort: report the problem and signal failure with None.
+            print(f"读取文件或编码过程中出错: {e}")
+            return None
+
diff --git a/appbuilder/core/components/document_understanding/component.py b/appbuilder/core/components/document_understanding/component.py
@@ -0,0 +1,215 @@
+"""
+Copyright (c) 2023 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import requests
+
+import json
+from typing import Optional
+
+from appbuilder.core.components.document_understanding.base import DocumentUnderstandingArgs
+
+from appbuilder.core.message import Message
+from appbuilder.core.component import Component
+import base64
+
+class DocumentUnderstanding(Component):
+    """Answer a user query about the content of a local document or image.
+
+    ``run`` base64-encodes the file, posts it together with the query and the
+    instructions to the endpoint stored in ``self.url``, and yields the
+    answer text returned by the service.
+    """
+    name = "document_understanding"
+    version = "v1"
+    meta = DocumentUnderstandingArgs
+    manifests = [{
+        "name": "document_understanding",
+        "description": "该工具支持对图片以及文档内容进行理解，并基于图片以及文档内容对用户的提问进行回答，包括但不限于文档内容问答、"
+                       "总结摘要、内容分析。",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "query": {
+                    "type": "string",
+                    "description": "用户输入的query"
+                },
+                "file_path": {
+                    "type": "string",
+                    "description": "用户上传的文档的文件路径"
+                },
+                "instruction": {
+                    "type": "string",
+                    "description": "用户指令"
+                },
+                "addition_instruction": {
+                    "type": "string",
+                    "description": "用户增强指令"
+                },
+            },
+            "required": ["query", "file_path", "instruction", "addition_instruction"]
+        }
+    }]
+
+    def __init__(
+            self,
+            secret_key: Optional[str] = None,
+            gateway: str = "",
+            lazy_certification: bool = False,
+            instruction: Optional[Message] = None,
+            addition_instruction: Optional[Message] = None,
+            file_path: Optional[str] = None,
+
+    ):
+        """Initialise the DocumentUnderstanding component.
+
+        Args:
+            secret_key (str, optional): user auth token; by default taken from
+                the environment: os.getenv("APPBUILDER_TOKEN", "").
+            gateway (str, optional): backend gateway address; by default taken
+                from the environment: os.getenv("GATEWAY_URL", "").
+            lazy_certification (bool, optional): when True, authenticate on
+                the first run instead of at construction. Defaults to False.
+            instruction (Message, optional): stored on the instance; ``run``
+                takes its own ``instruction`` argument and does not read it.
+            addition_instruction (Message, optional): stored on the instance;
+                not read by ``run``.
+            file_path (str, optional): stored on the instance; not read by
+                ``run``.
+
+        Returns:
+            None
+        """
+        super().__init__(DocumentUnderstandingArgs,
+                         secret_key=secret_key,
+                         gateway=gateway,
+                         lazy_certification=lazy_certification)
+        self.url = "http://copilot-test.now.baidu-int.com/dte/api/v2/component/tool_eval"
+        # No trailing comma here: it would wrap the value in a 1-tuple.
+        self.instruction = instruction
+        self.addition_instruction = addition_instruction
+        self.file_path = file_path
+
+
+    def get_addition_instruction(self, addition_instruction: str):
+        """Return ``addition_instruction`` prefixed with the instruction marker."""
+        return "，指令：" + addition_instruction
+
+    def file_to_base64(self, file_path: str):
+        """Read ``file_path`` and return its content as base64 text.
+
+        Args:
+            file_path (str): local path of the file to encode.
+
+        Returns:
+            str: base64 encoding of the file's bytes.
+
+        Raises:
+            FileNotFoundError: if the file does not exist.
+            OSError: for any other failure to open or read the file.
+        """
+        try:
+            with open(file_path, "rb") as file:
+                file_data = file.read()
+        except FileNotFoundError as e:
+            # Raise instead of returning the message: a returned string would
+            # be indistinguishable from real base64 data and get uploaded.
+            raise FileNotFoundError("文件未找到，请检查文件路径是否正确。") from e
+
+        return base64.b64encode(file_data).decode('utf-8')
+
+    def run(self,
+            message: Message,
+            file_path,
+            instruction="",
+            addition_instruction="",
+            uid=None,
+            trace_id=None,
+            conversation_id=None,
+            stream=False):
+        '''
+        Run a document-understanding request.
+
+        This is a generator: nothing (validation, file read, HTTP call)
+        happens until the result is iterated.
+
+        Args:
+            message: user query; its ``content`` is sent as the query text.
+            file_path: local path of the file to analyse.
+            instruction: base instruction for the model.
+            addition_instruction: reinforcing instruction, if any.
+            uid: user id forwarded in the ``system`` section of the request.
+            trace_id: trace id forwarded in the ``system`` section.
+            conversation_id: conversation id forwarded in the ``system`` section.
+            stream: when True, yield each non-empty response line as it
+                arrives; otherwise yield the single answer text.
+
+        Yields:
+            str: decoded response lines (stream) or the answer text (non-stream).
+
+        Raises:
+            Exception: unsupported file type, or the service returned a
+                non-zero ``code``.
+            FileNotFoundError: ``file_path`` does not exist.
+            requests.HTTPError: the service answered with an HTTP error status.
+        '''
+        # basename copes with the platform's path separators, unlike split("/").
+        file_name = os.path.basename(file_path)
+        file_type = file_name.split(".")[-1].lower()
+
+        # Validate the extension before doing any file I/O.
+        support_file_type = ["pdf", "docx", "xlsx", "png", "jpg", "jpeg", "txt"]
+        if file_type not in support_file_type:
+            raise Exception(f"不支持解析{file_type}类型的文件，当前仅支持解析以下几种文件类型：{support_file_type}")
+
+        file_data = self.file_to_base64(file_path)
+
+        # NOTE(review): the token is read from the environment here, not from
+        # the secret_key passed to __init__ — confirm this is intended.
+        bearer_token = f"Bearer {os.getenv('APPBUILDER_TOKEN', '')}"
+        payload = json.dumps({
+            "component": "DocumentUnderstanding",
+            "stream": stream,
+            "component_init_args": {
+                "instruction": instruction,
+                "addition_instruction": self.get_addition_instruction(addition_instruction),
+                "file_data": file_data,
+                "file_name": file_name,
+            },
+            "system": {
+                "uid": uid,
+                "traceid": trace_id,
+                "conversation_id": conversation_id,
+                "appbuilder_token": bearer_token
+            },
+            "user": {
+                "query": message.content
+            }
+        })
+        headers = {
+            'content-type': 'application/json',
+            'user-agent': 'vscode-restclient',
+            'x-appbuilder-authorization': bearer_token,
+            'x-appbuilder-from': 'sdk'
+        }
+
+        # The context manager releases the streamed connection when the
+        # generator finishes, fails, or is closed early by the caller.
+        with requests.request("POST", self.url, headers=headers, data=payload, stream=True) as response:
+            if response.status_code != 200:
+                response.raise_for_status()
+                return
+
+            if stream:
+                # Streamed response: emit each non-empty line as it arrives.
+                for line in response.iter_lines():
+                    if line:
+                        yield line.decode('utf-8')
+                return
+
+            result = response.json()
+            if result["code"] != 0:
+                raise Exception(f"服务请求失败: {result['message']}")
+            yield result["result"].get("text")
+
+
+if __name__ == '__main__':
+    # Manual smoke test: send one streaming request for a local file and
+    # print every chunk that comes back.
+    import uuid
+    APPBUILDER_TOKEN = "YOUR-TOKEN"
+    # The token is placed in the environment before appbuilder is imported.
+    os.environ["APPBUILDER_TOKEN"] = APPBUILDER_TOKEN
+    import appbuilder
+    request_kwargs = {
+        "instruction": "请根据文档内容回答问题",
+        "addition_instruction": "请你用一句话简短概括",
+        "uid": str(uuid.uuid4()),
+        "trace_id": str(uuid.uuid4()),
+        "conversation_id": str(uuid.uuid4()),
+        "stream": True,
+    }
+    du = DocumentUnderstanding()
+    question = appbuilder.Message("这篇文档讲了什么")
+    # Print the trace id first so the request can be located afterwards.
+    print(request_kwargs["trace_id"])
+    for chunk in du.run(question, "test.docx", **request_kwargs):
+        print(chunk)