From da4b3dbab81e9a003ae88d76749c0b74e9133086 Mon Sep 17 00:00:00 2001 From: xiezipeng05 Date: Sat, 9 Nov 2024 14:55:39 +0800 Subject: [PATCH] =?UTF-8?q?docs:=20=E5=A2=9E=E5=8A=A0'=E6=96=87=E6=9C=AC?= =?UTF-8?q?=E6=A0=87=E8=AE=B0'=E6=A0=87=E9=A2=98=EF=BC=8C=E5=B9=B6?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0'=E6=96=87=E6=9C=AC=E6=A0=87=E8=AE=B0?= =?UTF-8?q?=E7=AB=A0-=E5=88=A4=E5=86=B3=E4=B9=A6=E6=96=87=E6=9C=AC?= =?UTF-8?q?=E6=A0=87=E8=AE=B0'=E7=AB=A0=E8=8A=82,=E4=BF=AE=E6=94=B9'=5Ftoc?= =?UTF-8?q?.yml'=E6=96=87=E4=BB=B6=E4=BB=A5=E5=AE=B9=E7=BA=B3=E6=96=B0?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../1_example_section/1_example_article.md | 6 - .../1_example_section/1_example_article2.md | 6 - 1_example_chapter/1_example_section/README.md | 1 - 1_example_chapter/2_example_article3.md | 6 - 1_example_chapter/README.md | 1 - .../1_example_section2/README.md | 1 - 2_example_chapter2/README.md | 1 - 3_example_chapter3/README.md | 1 - _config.yml | 6 +- _toc.yml | 18 +-- text_cutting/README.md | 1 + text_marking/README.md | 1 + text_marking/a.ipynb | 115 ++++++++++++++++++ 13 files changed, 123 insertions(+), 41 deletions(-) delete mode 100644 1_example_chapter/1_example_section/1_example_article.md delete mode 100644 1_example_chapter/1_example_section/1_example_article2.md delete mode 100644 1_example_chapter/1_example_section/README.md delete mode 100644 1_example_chapter/2_example_article3.md delete mode 100644 1_example_chapter/README.md delete mode 100644 2_example_chapter2/1_example_section2/README.md delete mode 100644 2_example_chapter2/README.md delete mode 100644 3_example_chapter3/README.md create mode 100644 text_cutting/README.md create mode 100644 text_marking/README.md create mode 100644 text_marking/a.ipynb diff --git a/1_example_chapter/1_example_section/1_example_article.md b/1_example_chapter/1_example_section/1_example_article.md deleted file mode 100644 index 9233c38..0000000 
--- a/1_example_chapter/1_example_section/1_example_article.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -level: introductory -stage: alpha ---- - -# 测试文章 diff --git a/1_example_chapter/1_example_section/1_example_article2.md b/1_example_chapter/1_example_section/1_example_article2.md deleted file mode 100644 index db8928f..0000000 --- a/1_example_chapter/1_example_section/1_example_article2.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -level: introductory -stage: alpha ---- - -# 测试文章2 diff --git a/1_example_chapter/1_example_section/README.md b/1_example_chapter/1_example_section/README.md deleted file mode 100644 index d3ed8ec..0000000 --- a/1_example_chapter/1_example_section/README.md +++ /dev/null @@ -1 +0,0 @@ -# 示例节1 diff --git a/1_example_chapter/2_example_article3.md b/1_example_chapter/2_example_article3.md deleted file mode 100644 index 336f212..0000000 --- a/1_example_chapter/2_example_article3.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -level: introductory -stage: alpha ---- - -# 测试文章3 diff --git a/1_example_chapter/README.md b/1_example_chapter/README.md deleted file mode 100644 index d68340c..0000000 --- a/1_example_chapter/README.md +++ /dev/null @@ -1 +0,0 @@ -# 示例章 \ No newline at end of file diff --git a/2_example_chapter2/1_example_section2/README.md b/2_example_chapter2/1_example_section2/README.md deleted file mode 100644 index 89a285f..0000000 --- a/2_example_chapter2/1_example_section2/README.md +++ /dev/null @@ -1 +0,0 @@ -# 示例节2 diff --git a/2_example_chapter2/README.md b/2_example_chapter2/README.md deleted file mode 100644 index 1825e6a..0000000 --- a/2_example_chapter2/README.md +++ /dev/null @@ -1 +0,0 @@ -# 示例章2 \ No newline at end of file diff --git a/3_example_chapter3/README.md b/3_example_chapter3/README.md deleted file mode 100644 index 1825e6a..0000000 --- a/3_example_chapter3/README.md +++ /dev/null @@ -1 +0,0 @@ -# 示例章2 \ No newline at end of file diff --git a/_config.yml b/_config.yml index a255055..58aa142 100644 --- a/_config.yml +++ 
b/_config.yml @@ -1,7 +1,7 @@ # https://jupyterbook.org/en/stable/customize/config.html -name: quanttide-example-of-documentation -title: 量潮示例文档项目 +name: quanttide-usercase-of-data-engineering +title: 量潮数据工程用例库 author: 量潮科技 -description: 量潮文档项目实例 +description: 量潮数据工程用例库 # Jupyter Book Config only_build_toc_files: true \ No newline at end of file diff --git a/_toc.yml b/_toc.yml index 6d0c126..35074ac 100644 --- a/_toc.yml +++ b/_toc.yml @@ -1,18 +1,6 @@ format: jb-book root: index.md parts: - - caption: 示例部分 - chapters: - - file: 1_example_chapter/README.md - sections: - - file: 1_example_chapter/1_example_section/README.md - sections: - - file: 1_example_chapter/1_example_section/1_example_article.md - - file: 1_example_chapter/1_example_section/1_example_article2.md - - file: 1_example_chapter/2_example_article3.md - - caption: 示例部分2 - chapters: - - file: 2_example_chapter2/README.md - sections: - - file: 2_example_chapter2/1_example_section2/README.md - - file: 3_example_chapter3/README.md \ No newline at end of file + - caption: 文本标记 + chapters: + - file: text_marking/a.ipynb diff --git a/text_cutting/README.md b/text_cutting/README.md new file mode 100644 index 0000000..77e77cc --- /dev/null +++ b/text_cutting/README.md @@ -0,0 +1 @@ +# 文本切割 diff --git a/text_marking/README.md b/text_marking/README.md new file mode 100644 index 0000000..b73709a --- /dev/null +++ b/text_marking/README.md @@ -0,0 +1 @@ +# 文本标记 \ No newline at end of file diff --git a/text_marking/a.ipynb b/text_marking/a.ipynb new file mode 100644 index 0000000..bf1a0af --- /dev/null +++ b/text_marking/a.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 判决书文本标记\n", + "\n", + "## 背景\n", + "\n", + "从若干份案件判决书的全文中,逐份提取肇事人、事故发生地、伤亡数量等二十余项信息。\n", + "\n", + "## 方案\n", + " \n", + "使用大模型进行文本标记,并用function_call功能来获取结构化数据。\n", + "\n", + "### 问题\n", + "\n", + "判决书全文过长,直接丢给大模型会让其标记效果不好,如“总罚款金额”会识别成“赔偿金”、“单项罚款金额”等错误数据。\n", + "\n", + "#### 解决方案1\n", + 
"\n", + "正文切割成段落或句子,从这些段落和句子中摘要成半结构化数据。\n", + "\n", + "#### 解决方案2\n", + "\n", + "1. 使用大模型读取整篇文本和20个字段的说明,为每个问题字段给出合适文本 \n", + " 例如:\n", + " relevant_text:dict[str,list[str]] = {\n", + " \"案件类型\":[\n", + " \"对应文本1\",\n", + " \"对应文本2\"\n", + " ]\n", + " “肇事人”:[\n", + " \"对应文本1\"\n", + " ]\n", + " }\n", + " 提示词为:将原始文本分段,让它从20个字段中选出来最相关的字段标记这段文本\n", + " \n", + "2. 将问题字段与对应文本发送给大模型,大模型给出合适结果\n", + "\n", + "注:这个方案是RAG方案的简化版,RAG方案所做的同样是找到20个字段各自对应的文本,但它(找文本这一步的)实现方式是通过向量搜索(通过词嵌入等方式找到最对应文本段)完成的,然后发送给大模型整合答案。RAG之所以这样做是因为被搜索的文本数量很多,大模型上下文长度不够,所以只好用其他方式代替。但在此处,被搜索的文本长度是可以被直接读的,所以用大模型代替RAG部分,找到对应的文本,再次拿着这个文本去询问大模型(文本越短,结果越准)。\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "此处代码无法正常运行,后续会有补充,只是作为示例进行演示,最终版本为可运行的核心代码片段\n", + "\"\"\"\n", + "# # 解决方案2的代码\n", + "\n", + "# ## 步骤1: 大模型读取文本和字段说明,为每个问题字段给出合适文本\n", + "# tools = [\n", + "# {\n", + "# \"type\":\"function\",\n", + "# \"function\": {\n", + "# \"name\": \"associate_field_with_text\",\n", + "# \"description\": \"将与文本有关联的字段保存\",\n", + "# \"parameters\": {\n", + "# \"type\": \"object\",\n", + "# \"properties\": {\n", + "# \"field\":{\n", + "# \"type\": \"list[string]\",\n", + "# \"description\": \"与所给文本有关联的若干个字段,例如['文书类型','肇事人']等,只给出字段名称即可\",\n", + "# },\n", + "# \"log\":{\n", + "# \"type\":\"string\",\n", + "# \"description\":\"日志,简要给出你的标记依据,你给出这些字段的原因\"\n", + "# }\n", + "# },\n", + "# },\n", + "# \"required\":[\n", + "# \"field\",\n", + "# \"text\",\n", + "# ]\n", + "# }\n", + "# }\n", + "# ]\n", + "\n", + "# def get_response(self,messages,tools):\n", + "# response = Generation.call(\n", + "# model='qwen-max',\n", + "# messages=messages,\n", + "# tools=tools,\n", + "# result_format='message'\n", + "# )\n", + "# return response\n", + "\n", + "# ## 步骤2: 将问题字段与对应文本发送给大模型,大模型给出合适结果\n", + "\n", + "# question_filed = '案件类型'\n", + "# chat ( question_filed, relevant_text [ question_filed ] ,prompt )\n", + "\n" + ] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}