From 65968dbe86f0a18373822ea40e8bcc98c495d03b Mon Sep 17 00:00:00 2001 From: xiezipeng05 Date: Sat, 9 Nov 2024 06:56:20 +0000 Subject: [PATCH] deploy: da4b3dbab81e9a003ae88d76749c0b74e9133086 --- .buildinfo | 2 +- .../1_example_section/1_example_article.html | 434 ----------------- .../1_example_section/1_example_article2.html | 434 ----------------- .../1_example_section/README.html | 436 ------------------ 1_example_chapter/2_example_article3.html | 434 ----------------- .../1_example_section2/README.html | 434 ----------------- 2_example_chapter2/README.html | 436 ------------------ 3_example_chapter3/README.html | 424 ----------------- .../1_example_section/1_example_article.md | 6 - .../1_example_section/1_example_article2.md | 6 - .../1_example_section/README.md | 1 - .../1_example_chapter/2_example_article3.md | 6 - _sources/1_example_chapter/README.md | 1 - .../1_example_section2/README.md | 1 - _sources/2_example_chapter2/README.md | 1 - _sources/3_example_chapter3/README.md | 1 - _sources/text_marking/a.ipynb | 115 +++++ genindex.html | 24 +- index.html | 32 +- objects.inv | Bin 397 -> 284 bytes search.html | 24 +- searchindex.js | 2 +- .../README.html => text_marking/a.html | 182 ++++++-- 23 files changed, 276 insertions(+), 3160 deletions(-) delete mode 100644 1_example_chapter/1_example_section/1_example_article.html delete mode 100644 1_example_chapter/1_example_section/1_example_article2.html delete mode 100644 1_example_chapter/1_example_section/README.html delete mode 100644 1_example_chapter/2_example_article3.html delete mode 100644 2_example_chapter2/1_example_section2/README.html delete mode 100644 2_example_chapter2/README.html delete mode 100644 3_example_chapter3/README.html delete mode 100644 _sources/1_example_chapter/1_example_section/1_example_article.md delete mode 100644 _sources/1_example_chapter/1_example_section/1_example_article2.md delete mode 100644 _sources/1_example_chapter/1_example_section/README.md delete mode 100644 
_sources/1_example_chapter/2_example_article3.md delete mode 100644 _sources/1_example_chapter/README.md delete mode 100644 _sources/2_example_chapter2/1_example_section2/README.md delete mode 100644 _sources/2_example_chapter2/README.md delete mode 100644 _sources/3_example_chapter3/README.md create mode 100644 _sources/text_marking/a.ipynb rename 1_example_chapter/README.html => text_marking/a.html (58%) diff --git a/.buildinfo b/.buildinfo index 6fb0360..3b3cfbf 100644 --- a/.buildinfo +++ b/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: e0aa15031a7249adff4f519866eab383 +config: 24ec50a7acb4811d3a1940eddcb2ff06 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/1_example_chapter/1_example_section/1_example_article.html b/1_example_chapter/1_example_section/1_example_article.html deleted file mode 100644 index 67e7324..0000000 --- a/1_example_chapter/1_example_section/1_example_article.html +++ /dev/null @@ -1,434 +0,0 @@ - - - - - - - - - - - 测试文章 — 量潮示例文档项目 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - -
- -
- - - - - -
-
- - - - -
- - - - - - - - - - - - - -
- -
- - - -
- -
-
- -
-
- -
- -
- -
- - -
- -
- -
- - - - - - - - - - - - - - - - - - - -
- -
- -
-
- - - -
-

测试文章

- -
-
- -
-
-
- - - - -
- -
-

测试文章#

-
- - - - -
- - - - - - - - -
- - - - -
-
- - -
- - -
-
-
- - - - - - - - \ No newline at end of file diff --git a/1_example_chapter/1_example_section/1_example_article2.html b/1_example_chapter/1_example_section/1_example_article2.html deleted file mode 100644 index 99fa049..0000000 --- a/1_example_chapter/1_example_section/1_example_article2.html +++ /dev/null @@ -1,434 +0,0 @@ - - - - - - - - - - - 测试文章2 — 量潮示例文档项目 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - -
- -
- - - - - -
-
- - - - -
- - - - - - - - - - - - - -
- -
- - - -
- -
-
- -
-
- -
- -
- -
- - -
- -
- -
- - - - - - - - - - - - - - - - - - - -
- -
- -
-
- - - -
-

测试文章2

- -
-
- -
-
-
- - - - -
- -
-

测试文章2#

-
- - - - -
- - - - - - - - -
- - - - -
-
- - -
- - -
-
-
- - - - - - - - \ No newline at end of file diff --git a/1_example_chapter/1_example_section/README.html b/1_example_chapter/1_example_section/README.html deleted file mode 100644 index 83e453d..0000000 --- a/1_example_chapter/1_example_section/README.html +++ /dev/null @@ -1,436 +0,0 @@ - - - - - - - - - - - 示例节1 — 量潮示例文档项目 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - -
- -
- - - - - -
-
- - - - -
- - - - - - - - - - - - - -
- -
- - - -
- -
-
- -
-
- -
- -
- -
- - -
- -
- -
- - - - - - - - - - - - - - - - - - - -
- -
- -
-
- - - -
-

示例节1

- -
-
- -
-
-
- - - - -
- -
-

示例节1#

-
-
-
- - - - -
- - - - - - - - -
- - - - -
-
- - -
- - -
-
-
- - - - - - - - \ No newline at end of file diff --git a/1_example_chapter/2_example_article3.html b/1_example_chapter/2_example_article3.html deleted file mode 100644 index 6186c3b..0000000 --- a/1_example_chapter/2_example_article3.html +++ /dev/null @@ -1,434 +0,0 @@ - - - - - - - - - - - 测试文章3 — 量潮示例文档项目 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - -
- -
- - - - - -
-
- - - - -
- - - - - - - - - - - - - -
- -
- - - -
- -
-
- -
-
- -
- -
- -
- - -
- -
- -
- - - - - - - - - - - - - - - - - - - -
- -
- -
-
- - - -
-

测试文章3

- -
-
- -
-
-
- - - - -
- -
-

测试文章3#

-
- - - - -
- - - - - - - - -
- - - - -
-
- - -
- - -
-
-
- - - - - - - - \ No newline at end of file diff --git a/2_example_chapter2/1_example_section2/README.html b/2_example_chapter2/1_example_section2/README.html deleted file mode 100644 index 48d6279..0000000 --- a/2_example_chapter2/1_example_section2/README.html +++ /dev/null @@ -1,434 +0,0 @@ - - - - - - - - - - - 示例节2 — 量潮示例文档项目 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - -
- -
- - - - - -
-
- - - - -
- - - - - - - - - - - - - -
- -
- - - -
- -
-
- -
-
- -
- -
- -
- - -
- -
- -
- - - - - - - - - - - - - - - - - - - -
- -
- -
-
- - - -
-

示例节2

- -
-
- -
-
-
- - - - -
- -
-

示例节2#

-
- - - - -
- - - - - - - - -
- - - - -
-
- - -
- - -
-
-
- - - - - - - - \ No newline at end of file diff --git a/2_example_chapter2/README.html b/2_example_chapter2/README.html deleted file mode 100644 index 35a5039..0000000 --- a/2_example_chapter2/README.html +++ /dev/null @@ -1,436 +0,0 @@ - - - - - - - - - - - 示例章2 — 量潮示例文档项目 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - -
- -
- - - - - -
-
- - - - -
- - - - - - - - - - - - - -
- -
- - - -
- -
-
- -
-
- -
- -
- -
- - -
- -
- -
- - - - - - - - - - - - - - - - - - - -
- -
- -
-
- - - -
-

示例章2

- -
-
- -
-
-
- - - - -
- -
-

示例章2#

-
-
-
- - - - -
- - - - - - - - -
- - - - -
-
- - -
- - -
-
-
- - - - - - - - \ No newline at end of file diff --git a/3_example_chapter3/README.html b/3_example_chapter3/README.html deleted file mode 100644 index b356de1..0000000 --- a/3_example_chapter3/README.html +++ /dev/null @@ -1,424 +0,0 @@ - - - - - - - - - - - 示例章2 — 量潮示例文档项目 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - -
- -
- - - - - -
-
- - - - -
- - - - - - - - - - - - - -
- -
- - - -
- -
-
- -
-
- -
- -
- -
- - -
- -
- -
- - - - - - - - - - - - - - - - - - - -
- -
- -
-
- - - -
-

示例章2

- -
-
- -
-
-
- - - - -
- -
-

示例章2#

-
- - - - -
- - - - - - - - -
- - - - -
-
- - -
- - -
-
-
- - - - - - - - \ No newline at end of file diff --git a/_sources/1_example_chapter/1_example_section/1_example_article.md b/_sources/1_example_chapter/1_example_section/1_example_article.md deleted file mode 100644 index 9233c38..0000000 --- a/_sources/1_example_chapter/1_example_section/1_example_article.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -level: introductory -stage: alpha ---- - -# 测试文章 diff --git a/_sources/1_example_chapter/1_example_section/1_example_article2.md b/_sources/1_example_chapter/1_example_section/1_example_article2.md deleted file mode 100644 index db8928f..0000000 --- a/_sources/1_example_chapter/1_example_section/1_example_article2.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -level: introductory -stage: alpha ---- - -# 测试文章2 diff --git a/_sources/1_example_chapter/1_example_section/README.md b/_sources/1_example_chapter/1_example_section/README.md deleted file mode 100644 index d3ed8ec..0000000 --- a/_sources/1_example_chapter/1_example_section/README.md +++ /dev/null @@ -1 +0,0 @@ -# 示例节1 diff --git a/_sources/1_example_chapter/2_example_article3.md b/_sources/1_example_chapter/2_example_article3.md deleted file mode 100644 index 336f212..0000000 --- a/_sources/1_example_chapter/2_example_article3.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -level: introductory -stage: alpha ---- - -# 测试文章3 diff --git a/_sources/1_example_chapter/README.md b/_sources/1_example_chapter/README.md deleted file mode 100644 index d68340c..0000000 --- a/_sources/1_example_chapter/README.md +++ /dev/null @@ -1 +0,0 @@ -# 示例章 \ No newline at end of file diff --git a/_sources/2_example_chapter2/1_example_section2/README.md b/_sources/2_example_chapter2/1_example_section2/README.md deleted file mode 100644 index 89a285f..0000000 --- a/_sources/2_example_chapter2/1_example_section2/README.md +++ /dev/null @@ -1 +0,0 @@ -# 示例节2 diff --git a/_sources/2_example_chapter2/README.md b/_sources/2_example_chapter2/README.md deleted file mode 100644 index 1825e6a..0000000 --- 
a/_sources/2_example_chapter2/README.md +++ /dev/null @@ -1 +0,0 @@ -# 示例章2 \ No newline at end of file diff --git a/_sources/3_example_chapter3/README.md b/_sources/3_example_chapter3/README.md deleted file mode 100644 index 1825e6a..0000000 --- a/_sources/3_example_chapter3/README.md +++ /dev/null @@ -1 +0,0 @@ -# 示例章2 \ No newline at end of file diff --git a/_sources/text_marking/a.ipynb b/_sources/text_marking/a.ipynb new file mode 100644 index 0000000..bf1a0af --- /dev/null +++ b/_sources/text_marking/a.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 判决书文本标记\n", + "\n", + "## 背景\n", + "\n", + "从若干份案件判决书的全文中,逐份提取肇事人、事故发生地、伤亡数量等二十余项信息。\n", + "\n", + "## 方案\n", + " \n", + "使用大模型进行文本标记,并用function_call功能来获取结构化数据。\n", + "\n", + "### 问题\n", + "\n", + "判决书全文过长,直接丢给大模型会让其标记效果不好,如“总罚款金额”会识别成“赔偿金”、“单项罚款金额”等错误数据。\n", + "\n", + "#### 解决方案1\n", + "\n", + "正文切割成段落或句子,从这些段落和句子中摘要成半结构化数据。\n", + "\n", + "#### 解决方案2\n", + "\n", + "1. 使用大模型读取整篇文本和20个字段的说明,为每个问题字段给出合适文本 \n", + " 例如:\n", + " relevant_text:dict[str,list[str]] = {\n", + " \"案件类型\":[\n", + " \"对应文本1\",\n", + " \"对应文本2\"\n", + " ]\n", + " “肇事人”:[\n", + " \"对应文本1\"\n", + " ]\n", + " }\n", + " 提示词为:将原始文本分段,让它从20个字段中选出来最相关的字段标记这段文本\n", + " \n", + "2. 
将问题字段与对应文本发送给大模型,大模型给出合适结果\n", + "\n", + "注:这个方案是RAG方案的简化版,RAG方案所做的同样是找到20个字段各自对应的文本,但它(找文本这一步的)实现方式是通过向量搜索(通过词嵌入等方式找到最对应文本段)完成的,然后发送给大模型整合答案。RAG之所以这样做是因为被搜索的文本数量很多,大模型上下文长度不够,所以只好用其他方式代替。但在此处,被搜索的文本长度是可以被直接读的,所以用大模型代替RAG部分,找到对应的文本,再次拿着这个文本去询问大模型(文本越短,结果越准)。\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "此处代码无法正常运行,后续会有补充,只是作为示例进行演示,最终版本为可运行的核心代码片段\n", + "\"\"\"\n", + "# # 解决方案2的代码\n", + "\n", + "# ## 步骤1: 大模型读取文本和字段说明,为每个问题字段给出合适文本\n", + "# tools = [\n", + "# {\n", + "# \"type\":\"function\",\n", + "# \"function\": {\n", + "# \"name\": \"associate_field_with_text\",\n", + "# \"description\": \"将与文本有关联的字段保存\",\n", + "# \"parameters\": {\n", + "# \"type\": \"object\",\n", + "# \"properties\": {\n", + "# \"field\":{\n", + "# \"type\": \"list[string]\",\n", + "# \"description\": \"与所给文本有关联的若干个字段,例如['文书类型','肇事人']等,只给出字段名称即可\",\n", + "# },\n", + "# \"log\":{\n", + "# \"type\":\"string\",\n", + "# \"description\":\"日志,简要给出你的标记依据,你给出这些字段的原因\"\n", + "# }\n", + "# },\n", + "# },\n", + "# \"required\":[\n", + "# \"field\",\n", + "# \"text\",\n", + "# ]\n", + "# }\n", + "# }\n", + "# ]\n", + "\n", + "# def get_response(self,messages,tools):\n", + "# response = Generation.call(\n", + "# model='qwen-max',\n", + "# messages=messages,\n", + "# tools=tools,\n", + "# result_format='message'\n", + "# )\n", + "# return response\n", + "\n", + "# ## 步骤2: 将问题字段与对应文本发送给大模型,大模型给出合适结果\n", + "\n", + "# question_filed = '案件类型'\n", + "# chat ( question_filed, relevant_text [ question_filed ] ,prompt )\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/genindex.html b/genindex.html index f812db4..8267e49 100644 --- a/genindex.html +++ b/genindex.html @@ -7,7 +7,7 @@ - Index — 量潮示例文档项目 + Index 
— 量潮数据工程用例库 @@ -141,7 +141,7 @@ -

量潮示例文档项目

+

量潮数据工程用例库

diff --git a/index.html b/index.html index 94eeb74..898f022 100644 --- a/index.html +++ b/index.html @@ -8,7 +8,7 @@ - 简介 — 量潮示例文档项目 + 简介 — 量潮数据工程用例库 @@ -63,7 +63,7 @@ - + @@ -143,7 +143,7 @@ -

量潮示例文档项目

+

量潮数据工程用例库

@@ -328,8 +312,6 @@

简介

简介#

-
-
- + - @@ -142,7 +141,7 @@ -

量潮示例文档项目

+

量潮数据工程用例库

@@ -235,7 +218,7 @@ -
  • -.md +.ipynb
  • @@ -298,7 +281,9 @@ - + @@ -309,11 +294,27 @@
    -

    示例章

    +

    判决书文本标记

    +
    +

    Contents

    +
    +
    @@ -324,9 +325,99 @@

    示例章

    -

    示例章#

    -
    +

    判决书文本标记#

    +
    +

    背景#

    +

    从若干份案件判决书的全文中,逐份提取肇事人、事故发生地、伤亡数量等二十余项信息。

    +
    +
    +

    方案#

    +

    使用大模型进行文本标记,并用function_call功能来获取结构化数据。

    +
    +

    问题#

    +

    判决书全文过长,直接丢给大模型会导致标记效果不佳,例如“总罚款金额”会被误识别为“赔偿金”、“单项罚款金额”等错误数据。

    +
    +

    解决方案1#

    +

    将正文切割成段落或句子,再从这些段落和句子中摘要出半结构化数据。

    +
    +
    +

    解决方案2#

    +
      +
    1. 使用大模型读取整篇文本和20个字段的说明,为每个问题字段给出合适文本 +例如: +relevant_text:dict[str,list[str]] = { +“案件类型”:[ +“对应文本1”, +“对应文本2” +], +“肇事人”:[ +“对应文本1” +] +} +提示词为:将原始文本分段,让它从20个字段中选出来最相关的字段标记这段文本

    2. +
    3. 将问题字段与对应文本发送给大模型,大模型给出合适结果

    4. +
    +

    注:这个方案是RAG方案的简化版,RAG方案所做的同样是找到20个字段各自对应的文本,但它(找文本这一步的)实现方式是通过向量搜索(通过词嵌入等方式找到最对应文本段)完成的,然后发送给大模型整合答案。RAG之所以这样做是因为被搜索的文本数量很多,大模型上下文长度不够,所以只好用其他方式代替。但在此处,被搜索的文本长度是可以被直接读的,所以用大模型代替RAG部分,找到对应的文本,再次拿着这个文本去询问大模型(文本越短,结果越准)。

    +
    +
    +
    """
    +此处代码无法正常运行,后续会有补充,只是作为示例进行演示,最终版本为可运行的核心代码片段
    +"""
    +# # 解决方案2的代码
    +
    +# ## 步骤1: 大模型读取文本和字段说明,为每个问题字段给出合适文本
    +# tools = [
    +#         {
    +#             "type":"function",
    +#             "function": {
    +#                 "name": "associate_field_with_text",
    +#                 "description": "将与文本有关联的字段保存",
    +#                 "parameters": {
    +#                     "type": "object",
    +#                     "properties": {
    +#                         "field":{
    +#                             "type": "list[string]",
    +#                             "description": "与所给文本有关联的若干个字段,例如['文书类型','肇事人']等,只给出字段名称即可",
    +#                         },
    +#                         "log":{
    +#                             "type":"string",
    +#                             "description":"日志,简要给出你的标记依据,你给出这些字段的原因"
    +#                         }
    +#                     },
    +#                 },
    +#                 "required":[
    +#                     "field",
    +#                     "text",
    +#                 ]
    +#             }
    +#         }
    +#     ]
    +
    +# def get_response(self,messages,tools):
    +#     response = Generation.call(
    +#         model='qwen-max',
    +#         messages=messages,
    +#         tools=tools,
    +#         result_format='message'
    +#     )
    +#     return response
    +
    +# ## 步骤2: 将问题字段与对应文本发送给大模型,大模型给出合适结果
    +
    +# question_field = '案件类型'
    +# chat ( question_field, relevant_text [ question_field ] ,prompt )
    +
    +
    +
    +
    +
    '\n此处代码无法正常运行,后续会有补充,只是作为示例进行演示,最终版本为可运行的核心代码片段\n'
    +
    +
    +
    +
    +
    +