
Commit

Merge pull request #59 from gomate-community/pipeline
Pipeline
yanqiangmiffy authored Sep 10, 2024
2 parents ef86e74 + 0a6612f commit 3c687fa
Showing 6 changed files with 111 additions and 34 deletions.
36 changes: 28 additions & 8 deletions README.md
@@ -8,14 +8,17 @@
[![pydocstyle](https://img.shields.io/badge/pydocstyle-enabled-AD4CD3)](http://www.pydocstyle.org/en/stable/)
[![PEP8](https://img.shields.io/badge/code%20style-pep8-orange.svg)](https://www.python.org/dev/peps/pep-0008/)


## 🔥 GoMate Introduction
GoMate is a configurable, modular Retrieval-Augmented Generation (RAG) framework designed to provide **reliable input and trusted output**, ensuring users obtain high-quality, trustworthy results in retrieval-based question-answering scenarios.


The design of GoMate centers on **high configurability and modularity**, so users can flexibly adjust and optimize each component to meet the requirements of a wide range of application scenarios.

## 🔨 GoMate Framework

![framework.png](resources%2Fframework.png)

## ✨ Key Features

**“Reliable input, Trusted output”**
@@ -24,18 +27,22 @@ GoMate框架的设计核心在于其**高度的可配置性和模块化**,使

## 🏗️ Changelog

- Added [MinerU document parsing](https://github.com/gomate-community/GoMate/blob/main/docs/mineru.md): a one-stop open-source tool for high-quality data extraction, supporting PDF, web pages, and multi-format e-books `[20240907]`
- RAPTOR: recursive tree retriever implementation
- Modular parsing for multiple file types; currently supported formats: `text`, `docx`, `ppt`, `excel`, `html`, `pdf`, `md`
- Improved `DenseRetriever` with index building, incremental appending, and index persistence (documents, vectors, and the index itself)
- Added BGE ranking to `ReRank` and `HyDE` to the Rewriter
- Added BgeJudge to the `Judge` module to decide whether a passage is useful `20240711`

## 🚀 Quick Start

### Install dependencies

```shell
pip install -r requirements.txt
```

### 1 Document Parsing

Currently supported file types: `text`, `docx`, `ppt`, `excel`, `html`, `pdf`, `md`
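For illustration, a plain-text file can be parsed into chunks roughly as follows (a minimal sketch: `TextParser` is imported elsewhere in this commit, but the `parse` method name and its return value are assumptions, not confirmed here):

```python
from gomate.modules.document.txt_parser import TextParser

# Hypothetical usage: the `parse` method and its return type are assumed, not confirmed by this diff.
parser = TextParser()
chunks = parser.parse('data/docs/sample.txt')  # assumed to return a list of text chunks
print(len(chunks))
```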
@@ -74,13 +81,12 @@ print(data.columns)
retriever.build_from_texts(documents)
```
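The `retriever` above is a `DenseRetriever`. Its configuration is not shown in this hunk; a minimal sketch based on `examples/rag/xunfei_rag.py` in this commit (the model path is a placeholder, and the `config=` constructor argument is assumed by analogy with `HybridRetriever`):

```python
from gomate.modules.retrieval.dense_retriever import DenseRetriever, DenseRetrieverConfig

dense_config = DenseRetrieverConfig(
    model_name_or_path='pretrained_models/bge-large-zh-v1.5',  # placeholder: point to a local embedding model
    dim=1024,
    index_path='indexs/dense_cache'
)
retriever = DenseRetriever(config=dense_config)  # keyword assumed, mirroring HybridRetriever(config=...)
```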


Save the index

```python
retriever.save_index()
```


### 3 Document Retrieval

```python
@@ -89,8 +95,10 @@ print(result)
```
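The retrieval call itself is elided by the diff above; mirroring the hybrid retriever usage in `examples/rag/xunfei_rag.py` (and assuming `DenseRetriever.retrieve` accepts the same `top_k` argument), a sketch looks like:

```python
results = retriever.retrieve("GoMate是什么?", top_k=5)  # placeholder query; top_k signature assumed
for result in results:
    print(f"Text: {result['text']}, Score: {result['score']}")
```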

### 4 LLM Question Answering

```python
from gomate.modules.generator.llm import GLMChat

chat = GLMChat(path='THUDM/chatglm3-6b')
print(chat.chat(question, [], content))
```
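To ground the `content` argument above: in `examples/rag/xunfei_rag.py` the retrieved passages are joined into a single context string before calling the model. A sketch following that pattern, where `search_docs` and `question` stand for the retrieval results and user query from the previous steps:

```python
# Concatenate the retrieved passages into one context block (pattern from examples/rag/xunfei_rag.py).
content = '\n'.join([f'信息[{idx}]:' + doc['text'] for idx, doc in enumerate(search_docs)])
print(chat.chat(question, [], content))
```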
@@ -119,7 +127,6 @@ from gomate.modules.reranker.bge_reranker import BgeReranker
from gomate.modules.retrieval.dense_retriever import DenseRetriever



class RagApplication():
    def __init__(self, config):
        pass
@@ -139,9 +146,10 @@ class RagApplication():

See [rag.py](gomate/applications/rag.py) for the module implementation.
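A minimal usage sketch (only `ApplicationConfig`, the `RagApplication(config)` constructor, and `init_vector_store()` appear in this diff; see `gomate/applications/rag.py` for the rest):

```python
app_config = ApplicationConfig()          # fill in local model paths and retriever settings
application = RagApplication(app_config)  # wires the retrieval and generation modules together
application.init_vector_store()           # build or load the vector index before querying
```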


### 🌐 Try the RAG Demo

You can configure local model paths:

```text
# Change this to your own configuration!!!
app_config = ApplicationConfig()
@@ -166,17 +174,29 @@ application.init_vector_store()
```shell
python app.py
```

Open in a browser: [127.0.0.1:7860](http://127.0.0.1:7860)
![demo.png](resources%2Fdemo.png)

App backend logs:

![app_logging.png](resources%2Fapp_logging.png)

## ⭐️ Star History

[![Star History Chart](https://api.star-history.com/svg?repos=gomate-community/GoMate&type=Date)](https://star-history.com/#gomate-community/GoMate&Date)

## Research & Development Team

This project is developed by the [`GoMate`](https://github.com/gomate-community) team of the Key Laboratory of Network Data Science and Technology, supervised by researchers Jiafeng Guo and Yixing Fan.

## Technical Discussion Group

To join the GoMate technical discussion group, add 1185918903 (WeChat). Suggestions and bad cases are welcome, feel free to join the group for timely discussion, and PRs are always appreciated.<br>

<img src="https://github.com/gomate-community/GoMate/blob/pipeline/resources/wechat.png" width="180px" height="270px">


If the group is full, or for collaboration inquiries, contact:

<img src="https://raw.githubusercontent.com/yanqiangmiffy/Chinese-LangChain/master/images/personal.jpg" width="180px">
30 changes: 15 additions & 15 deletions docs/rag.md
@@ -20,18 +20,18 @@

#### 3. Submission Results

| No. | Method                                                                       | Score   |
|-----|------------------------------------------------------------------------------|---------|
| 0   | baseline: glm4_plus                                                          | 0.34    |
| 1   | bm25s                                                                        | 0.06091 |
| 2   | bm25s + modified prompt                                                      | 0.26175 |
| 3   | bm25s + hybrid retriever + qwen27b                                           | 0.22371 |
| 4   | bm25s + hybrid retriever + qwen21.5b                                         | 0.23608 |
| 5   | dense + qwen21.5b                                                            | 0.24613 |
| 6   | hybrid retriever                                                             | 0.05696 |
| 7   | hybrid retriever + qwen7b + xunfei prompt                                    | 0.33623 |
| 8   | hybrid retriever + top10 + qwen7b + xunfei prompt                            | 0.32735 |
| 9   | hybrid retriever + top5 + glm4-9b + glm prompt                               | 0.37147 |
| 10  | hybrid retriever + top5 + glm4-9b + xunfei prompt                            | 0.41775 |
| 11  | hybrid retriever + top5 + glm4-9b + xunfei prompt + "cannot answer" fallback | 0.3878  |
| 12  | hybrid retriever + top5 + glm4-9b + qwen prompt + rerank (mistake)           | 0.27884 |
49 changes: 49 additions & 0 deletions examples/rag/eda.py
@@ -0,0 +1,49 @@
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import jieba

# Load the corpus
news = []
with open('../../data/competitions/xunfei/corpus.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        news.append(line.strip())

df = pd.DataFrame({'news': news})

# 1. Basic information about the data
print("Basic information:")
print(df.info())
print("\nStatistical summary:")
print(df.describe())

# 2. News length analysis
df['news_length'] = df['news'].apply(len)

# Visualize the distribution of news lengths
plt.figure(figsize=(10, 6))
plt.hist(df['news_length'], bins=50, color='skyblue')
plt.title('news length')
plt.xlabel('number of characters')
plt.ylabel('number of news items')
plt.show()

# 3. Word frequency analysis
# Tokenize with jieba
df['news_words'] = df['news'].apply(lambda x: jieba.lcut(x))

# Count word frequencies, keeping only tokens longer than 2 characters
all_words = [word for words in df['news_words'] for word in words if len(word.strip()) > 2]
word_counts = Counter(all_words)

# Show the 20 most common words
common_words = word_counts.most_common(20)
print("\nThe 20 most common words in the news:")
for word, count in common_words:
    print(f"{word}: {count} times")

# 4. Heuristic: treat items containing source keywords such as '记者' (reporter) or '本报讯' as having a news source
df['source'] = df['news'].apply(lambda x: '记者' in x or '本报讯' in x)

# Count the items that have a news source
print("\nItems with a news source:", df['source'].sum())
28 changes: 18 additions & 10 deletions examples/rag/xunfei_rag.py
@@ -7,7 +7,7 @@
from gomate.modules.document.txt_parser import TextParser
from gomate.modules.document.utils import PROJECT_BASE
from gomate.modules.generator.llm import GLM4Chat
from gomate.modules.reranker.bge_reranker import BgeRerankerConfig, BgeReranker
from gomate.modules.retrieval.bm25s_retriever import BM25RetrieverConfig
from gomate.modules.retrieval.dense_retriever import DenseRetrieverConfig
from gomate.modules.retrieval.hybrid_retriever import HybridRetriever, HybridRetrieverConfig
@@ -36,12 +36,15 @@ def generate_chunks():
test_path = "/data/users/searchgpt/yq/GoMate_dev/data/competitions/xunfei/test_question.csv"
embedding_model_path = "/data/users/searchgpt/pretrained_models/bge-large-zh-v1.5"
llm_model_path = "/data/users/searchgpt/pretrained_models/glm-4-9b-chat"

# ==================== Document parsing + chunking ====================
# generate_chunks()
with open(f'{PROJECT_BASE}/output/chunks.pkl', 'rb') as f:
chunks = pickle.load(f)
corpus = []
for chunk in chunks:
corpus.extend(chunk)

# ==================== Retriever configuration ====================
# BM25 and Dense Retriever configurations
bm25_config = BM25RetrieverConfig(
method='lucene',
@@ -51,42 +54,47 @@ def generate_chunks():
)
bm25_config.validate()
print(bm25_config.log_config())

dense_config = DenseRetrieverConfig(
model_name_or_path=embedding_model_path,
dim=1024,
index_path='indexs/dense_cache'
)
config_info = dense_config.log_config()
print(config_info)

# Hybrid Retriever configuration
# Note: BM25 and dense scores are not on the same scale, so consider normalizing them when fusing
hybrid_config = HybridRetrieverConfig(
bm25_config=bm25_config,
dense_config=dense_config,
bm25_weight=0.7,  # weight for BM25 retrieval results
dense_weight=0.3  # weight for dense retrieval results
)
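# Illustrative note (assumption, not shown in this diff): the two score types can be made
# comparable via per-retriever min-max normalization, e.g. (s - min) / (max - min + 1e-8),
# before applying the 0.7 / 0.3 weights above.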
hybrid_retriever = HybridRetriever(config=hybrid_config)
# Build the index
# hybrid_retriever.build_from_texts(corpus)
# Save the index
# hybrid_retriever.save_index()
# Load the index
hybrid_retriever.load_index()

# ==================== Retrieval test ====================
query = "新冠肺炎疫情"
results = hybrid_retriever.retrieve(query, top_k=5)

# Output results
for result in results:
print(f"Text: {result['text']}, Score: {result['score']}")

# ==================== Reranker configuration ====================
reranker_config = BgeRerankerConfig(
model_name_or_path="/data/users/searchgpt/pretrained_models/bge-reranker-large"
)
bge_reranker = BgeReranker(reranker_config)

# ==================== Generator configuration ====================
# qwen_chat = QwenChat(llm_model_path)
glm4_chat = GLM4Chat(llm_model_path)

# ==================== Retrieval-augmented QA ====================
test = pd.read_csv(test_path)
answers = []
for question in tqdm(test['question'], total=len(test)):
@@ -95,7 +103,7 @@ def generate_chunks():
query=question,
documents=[doc['text'] for idx, doc in enumerate(search_docs)]
)
# print(search_docs)
content = '\n'.join([f'信息[{idx}]:' + doc['text'] for idx, doc in enumerate(search_docs)])
answer = glm4_chat.chat(prompt=question, content=content)
answers.append(answer[0])
@@ -104,4 +112,4 @@ def generate_chunks():
print("************************************/n")
test['answer'] = answers

test[['answer']].to_csv(f'{PROJECT_BASE}/output/bm25_v2.csv', index=False)
test[['answer']].to_csv(f'{PROJECT_BASE}/output/gomate_baseline.csv', index=False)
2 changes: 1 addition & 1 deletion gomate/modules/generator/llm.py
@@ -151,7 +151,7 @@ def chat(self, prompt: str, history: List = [], content: str = '', llm_only: boo
if llm_only:
prompt = prompt
else:
prompt = PROMPT_TEMPLATE['Qwen_PROMPT_TEMPALTE'].format(question=prompt, context=content)
prompt = PROMPT_TEMPLATE['Xunfei_PROMPT_TEMPLATE'].format(question=prompt, context=content)
print(prompt)
inputs = self.tokenizer.apply_chat_template([{"role": "user", "content": prompt}],
add_generation_prompt=True,
Binary file added resources/wechat.png
