Merge pull request #67 from gomate-community/pipeline

features@完成构建
gomate-community · Sep 25, 2024 · f3178e5 · f3178e5
2 parents 44faa76 + ed55b9a
commit f3178e5
Show file tree

Hide file tree

Showing 8 changed files with 481 additions and 84 deletions.
diff --git a/README.md b/README.md
@@ -25,8 +25,9 @@ GoMate框架的设计核心在于其**高度的可配置性和模块化**，使
 
 可靠的输入，可信的输出
 
-## 🏗️ 更新记录
+## 🎉 更新记录
 
+- gomate打包构建，支持pip和source两种方式安装
 - 添加[MinerU文档解析](https://github.com/gomate-community/GoMate/blob/main/docs/mineru.md)
   ：一站式开源高质量数据提取工具，支持PDF/网页/多格式电子书提取`[20240907] `
 - RAPTOR:递归树检索器实现

diff --git a/api/apps/core/parser/views.py b/api/apps/core/parser/views.py
@@ -20,7 +20,7 @@
 from gomate.modules.document.docx_parser import DocxParser
 from gomate.modules.document.excel_parser import ExcelParser
 from gomate.modules.document.html_parser import HtmlParser
-from gomate.modules.document.pdf_parser import PdfSimParser
+from gomate.modules.document.pdf_parser_fast import PdfSimParser
 from gomate.modules.document.ppt_parser import PptParser
 from gomate.modules.document.txt_parser import TextParser
 from gomate.modules.document.json_parser import JsonParser

diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -0,0 +1,185 @@
+## GoMate快速上手教程
+
+## 🛠️ 安装
+
+### 方法1：使用`pip`安装
+
+1. 创建conda环境（可选）
+
+```shell
+conda create -n gomate python=3.9
+conda activate gomate
+```
+
+2. 使用`pip`安装依赖
+
+```shell
+pip install gomate   
+```
+
+### 方法2：源码安装
+
+1. 下载源码
+
+```shell
+git clone https://github.com/gomate-community/GoMate.git
+```
+
+2. 进入源码目录并安装依赖
+
+```shell
+cd GoMate
+pip install -e .
+```
+
+## 🚀 快速上手
+
+### 1 模块介绍📝
+
+```text
+├── applications
+├── modules
+|      ├── citation:答案与证据引用
+|      ├── document：文档解析与切块，支持多种文档类型
+|      ├── generator：生成器
+|      ├── judger：文档选择
+|      ├── prompt：提示语
+|      ├── refiner：信息总结
+|      ├── reranker：排序模块
+|      ├── retrieval：检索模块
+|      └── rewriter：改写模块
+```
+
+
+### 2 导入模块
+
+```python
+import pickle
+import pandas as pd
+from tqdm import tqdm
+
+from gomate.modules.document.chunk import TextChunker
+from gomate.modules.document.txt_parser import TextParser
+from gomate.modules.document.utils import PROJECT_BASE
+from gomate.modules.generator.llm import GLM4Chat
+from gomate.modules.reranker.bge_reranker import BgeRerankerConfig, BgeReranker
+from gomate.modules.retrieval.bm25s_retriever import BM25RetrieverConfig
+from gomate.modules.retrieval.dense_retriever import DenseRetrieverConfig
+from gomate.modules.retrieval.hybrid_retriever import HybridRetriever, HybridRetrieverConfig
+```
+
+
+### 3 文档解析以及切片
+
+```python
+def generate_chunks():
+    tp = TextParser()# 代表txt格式解析
+    tc = TextChunker()
+    paragraphs = tp.parse(r'H:/2024-Xfyun-RAG/data/corpus.txt', encoding="utf-8")
+    print(len(paragraphs))
+    chunks = []
+    for content in tqdm(paragraphs):
+        chunk = tc.chunk_sentences([content], chunk_size=1024)
+        chunks.append(chunk)
+
+    with open(f'{PROJECT_BASE}/output/chunks.pkl', 'wb') as f:
+        pickle.dump(chunks, f)
+```
+>corpus.txt每行为一段新闻，可以自行选取paragraph读取的逻辑
+
+`TextChunker`为文本切块程序，其主要特点是使用[InfiniFlow/huqie](https://huggingface.co/InfiniFlow/huqie)作为文本检索的分词器，适合RAG场景。
+
+
+### 4 构建检索器
+
+**配置检索器：**
+
+下面是一个混合检索器`HybridRetriever`配置参考，其中`HybridRetrieverConfig`需要由`BM25RetrieverConfig`和`DenseRetrieverConfig`配置构成。
+
+```python
+# BM25 and Dense Retriever configurations
+bm25_config = BM25RetrieverConfig(
+    method='lucene',
+    index_path='indexs/description_bm25.index',
+    k1=1.6,
+    b=0.7
+)
+bm25_config.validate()
+print(bm25_config.log_config())
+dense_config = DenseRetrieverConfig(
+    model_name_or_path=embedding_model_path,
+    dim=1024,
+    index_path='indexs/dense_cache'
+)
+config_info = dense_config.log_config()
+print(config_info)
+# Hybrid Retriever configuration
+# 由于分数框架不在同一维度，建议可以合并
+hybrid_config = HybridRetrieverConfig(
+    bm25_config=bm25_config,
+    dense_config=dense_config,
+    bm25_weight=0.7,  # bm25检索结果权重
+    dense_weight=0.3  # dense检索结果权重
+)
+hybrid_retriever = HybridRetriever(config=hybrid_config)
+```
+
+**构建索引：**
+
+```python
+# 构建索引
+hybrid_retriever.build_from_texts(corpus)
+# 保存索引
+hybrid_retriever.save_index()
+```
+
+索引构建并保存之后可以重复使用：后续运行时可直接跳过上面的构建步骤，加载已保存的索引即可。
+```python
+hybrid_retriever.load_index()
+```
+
+**检索测试：**
+
+```python
+query = "支付宝"
+results = hybrid_retriever.retrieve(query, top_k=10)
+print(len(results))
+# Output results
+for result in results:
+    print(f"Text: {result['text']}, Score: {result['score']}")
+```
+
+### 5 排序模型
+```python
+reranker_config = BgeRerankerConfig(
+    model_name_or_path=reranker_model_path
+)
+bge_reranker = BgeReranker(reranker_config)
+```
+### 6 生成器配置
+```python
+glm4_chat = GLM4Chat(llm_model_path)
+```
+
+### 7 检索问答
+
+```python
+# ====================检索问答=========================
+test = pd.read_csv(test_path)
+answers = []
+for question in tqdm(test['question'], total=len(test)):
+    search_docs = hybrid_retriever.retrieve(question, top_k=10)
+    search_docs = bge_reranker.rerank(
+        query=question,
+        documents=[doc['text'] for idx, doc in enumerate(search_docs)]
+    )
+    # print(search_docs)
+    content = '\n'.join([f'信息[{idx}]：' + doc['text'] for idx, doc in enumerate(search_docs)])
+    answer = glm4_chat.chat(prompt=question, content=content)
+    answers.append(answer[0])
+    print(question)
+    print(answer[0])
+    print("************************************\n")
+test['answer'] = answers
+
+test[['answer']].to_csv(f'{PROJECT_BASE}/output/gomate_baseline.csv', index=False)
+```
diff --git a/docs/requirements-example.txt b/docs/requirements-example.txt
@@ -0,0 +1,176 @@
+accelerate==0.34.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.0
+aiohttp==3.10.6
+aiosignal==1.3.1
+annotated-types==0.7.0
+anyio==4.6.0
+async-timeout==4.0.3
+atlastk==0.13.3
+attrs==24.2.0
+beautifulsoup4==4.12.3
+bm25s==0.2.1
+certifi==2024.8.30
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.3.2
+click==8.1.7
+cloudpickle==3.0.0
+codecov==2.1.13
+contourpy==1.3.0
+coverage==7.6.1
+cryptography==43.0.1
+cycler==0.12.1
+datasets==3.0.0
+datrie==0.8.2
+dill==0.3.8
+distro==1.9.0
+et-xmlfile==1.1.0
+exceptiongroup==1.2.2
+faiss-cpu==1.8.0.post1
+fastapi==0.115.0
+ffmpy==0.4.0
+filelock==3.16.1
+FlagEmbedding==1.2.11
+flake8==7.1.1
+flake8-docstrings==1.7.0
+fonttools==4.54.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+future==1.0.0
+gradio==4.44.0
+gradio_client==1.3.0
+greenlet==3.1.1
+h11==0.14.0
+h5py==3.11.0
+hanziconv==0.3.2
+html2text==2024.2.26
+html_text==0.6.2
+httpcore==1.0.5
+httpx==0.27.2
+huggingface-hub==0.25.1
+hyperopt==0.2.7
+idna==3.10
+importlib_resources==6.4.5
+iniconfig==2.0.0
+jieba==0.42.1
+Jinja2==3.1.4
+jiter==0.5.0
+joblib==1.4.2
+jsonpatch==1.33
+jsonpointer==3.0.0
+kiwisolver==1.4.7
+langchain==0.3.0
+langchain-core==0.3.5
+langchain-huggingface==0.1.0
+langchain-openai==0.2.0
+langchain-text-splitters==0.3.0
+langsmith==0.1.128
+llvmlite==0.43.0
+loguru==0.7.2
+lxml==5.3.0
+lxml_html_clean==0.2.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.9.2
+mccabe==0.7.0
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.16
+networkx==3.2.1
+nltk==3.9.1
+numba==0.60.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.6.68
+nvidia-nvtx-cu12==12.1.105
+openai==1.47.1
+openpyxl==3.1.5
+orjson==3.10.7
+packaging==24.1
+pandas==2.2.3
+pdfminer.six==20231228
+pdfplumber==0.11.4
+peft==0.12.0
+pillow==10.4.0
+pluggy==1.5.0
+protobuf==5.28.2
+psutil==6.0.0
+py4j==0.10.9.7
+pyarrow==17.0.0
+pycodestyle==2.12.1
+pycparser==2.22
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydocstyle==6.3.0
+pydub==0.25.1
+pyflakes==3.2.0
+Pygments==2.18.0
+PyMuPDF==1.24.10
+PyMuPDFb==1.24.10
+pynndescent==0.5.13
+pyparsing==3.1.4
+PyPDF2==3.0.1
+pypdfium2==4.30.0
+pytest==8.3.3
+pytest-cov==5.0.0
+python-dateutil==2.9.0.post0
+python-docx==1.1.2
+python-magic==0.4.27
+python-multipart==0.0.10
+python-pptx==1.0.2
+pytz==2024.2
+PyYAML==6.0.2
+readability==0.3.1
+regex==2024.9.11
+requests==2.32.3
+rich==13.8.1
+ruff==0.6.7
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.13.1
+semantic-version==2.10.0
+sentence-transformers==3.1.1
+sentencepiece==0.2.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+snowballstemmer==2.2.0
+soupsieve==2.6
+SQLAlchemy==2.0.35
+starlette==0.38.6
+sympy==1.13.3
+tenacity==8.5.0
+threadpoolctl==3.5.0
+tiktoken==0.7.0
+tokenizers==0.19.1
+tomli==2.0.1
+tomlkit==0.12.0
+torch==2.4.1
+tortoise==0.1.1
+tqdm==4.66.5
+transformers==4.44.2
+triton==3.0.0
+typer==0.12.5
+typing_extensions==4.12.2
+tzdata==2024.2
+umap-learn==0.5.6
+urllib3==2.2.3
+uvicorn==0.30.6
+websockets==12.0
+xgboost==2.1.1
+XlsxWriter==3.2.0
+xpinyin==0.7.6
+xxhash==3.5.0
+yarl==1.12.1
+zipp==3.20.2