-
Notifications
You must be signed in to change notification settings - Fork 56
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #67 from gomate-community/pipeline
features@完成构建
- Loading branch information
Showing
8 changed files
with
481 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
## GoMate快速上手教程 | ||
|
||
## 🛠️ 安装 | ||
|
||
### 方法1:使用`pip`安装 | ||
|
||
1. 创建conda环境(可选) | ||
|
||
```shell | ||
conda create -n gomate python=3.9 | ||
conda activate gomate | ||
``` | ||
|
||
2. 使用`pip`安装依赖 | ||
|
||
```shell | ||
pip install gomate | ||
``` | ||
|
||
### 方法2:源码安装 | ||
|
||
1. 下载源码 | ||
|
||
```shell | ||
git clone https://github.com/gomate-community/GoMate.git | ||
``` | ||
|
||
2. 安装依赖 | ||
|
||
```shell | ||
pip install -e . | ||
``` | ||
|
||
## 🚀 快速上手 | ||
|
||
### 1 模块介绍📝 | ||
|
||
```text | ||
├── applications | ||
├── modules | ||
| ├── citation:答案与证据引用 | ||
| ├── document:文档解析与切块,支持多种文档类型 | ||
| ├── generator:生成器 | ||
| ├── judger:文档选择 | ||
| ├── prompt:提示语 | ||
| ├── refiner:信息总结 | ||
| ├── reranker:排序模块 | ||
| ├── retrieval:检索模块 | ||
| └── rewriter:改写模块 | ||
``` | ||
|
||
|
||
### 2 导入模块 | ||
|
||
```python | ||
import pickle | ||
import pandas as pd | ||
from tqdm import tqdm | ||
|
||
from gomate.modules.document.chunk import TextChunker | ||
from gomate.modules.document.txt_parser import TextParser | ||
from gomate.modules.document.utils import PROJECT_BASE | ||
from gomate.modules.generator.llm import GLM4Chat | ||
from gomate.modules.reranker.bge_reranker import BgeRerankerConfig, BgeReranker | ||
from gomate.modules.retrieval.bm25s_retriever import BM25RetrieverConfig | ||
from gomate.modules.retrieval.dense_retriever import DenseRetrieverConfig | ||
from gomate.modules.retrieval.hybrid_retriever import HybridRetriever, HybridRetrieverConfig | ||
``` | ||
|
||
|
||
### 3 文档解析以及切片 | ||
|
||
```python | ||
def generate_chunks(): | ||
tp = TextParser()# 代表txt格式解析 | ||
tc = TextChunker() | ||
paragraphs = tp.parse(r'H:/2024-Xfyun-RAG/data/corpus.txt', encoding="utf-8") | ||
print(len(paragraphs)) | ||
chunks = [] | ||
for content in tqdm(paragraphs): | ||
chunk = tc.chunk_sentences([content], chunk_size=1024) | ||
chunks.append(chunk) | ||
with open(f'{PROJECT_BASE}/output/chunks.pkl', 'wb') as f: | ||
pickle.dump(chunks, f) | ||
``` | ||
> `corpus.txt`每行为一段新闻,可以根据自己的数据格式自行调整段落(paragraph)的读取逻辑。 | ||
`TextChunker`为文本切块程序,主要特点是使用[InfiniFlow/huqie](https://huggingface.co/InfiniFlow/huqie)作为文本检索的分词器,适合RAG场景。 | ||
|
||
|
||
### 4 构建检索器 | ||
|
||
**配置检索器:** | ||
|
||
下面是一个混合检索器`HybridRetriever`配置参考,其中`HybridRetrieverConfig`需要由`BM25RetrieverConfig`和`DenseRetrieverConfig`配置构成。 | ||
|
||
```python | ||
# BM25 and Dense Retriever configurations | ||
bm25_config = BM25RetrieverConfig( | ||
method='lucene', | ||
index_path='indexs/description_bm25.index', | ||
k1=1.6, | ||
b=0.7 | ||
) | ||
bm25_config.validate() | ||
print(bm25_config.log_config()) | ||
dense_config = DenseRetrieverConfig( | ||
model_name_or_path=embedding_model_path, | ||
dim=1024, | ||
index_path='indexs/dense_cache' | ||
) | ||
config_info = dense_config.log_config() | ||
print(config_info) | ||
# Hybrid Retriever configuration | ||
# 由于分数框架不在同一维度,建议可以合并 | ||
hybrid_config = HybridRetrieverConfig( | ||
bm25_config=bm25_config, | ||
dense_config=dense_config, | ||
bm25_weight=0.7, # bm25检索结果权重 | ||
dense_weight=0.3 # dense检索结果权重 | ||
) | ||
hybrid_retriever = HybridRetriever(config=hybrid_config) | ||
``` | ||
|
||
**构建索引:** | ||
|
||
````python | ||
# 构建索引 | ||
hybrid_retriever.build_from_texts(corpus) | ||
# 保存索引 | ||
hybrid_retriever.save_index() | ||
```` | ||
|
||
索引构建并保存之后可以重复使用:后续运行时可直接跳过上面的构建步骤,加载已保存的索引: | ||
```python | ||
hybrid_retriever.load_index() | ||
``` | ||
|
||
**检索测试:** | ||
|
||
```python | ||
query = "支付宝" | ||
results = hybrid_retriever.retrieve(query, top_k=10) | ||
print(len(results)) | ||
# Output results | ||
for result in results: | ||
print(f"Text: {result['text']}, Score: {result['score']}") | ||
``` | ||
|
||
### 5 排序模型 | ||
```python | ||
reranker_config = BgeRerankerConfig( | ||
model_name_or_path=reranker_model_path | ||
) | ||
bge_reranker = BgeReranker(reranker_config) | ||
``` | ||
### 6 生成器配置 | ||
```python | ||
glm4_chat = GLM4Chat(llm_model_path) | ||
``` | ||
|
||
### 7 检索问答 | ||
|
||
```python | ||
# ====================检索问答========================= | ||
test = pd.read_csv(test_path) | ||
answers = [] | ||
for question in tqdm(test['question'], total=len(test)): | ||
search_docs = hybrid_retriever.retrieve(question, top_k=10) | ||
search_docs = bge_reranker.rerank( | ||
query=question, | ||
documents=[doc['text'] for idx, doc in enumerate(search_docs)] | ||
) | ||
# print(search_docs) | ||
content = '\n'.join([f'信息[{idx}]:' + doc['text'] for idx, doc in enumerate(search_docs)]) | ||
answer = glm4_chat.chat(prompt=question, content=content) | ||
answers.append(answer[0]) | ||
print(question) | ||
print(answer[0]) | ||
print("************************************\n") | ||
test['answer'] = answers | ||
|
||
test[['answer']].to_csv(f'{PROJECT_BASE}/output/gomate_baseline.csv', index=False) | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
accelerate==0.34.2 | ||
aiofiles==23.2.1 | ||
aiohappyeyeballs==2.4.0 | ||
aiohttp==3.10.6 | ||
aiosignal==1.3.1 | ||
annotated-types==0.7.0 | ||
anyio==4.6.0 | ||
async-timeout==4.0.3 | ||
atlastk==0.13.3 | ||
attrs==24.2.0 | ||
beautifulsoup4==4.12.3 | ||
bm25s==0.2.1 | ||
certifi==2024.8.30 | ||
cffi==1.17.1 | ||
chardet==5.2.0 | ||
charset-normalizer==3.3.2 | ||
click==8.1.7 | ||
cloudpickle==3.0.0 | ||
codecov==2.1.13 | ||
contourpy==1.3.0 | ||
coverage==7.6.1 | ||
cryptography==43.0.1 | ||
cycler==0.12.1 | ||
datasets==3.0.0 | ||
datrie==0.8.2 | ||
dill==0.3.8 | ||
distro==1.9.0 | ||
et-xmlfile==1.1.0 | ||
exceptiongroup==1.2.2 | ||
faiss-cpu==1.8.0.post1 | ||
fastapi==0.115.0 | ||
ffmpy==0.4.0 | ||
filelock==3.16.1 | ||
FlagEmbedding==1.2.11 | ||
flake8==7.1.1 | ||
flake8-docstrings==1.7.0 | ||
fonttools==4.54.1 | ||
frozenlist==1.4.1 | ||
fsspec==2024.6.1 | ||
future==1.0.0 | ||
gradio==4.44.0 | ||
gradio_client==1.3.0 | ||
greenlet==3.1.1 | ||
h11==0.14.0 | ||
h5py==3.11.0 | ||
hanziconv==0.3.2 | ||
html2text==2024.2.26 | ||
html_text==0.6.2 | ||
httpcore==1.0.5 | ||
httpx==0.27.2 | ||
huggingface-hub==0.25.1 | ||
hyperopt==0.2.7 | ||
idna==3.10 | ||
importlib_resources==6.4.5 | ||
iniconfig==2.0.0 | ||
jieba==0.42.1 | ||
Jinja2==3.1.4 | ||
jiter==0.5.0 | ||
joblib==1.4.2 | ||
jsonpatch==1.33 | ||
jsonpointer==3.0.0 | ||
kiwisolver==1.4.7 | ||
langchain==0.3.0 | ||
langchain-core==0.3.5 | ||
langchain-huggingface==0.1.0 | ||
langchain-openai==0.2.0 | ||
langchain-text-splitters==0.3.0 | ||
langsmith==0.1.128 | ||
llvmlite==0.43.0 | ||
loguru==0.7.2 | ||
lxml==5.3.0 | ||
lxml_html_clean==0.2.2 | ||
markdown-it-py==3.0.0 | ||
MarkupSafe==2.1.5 | ||
matplotlib==3.9.2 | ||
mccabe==0.7.0 | ||
mdurl==0.1.2 | ||
mpmath==1.3.0 | ||
multidict==6.1.0 | ||
multiprocess==0.70.16 | ||
networkx==3.2.1 | ||
nltk==3.9.1 | ||
numba==0.60.0 | ||
numpy==1.26.4 | ||
nvidia-cublas-cu12==12.1.3.1 | ||
nvidia-cuda-cupti-cu12==12.1.105 | ||
nvidia-cuda-nvrtc-cu12==12.1.105 | ||
nvidia-cuda-runtime-cu12==12.1.105 | ||
nvidia-cudnn-cu12==9.1.0.70 | ||
nvidia-cufft-cu12==11.0.2.54 | ||
nvidia-curand-cu12==10.3.2.106 | ||
nvidia-cusolver-cu12==11.4.5.107 | ||
nvidia-cusparse-cu12==12.1.0.106 | ||
nvidia-nccl-cu12==2.20.5 | ||
nvidia-nvjitlink-cu12==12.6.68 | ||
nvidia-nvtx-cu12==12.1.105 | ||
openai==1.47.1 | ||
openpyxl==3.1.5 | ||
orjson==3.10.7 | ||
packaging==24.1 | ||
pandas==2.2.3 | ||
pdfminer.six==20231228 | ||
pdfplumber==0.11.4 | ||
peft==0.12.0 | ||
pillow==10.4.0 | ||
pluggy==1.5.0 | ||
protobuf==5.28.2 | ||
psutil==6.0.0 | ||
py4j==0.10.9.7 | ||
pyarrow==17.0.0 | ||
pycodestyle==2.12.1 | ||
pycparser==2.22 | ||
pydantic==2.9.2 | ||
pydantic_core==2.23.4 | ||
pydocstyle==6.3.0 | ||
pydub==0.25.1 | ||
pyflakes==3.2.0 | ||
Pygments==2.18.0 | ||
PyMuPDF==1.24.10 | ||
PyMuPDFb==1.24.10 | ||
pynndescent==0.5.13 | ||
pyparsing==3.1.4 | ||
PyPDF2==3.0.1 | ||
pypdfium2==4.30.0 | ||
pytest==8.3.3 | ||
pytest-cov==5.0.0 | ||
python-dateutil==2.9.0.post0 | ||
python-docx==1.1.2 | ||
python-magic==0.4.27 | ||
python-multipart==0.0.10 | ||
python-pptx==1.0.2 | ||
pytz==2024.2 | ||
PyYAML==6.0.2 | ||
readability==0.3.1 | ||
regex==2024.9.11 | ||
requests==2.32.3 | ||
rich==13.8.1 | ||
ruff==0.6.7 | ||
safetensors==0.4.5 | ||
scikit-learn==1.5.2 | ||
scipy==1.13.1 | ||
semantic-version==2.10.0 | ||
sentence-transformers==3.1.1 | ||
sentencepiece==0.2.0 | ||
shellingham==1.5.4 | ||
six==1.16.0 | ||
sniffio==1.3.1 | ||
snowballstemmer==2.2.0 | ||
soupsieve==2.6 | ||
SQLAlchemy==2.0.35 | ||
starlette==0.38.6 | ||
sympy==1.13.3 | ||
tenacity==8.5.0 | ||
threadpoolctl==3.5.0 | ||
tiktoken==0.7.0 | ||
tokenizers==0.19.1 | ||
tomli==2.0.1 | ||
tomlkit==0.12.0 | ||
torch==2.4.1 | ||
tortoise==0.1.1 | ||
tqdm==4.66.5 | ||
transformers==4.44.2 | ||
triton==3.0.0 | ||
typer==0.12.5 | ||
typing_extensions==4.12.2 | ||
tzdata==2024.2 | ||
umap-learn==0.5.6 | ||
urllib3==2.2.3 | ||
uvicorn==0.30.6 | ||
websockets==12.0 | ||
xgboost==2.1.1 | ||
XlsxWriter==3.2.0 | ||
xpinyin==0.7.6 | ||
xxhash==3.5.0 | ||
yarl==1.12.1 | ||
zipp==3.20.2 |
Oops, something went wrong.