Skip to content

Commit

Permalink
Merge pull request #84 from gomate-community/pipeline
Browse files Browse the repository at this point in the history
Pipeline@Markdown Parse
  • Loading branch information
yanqiangmiffy authored Jan 8, 2025
2 parents 012d590 + 14be90c commit a28fff2
Show file tree
Hide file tree
Showing 22 changed files with 811 additions and 190 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@ data/nltk_data.zip
examples/retrievers/indexs
output
examples/rag/indexs
examples/rag/mobile_rag.py
examples/rag/mobile_rag.py
**/.ipynb_checkpoints/
.virtual_documents/
1 change: 1 addition & 0 deletions api/apps/config/app_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,4 @@ class AppConfig:
DEBUGGER: bool = True

SHOW_DOCS: bool = True

3 changes: 3 additions & 0 deletions api/apps/config/rerank_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@
@software: PyCharm
@description: coding..
"""
class RerankConfig:
    """Default settings for the reranker model and the LLM service endpoint.

    Both values are deployment-specific defaults; adjust them to match the
    environment the API is running in.
    """

    # Value handed to the judger as ``model_name_or_path`` (local model directory).
    model_name_or_path: str = "/data/users/searchgpt/pretrained_models/bge-reranker-large"
    # URL of the LLM service used as the judger's ``api_url``.
    llm_url: str = "http://10.208.63.29:8888"
10 changes: 6 additions & 4 deletions api/apps/core/judge/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,23 @@
"""
import loguru
from fastapi import APIRouter

from api.apps.config.rerank_config import RerankConfig
from api.apps.core.judge.bodys import JudgeBody
from api.apps.handle.response.json_response import ApiResponse
from trustrag.modules.judger.bge_judger import BgeJudger, BgeJudgerConfig
from trustrag.modules.judger.chatgpt_judger import OpenaiJudger, OpenaiJudgerConfig

judge_router = APIRouter()

rerank_config = RerankConfig()

judge_config = BgeJudgerConfig(
model_name_or_path="/data/users/searchgpt/pretrained_models/bge-reranker-large"
model_name_or_path=rerank_config.model_name_or_path
)
bge_judger = BgeJudger(judge_config)

judger_config = OpenaiJudgerConfig(
api_url="https://aicloud.oneainexus.cn:30013/inference/aicloud-yanqiang/gomatellm/"
# api_url="https://aicloud.oneainexus.cn:30013/inference/aicloud-yanqiang/gomatellm/"
api_url=rerank_config.llm_url
)
openai_judger = OpenaiJudger(judger_config)

Expand Down
2 changes: 1 addition & 1 deletion app.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

# 修改成自己的配置!!!
app_config = ApplicationConfig()
app_config.docs_path = "/data/users/searchgpt/yq/TrustRAG/data/docs/"
app_config.docs_path = "/data/users/searchgpt/yq/trustrag/data/docs/"
app_config.llm_model_path = "/data/users/searchgpt/pretrained_models/glm-4-9b-chat"

retriever_config = DenseRetrieverConfig(
Expand Down
3 changes: 2 additions & 1 deletion examples/judger/chatgpt_judger.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
data = json.load(f)

judger_config = OpenaiJudgerConfig(
api_url="https://aicloud.oneainexus.cn:30013/inference/aicloud-yanqiang/gomatellm/"
# api_url="https://aicloud.oneainexus.cn:30013/inference/aicloud-yanqiang/gomatellm/"
api_url="http://10.208.63.29:8888"
)
openai_judger = OpenaiJudger(judger_config)

Expand Down
12 changes: 0 additions & 12 deletions examples/parsers/common_parser.py

This file was deleted.

24 changes: 24 additions & 0 deletions examples/parsers/common_parser_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Example usage: parse common files

from trustrag.modules.document.common_parser import CommonParser
from trustrag.modules.document.chunk import TextChunker
if __name__ == '__main__':
    # Demo: parse each sample document, split it into chunks and print them.
    cp = CommonParser()
    tc = TextChunker()

    # Every entry needs its own trailing comma: adjacent string literals are
    # silently concatenated by Python, which would merge two paths into one
    # non-existent file name.
    doc_paths = [
        "../../data/docs/基础知识.md",
        "../../data/docs/5G垂直行业基础知识介绍--口袋小册子.pdf",
        "../../data/docs/5G专网需求提问方式-广东.xlsx",
    ]
    for doc_path in doc_paths:
        # Parse the file currently being iterated rather than a fixed path,
        # so each listed document is actually exercised.
        paragraphs = cp.parse(doc_path)
        chunks = tc.chunk_sentences(paragraphs, chunk_size=256)
        print(len(chunks))

        for chunk in chunks:
            print(chunk)
            print("+++" * 100)
18 changes: 9 additions & 9 deletions examples/parsers/markdown_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,21 @@
"""
@author:quincy qiang
@license: Apache Licence
@file: markdown_parser.py
@file: pdfparser_example.py
@time: 2024/06/06
@contact: yanqiangmiffy@gamil.com
@software: PyCharm
@description: coding..
"""
from trustrag.modules.document.markdown_parser import MarkdownParser
from langchain_community.document_loaders import UnstructuredMarkdownLoader


if __name__ == '__main__':
markdown_parser=MarkdownParser(max_chunk_size=100)

chunks=markdown_parser.get_chunks(filepath="../../data/docs/bm25算法.md")

print(len(chunks))

for chunk in chunks:
print(chunk.page_content)
parser=MarkdownParser()
paragraphs= parser.parse(fnm="../../data/docs/基础知识.md")
# print(chunks)
print(len(paragraphs))
for chunk in paragraphs:
print("==="*10)
print(chunk)
7 changes: 0 additions & 7 deletions examples/parsers/parser_examples.py

This file was deleted.

2 changes: 1 addition & 1 deletion examples/parsers/pdfparser_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# for chunk in chunks:
# print(chunk)

chunks = parser.parse(fnm="../../data/docs/计算所现行制度汇编202406/计算所现行制度汇编202406/综合处/中国科学院计算技术研究所综合安全管理制度_20240531修订版.pdf")
chunks = parser.parse(fnm="../../data/docs/5G垂直行业基础知识介绍--口袋小册子.pdf")
print(chunks)
print(len(chunks))
for chunk in chunks:
Expand Down
3 changes: 2 additions & 1 deletion examples/parsers/pdfparser_mineru.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from trustrag.modules.document.utils import PROJECT_BASE
from tqdm import tqdm
if __name__ == '__main__':
pdf_parser=PdfParserWithMinerU(url='http://localhost:8888/pdf_parse')
# pdf_parser=PdfParserWithMinerU(url='http://localhost:8888/pdf_parse')
pdf_parser=PdfParserWithMinerU(url='https://aicloud.oneainexus.cn:30013/inference/aicloud-yanqiang/mineru/pdf_parse')
pdf__path= f'{PROJECT_BASE}/data/competitions/df/A_document'
for filename in tqdm(os.listdir(pdf__path)):
if filename.endswith('.pdf'):
Expand Down
15 changes: 4 additions & 11 deletions examples/parsers/textparser_exmaple.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,16 @@
@software: PyCharm
@description: coding..
"""
from trustrag.modules.document.text_parser import TextParser
from trustrag.modules.document.txt_parser import TextParser




if __name__ == '__main__':
text_parser=TextParser(
max_chunk_size=512
)

# chunks=text_parser.get_chunks(
# filepath="../../data/docs/制度汇编.txt"
# )
chunks = text_parser.get_chunks(
filepath="H:/2024-Xfyun-RAG/data/corpus.txt/corpus.txt"
)
text_parser=TextParser()
chunks = text_parser.parse(fnm="../../data/docs/sample.txt")
print(len(chunks))

for chunk in chunks:
print("=="*100)
print(chunk)
Loading

0 comments on commit a28fff2

Please sign in to comment.