Skip to content

Commit

Permalink
Merge pull request #64 from gomate-community/pipeline
Browse files (browse the repository at this point in the history)
Pipeline
  • Loading branch information
yanqiangmiffy authored Sep 23, 2024
2 parents 650fc8e + 3c63482 commit 47faba4
Showing 7 changed files with 70 additions and 54 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -21,4 +21,5 @@ data/huqie.zip
data/nltk_data.zip
examples/retrievers/indexs
output
examples/rag/indexs
examples/rag/indexs
examples/rag/mobile_rag.py
11 changes: 6 additions & 5 deletions gomate/modules/citation/match_citation.py
Original file line number Diff line number Diff line change
@@ -64,6 +64,7 @@ def ground_response(
]
},
"""
print(selected_docs)
sentences = self.cut(response)
final_response = []
selected_idx = [i - 1 for i in selected_idx]
@@ -96,14 +97,14 @@ def ground_response(
highlighted_start_end = self.highlight_common_substrings(sentence, evidence_sentence,evidence)
quote_list.append(
{
"doc_id": 90564, # 文件id
"chk_id": best_idx, # 切片索引(从0开始)
"doc_id": selected_docs[i]["doc_id"], # 文件id
"chk_id": selected_docs[i]["chk_id"], # 切片索引(从0开始)
# 非文内溯源知识集合无需返回
"doc_source": "新闻来源",
"doc_source": selected_docs[i]["newsinfo"]["source"],
# 新闻时间, 非文内溯源知识集合无需返回
"doc_date": "2021-10-19",
"doc_date": selected_docs[i]["newsinfo"]["date"],
# 非文内溯源知识集合无需返回
"doc_title": "新闻标题",
"doc_title": selected_docs[i]["newsinfo"]["title"],
# 非文内溯源知识集合无需返回
"chk_content": evidence,
"best_ratio": best_ratio,
2 changes: 1 addition & 1 deletion gomate/modules/document/common_parser.py
Original file line number Diff line number Diff line change
@@ -45,5 +45,5 @@ def parse(self, file_path):
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
contents = parser.parse(content)
# loguru.logger.info(contents)
contents = self.tc.chunk_sentences(contents, chunk_size=512)
# contents = self.tc.chunk_sentences(contents, chunk_size=512)
return contents
2 changes: 1 addition & 1 deletion gomate/modules/document/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -1173,5 +1173,5 @@ def remove_tag(txt):

if __name__ == "__main__":
pp=PdfSimParser()
contents=pp.parse('/data/users/searchgpt/yq/GoMate_dev/data/docs/新冠肺炎疫情.pdf')
contents=pp.parse('/data/users/searchgpt/yq/GoMate_dev/data/competitions/df/A_document/AZ06.pdf')
print(contents)
4 changes: 2 additions & 2 deletions gomate/modules/document/rag_tokenizer.py
Original file line number Diff line number Diff line change
@@ -26,10 +26,10 @@

class RagTokenizer:
def key_(self, line):
return str(line.lower().encode("utf-8"))[2:-1]
return str(line.lower().encode("utf-8", 'ignore'))[2:-1]

def rkey_(self, line):
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
return str(("DD" + (line[::-1].lower())).encode("utf-8", 'ignore'))[2:-1]

def loadDict_(self, fnm):
print("[HUQIE]:Build trie", fnm, file=sys.stderr)
20 changes: 17 additions & 3 deletions gomate/modules/generator/llm.py
Original file line number Diff line number Diff line change
@@ -71,7 +71,19 @@
提问:{question}
相关资料:{context}
"""
""",
DF_PROMPT_TEMPLATE="""请结合参考的上下文内容回答用户问题,确保答案的准确性、全面性和权威性。如果上下文不能支撑用户问题,或者没有相关信息,请明确说明问题无法回答,避免生成虚假信息。
只输出答案,尽量包括关键词,不要输出额外内容,不要过多解释,不要输出额外无关文字以及过多修饰。
如果给定的上下文无法让你做出回答,请直接回答:“无法回答。”,不要输出额外内容。
问题: {question}
可参考的上下文:
···
{context}
···
简明准确的回答:
""",

)

@@ -153,8 +165,10 @@ def chat(self, prompt: str, history: List = [], content: str = '', llm_only: boo
if llm_only:
prompt = prompt
else:
prompt = PROMPT_TEMPLATE['Xunfei_PROMPT_TEMPLATE'].format(question=prompt, context=content)
prompt = PROMPT_TEMPLATE['Xunfei_PROMPT_TEMPLATE2'].format(question=prompt, context=content)
prompt=prompt.encode("utf-8", 'ignore').decode('utf-8','ignore')
print(prompt)

inputs = self.tokenizer.apply_chat_template([{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=True,
@@ -163,7 +177,7 @@ def chat(self, prompt: str, history: List = [], content: str = '', llm_only: boo
)

inputs = inputs.to('cuda')
gen_kwargs = {"max_length": 16000, "do_sample": False, "top_k": 1}
gen_kwargs = {"max_length": 20000, "do_sample": False, "top_k": 1}
with torch.no_grad():
outputs = self.model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
82 changes: 41 additions & 41 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,47 +1,47 @@
tqdm >= 4.23.4
hyperopt >= 0.1.1
h5py >= 2.8.0
coverage >= 4.3.4
codecov >= 2.0.15
pytest >= 3.7.4
pytest-cov >= 2.4.0
flake8 == 7.0.0
flake8_docstrings == 1.7.0
pydocstyle == 2.1
openai == 1.33.0
datasets == 2.16.1
langchain==0.2.0
langchain-core==0.2.5
langchain-text-splitters==0.2.1
langchain-huggingface==0.0.3
langchain-openai==0.1.8
langsmith>=0.1.61
tqdm
hyperopt
h5py
coverage
codecov
pytest
pytest-cov
flake8
flake8_docstrings
pydocstyle
openai
datasets
langchain
langchain-core
langchain-text-splitters
langchain-huggingface
langchain-openai
langsmith
transformers
pandas
nltk == 3.8.1
sentencepiece==0.2.0
PyPDF2==3.0.1
html2text==2024.2.26
beautifulsoup4==4.12.3
faiss-cpu==1.8.0
umap-learn==0.5.5
sentence_transformers==3.0.0
threadpoolctl==3.5.0
PyMuPDF==1.24.5
hanziconv==0.3.2
datrie==0.8.2
xpinyin==0.7.6
python-pptx==0.6.23
pdfplumber==0.11.0
readability==0.3.1
html_text==0.6.2
python-docx==1.1.2
tortoise==0.1.1
python-magic==0.4.27
html_text==0.6.2
readability==0.3.1
nltk
sentencepiece
PyPDF2
html2text
beautifulsoup4
faiss-cpu
umap-learn
sentence_transformers
threadpoolctl
PyMuPDF
hanziconv
datrie
xpinyin
python-pptx
pdfplumber
readability
html_text
python-docx
tortoise
python-magic
html_text
readability
PyMuPDF
hanziconv==0.3.2
hanziconv
PyPDF2
gradio
loguru

0 comments on commit 47faba4

Please sign in to comment.