From d0f3723f995b98aa0741b47c6222afe22eabbed3 Mon Sep 17 00:00:00 2001
From: yanqiangmiffy
Date: Fri, 20 Sep 2024 16:26:40 +0800
Subject: [PATCH 1/2] feature@modify citation format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 gomate/modules/citation/match_citation.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gomate/modules/citation/match_citation.py b/gomate/modules/citation/match_citation.py
index e999424..2a167c5 100644
--- a/gomate/modules/citation/match_citation.py
+++ b/gomate/modules/citation/match_citation.py
@@ -96,14 +96,14 @@ def ground_response(
                 highlighted_start_end = self.highlight_common_substrings(sentence, evidence_sentence,evidence)
                 quote_list.append(
                     {
-                        "doc_id": 90564, # file id
+                        "doc_id": selected_docs[i]["newsinfo"]["id"], # file id
                         "chk_id": best_idx, # chunk index (0-based)
                         # not returned for knowledge sets without in-text source tracing
-                        "doc_source": "新闻来源",
+                        "doc_source": selected_docs[i]["newsinfo"]["id"],
                         # news date; not returned for knowledge sets without in-text source tracing
-                        "doc_date": "2021-10-19",
+                        "doc_date": selected_docs[i]["newsinfo"]["date"],
                         # not returned for knowledge sets without in-text source tracing
-                        "doc_title": "新闻标题",
+                        "doc_title": selected_docs[i]["newsinfo"]["title"],
                         # not returned for knowledge sets without in-text source tracing
                         "chk_content": evidence,
                         "best_ratio": best_ratio,

From 3c634820ef8df6f19cd4ad450724492d708fa489 Mon Sep 17 00:00:00 2001
From: yanqiangmiffy <1185918903@qq.com>
Date: Mon, 23 Sep 2024 16:21:29 +0800
Subject: [PATCH 2/2] update

---
 .gitignore                                |  3 +-
 gomate/modules/citation/match_citation.py |  7 +-
 gomate/modules/document/common_parser.py  |  2 +-
 gomate/modules/document/pdf_parser.py     |  2 +-
 gomate/modules/document/rag_tokenizer.py  |  4 +-
 gomate/modules/generator/llm.py           | 20 +++++-
 requirements.txt                          | 82 +++++++++++------------
 7 files changed, 68 insertions(+), 52 deletions(-)

diff --git a/.gitignore b/.gitignore
index 752d004..2356d14 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,4 +21,5 @@ data/huqie.zip
 data/nltk_data.zip
 examples/retrievers/indexs
 output
-examples/rag/indexs
\ No newline at end of file
+examples/rag/indexs
+examples/rag/mobile_rag.py
\ No newline at end of file
diff --git a/gomate/modules/citation/match_citation.py b/gomate/modules/citation/match_citation.py
index 2a167c5..85b6d4e 100644
--- a/gomate/modules/citation/match_citation.py
+++ b/gomate/modules/citation/match_citation.py
@@ -64,6 +64,7 @@ def ground_response(
             ]
         },
         """
+        print(selected_docs)
         sentences = self.cut(response)
         final_response = []
         selected_idx = [i - 1 for i in selected_idx]
@@ -96,10 +97,10 @@ def ground_response(
                 highlighted_start_end = self.highlight_common_substrings(sentence, evidence_sentence,evidence)
                 quote_list.append(
                     {
-                        "doc_id": selected_docs[i]["newsinfo"]["id"], # file id
-                        "chk_id": best_idx, # chunk index (0-based)
+                        "doc_id": selected_docs[i]["doc_id"], # file id
+                        "chk_id": selected_docs[i]["chk_id"], # chunk index (0-based)
                         # not returned for knowledge sets without in-text source tracing
-                        "doc_source": selected_docs[i]["newsinfo"]["id"],
+                        "doc_source": selected_docs[i]["newsinfo"]["source"],
                         # news date; not returned for knowledge sets without in-text source tracing
                         "doc_date": selected_docs[i]["newsinfo"]["date"],
                         # not returned for knowledge sets without in-text source tracing
diff --git a/gomate/modules/document/common_parser.py b/gomate/modules/document/common_parser.py
index 603e4be..ee3e5aa 100644
--- a/gomate/modules/document/common_parser.py
+++ b/gomate/modules/document/common_parser.py
@@ -45,5 +45,5 @@ def parse(self, file_path):
                 "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
         contents = parser.parse(content)
         # loguru.logger.info(contents)
-        contents = self.tc.chunk_sentences(contents, chunk_size=512)
+        # contents = self.tc.chunk_sentences(contents, chunk_size=512)
         return contents
diff --git a/gomate/modules/document/pdf_parser.py b/gomate/modules/document/pdf_parser.py
index 48824d4..042733e 100644
--- a/gomate/modules/document/pdf_parser.py
+++ b/gomate/modules/document/pdf_parser.py
@@ -1173,5 +1173,5 @@ def remove_tag(txt):
 
 if __name__ == "__main__":
     pp=PdfSimParser()
-    contents=pp.parse('/data/users/searchgpt/yq/GoMate_dev/data/docs/新冠肺炎疫情.pdf')
+    contents=pp.parse('/data/users/searchgpt/yq/GoMate_dev/data/competitions/df/A_document/AZ06.pdf')
     print(contents)
\ No newline at end of file
diff --git a/gomate/modules/document/rag_tokenizer.py b/gomate/modules/document/rag_tokenizer.py
index aa3fa22..0cdcc97 100644
--- a/gomate/modules/document/rag_tokenizer.py
+++ b/gomate/modules/document/rag_tokenizer.py
@@ -26,10 +26,10 @@ class RagTokenizer:
 
     def key_(self, line):
-        return str(line.lower().encode("utf-8"))[2:-1]
+        return str(line.lower().encode("utf-8", 'ignore'))[2:-1]
 
     def rkey_(self, line):
-        return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
+        return str(("DD" + (line[::-1].lower())).encode("utf-8", 'ignore'))[2:-1]
 
     def loadDict_(self, fnm):
         print("[HUQIE]:Build trie", fnm, file=sys.stderr)
diff --git a/gomate/modules/generator/llm.py b/gomate/modules/generator/llm.py
index 61f4f4e..6edae3e 100644
--- a/gomate/modules/generator/llm.py
+++ b/gomate/modules/generator/llm.py
@@ -71,7 +71,19 @@
     提问:{question}
     相关资料:{context}
-    """
+    """,
+    DF_PROMPT_TEMPLATE="""请结合参考的上下文内容回答用户问题,确保答案的准确性、全面性和权威性。如果上下文不能支撑用户问题,或者没有相关信息,请明确说明问题无法回答,避免生成虚假信息。
+    只输出答案,尽量包括关键词,不要输出额外内容,不要过多解释,不要输出额外无关文字以及过多修饰。
+
+    如果给定的上下文无法让你做出回答,请直接回答:“无法回答。”,不要输出额外内容。
+
+    问题: {question}
+    可参考的上下文:
+    ···
+    {context}
+    ···
+    简明准确的回答:
+    """,
 )
@@ -153,8 +165,10 @@ def chat(self, prompt: str, history: List = [], content: str = '', llm_only: boo
         if llm_only:
             prompt = prompt
         else:
-            prompt = PROMPT_TEMPLATE['Xunfei_PROMPT_TEMPLATE'].format(question=prompt, context=content)
+            prompt = PROMPT_TEMPLATE['Xunfei_PROMPT_TEMPLATE2'].format(question=prompt, context=content)
+        prompt=prompt.encode("utf-8", 'ignore').decode('utf-8','ignore')
         print(prompt)
+
         inputs = self.tokenizer.apply_chat_template([{"role": "user", "content": prompt}],
                                                     add_generation_prompt=True,
                                                     tokenize=True,
                                                     return_tensors="pt",
                                                     return_dict=True
                                                     )
         inputs = inputs.to('cuda')
-        gen_kwargs = {"max_length": 16000, "do_sample": False, "top_k": 1}
+        gen_kwargs = {"max_length": 20000, "do_sample": False, "top_k": 1}
         with torch.no_grad():
             outputs = self.model.generate(**inputs, **gen_kwargs)
             outputs = outputs[:, inputs['input_ids'].shape[1]:]
diff --git a/requirements.txt b/requirements.txt
index 9da1e32..c784e7d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,47 +1,47 @@
-tqdm >= 4.23.4
-hyperopt >= 0.1.1
-h5py >= 2.8.0
-coverage >= 4.3.4
-codecov >= 2.0.15
-pytest >= 3.7.4
-pytest-cov >= 2.4.0
-flake8 == 7.0.0
-flake8_docstrings == 1.7.0
-pydocstyle == 2.1
-openai == 1.33.0
-datasets == 2.16.1
-langchain==0.2.0
-langchain-core==0.2.5
-langchain-text-splitters==0.2.1
-langchain-huggingface==0.0.3
-langchain-openai==0.1.8
-langsmith>=0.1.61
+tqdm
+hyperopt
+h5py
+coverage
+codecov
+pytest
+pytest-cov
+flake8
+flake8_docstrings
+pydocstyle
+openai
+datasets
+langchain
+langchain-core
+langchain-text-splitters
+langchain-huggingface
+langchain-openai
+langsmith
 transformers
 pandas
-nltk == 3.8.1
-sentencepiece==0.2.0
-PyPDF2==3.0.1
-html2text==2024.2.26
-beautifulsoup4==4.12.3
-faiss-cpu==1.8.0
-umap-learn==0.5.5
-sentence_transformers==3.0.0
-threadpoolctl==3.5.0
-PyMuPDF==1.24.5
-hanziconv==0.3.2
-datrie==0.8.2
-xpinyin==0.7.6
-python-pptx==0.6.23
-pdfplumber==0.11.0
-readability==0.3.1
-html_text==0.6.2
-python-docx==1.1.2
-tortoise==0.1.1
-python-magic==0.4.27
-html_text==0.6.2
-readability==0.3.1
+nltk
+sentencepiece
+PyPDF2
+html2text
+beautifulsoup4
+faiss-cpu
+umap-learn
+sentence_transformers
+threadpoolctl
+PyMuPDF
+hanziconv
+datrie
+xpinyin
+python-pptx
+pdfplumber
+readability
+html_text
+python-docx
+tortoise
+python-magic
+html_text
+readability
 PyMuPDF
-hanziconv==0.3.2
+hanziconv
 PyPDF2
 gradio
 loguru
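
Note on the citation change: after PATCH 2/2, ground_response reads "doc_id", "chk_id" and a nested "newsinfo" dict ("source", "date", "title") from every element of selected_docs. The patches do not show how selected_docs is built, so the Python sketch below only illustrates the shape those lookups assume; the sample values (the 90564 id borrowed from the old hard-coded entry, the evidence text, best_ratio) are hypothetical, and only the dictionary keys come from the diff.

# Minimal sketch (not part of the patches): the selected_docs shape that
# match_citation.py expects after PATCH 2/2, and the quote entry it builds.
selected_docs = [
    {
        "doc_id": 90564,                    # file id (sample value)
        "chk_id": 3,                        # chunk index, 0-based (sample value)
        "newsinfo": {
            "source": "example-news-site",  # hypothetical source name
            "date": "2021-10-19",
            "title": "Example news title",
        },
    }
]

i = 0
evidence = "...matched chunk text..."       # hypothetical retrieved chunk
best_ratio = 0.87                           # hypothetical match score

quote = {
    "doc_id": selected_docs[i]["doc_id"],
    "chk_id": selected_docs[i]["chk_id"],
    "doc_source": selected_docs[i]["newsinfo"]["source"],
    "doc_date": selected_docs[i]["newsinfo"]["date"],
    "doc_title": selected_docs[i]["newsinfo"]["title"],
    "chk_content": evidence,
    "best_ratio": best_ratio,
}
print(quote)

# The same patch set also passes errors='ignore' when encoding text in
# rag_tokenizer.py and adds an encode/decode round trip in llm.py:
# characters that cannot be encoded as UTF-8 (for example lone surrogates)
# are silently dropped instead of raising UnicodeEncodeError.
cleaned = "query with a stray \udc80 surrogate".encode("utf-8", "ignore").decode("utf-8", "ignore")
print(cleaned)  # -> "query with a stray  surrogate"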