Merge pull request #64 from gomate-community/pipeline

Pipeline
gomate-community · Sep 23, 2024 · 47faba4
2 parents 650fc8e + 3c63482
commit 47faba4
Showing 7 changed files with 70 additions and 54 deletions.
diff --git a/.gitignore b/.gitignore
@@ -21,4 +21,5 @@ data/huqie.zip
 data/nltk_data.zip
 examples/retrievers/indexs
 output
-examples/rag/indexs
+examples/rag/indexs
+examples/rag/mobile_rag.py
diff --git a/gomate/modules/citation/match_citation.py b/gomate/modules/citation/match_citation.py
@@ -64,6 +64,7 @@ def ground_response(
         ]
       },
         """
+        print(selected_docs)
         sentences = self.cut(response)
         final_response = []
         selected_idx = [i - 1 for i in selected_idx]
@@ -96,14 +97,14 @@ def ground_response(
                         highlighted_start_end = self.highlight_common_substrings(sentence, evidence_sentence,evidence)
                         quote_list.append(
                             {
-                                "doc_id": 90564,  # 文件id
-                                "chk_id": best_idx,  # 切片索引（从0开始）
+                                "doc_id": selected_docs[i]["doc_id"],  # 文件id
+                                "chk_id": selected_docs[i]["chk_id"],  # 切片索引（从0开始）
                                 # 非文内溯源知识集合无需返回
-                                "doc_source": "新闻来源",
+                                "doc_source": selected_docs[i]["newsinfo"]["source"],
                                 # 新闻时间, 非文内溯源知识集合无需返回
-                                "doc_date": "2021-10-19",
+                                "doc_date": selected_docs[i]["newsinfo"]["date"],
                                 # 非文内溯源知识集合无需返回
-                                "doc_title": "新闻标题",
+                                "doc_title": selected_docs[i]["newsinfo"]["title"],
                                 # 非文内溯源知识集合无需返回
                                 "chk_content": evidence,
                                 "best_ratio": best_ratio,

diff --git a/gomate/modules/document/common_parser.py b/gomate/modules/document/common_parser.py
@@ -45,5 +45,5 @@ def parse(self, file_path):
                 "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
         contents = parser.parse(content)
         # loguru.logger.info(contents)
-        contents = self.tc.chunk_sentences(contents, chunk_size=512)
+        # contents = self.tc.chunk_sentences(contents, chunk_size=512)
         return contents
diff --git a/gomate/modules/document/pdf_parser.py b/gomate/modules/document/pdf_parser.py
@@ -1173,5 +1173,5 @@ def remove_tag(txt):
 
 if __name__ == "__main__":
     pp=PdfSimParser()
-    contents=pp.parse('/data/users/searchgpt/yq/GoMate_dev/data/docs/新冠肺炎疫情.pdf')
+    contents=pp.parse('/data/users/searchgpt/yq/GoMate_dev/data/competitions/df/A_document/AZ06.pdf')
     print(contents)
diff --git a/gomate/modules/document/rag_tokenizer.py b/gomate/modules/document/rag_tokenizer.py
@@ -26,10 +26,10 @@
 
 class RagTokenizer:
     def key_(self, line):
-        return str(line.lower().encode("utf-8"))[2:-1]
+        return str(line.lower().encode("utf-8", 'ignore'))[2:-1]
 
     def rkey_(self, line):
-        return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
+        return str(("DD" + (line[::-1].lower())).encode("utf-8", 'ignore'))[2:-1]
 
     def loadDict_(self, fnm):
         print("[HUQIE]:Build trie", fnm, file=sys.stderr)

diff --git a/gomate/modules/generator/llm.py b/gomate/modules/generator/llm.py
@@ -71,7 +71,19 @@
     提问：{question}
     
     相关资料：{context}
-    """
+    """,
+    DF_PROMPT_TEMPLATE="""请结合参考的上下文内容回答用户问题，确保答案的准确性、全面性和权威性。如果上下文不能支撑用户问题，或者没有相关信息，请明确说明问题无法回答，避免生成虚假信息。
+    只输出答案，尽量包括关键词，不要输出额外内容，不要过多解释，不要输出额外无关文字以及过多修饰。
+
+    如果给定的上下文无法让你做出回答，请直接回答：“无法回答。”，不要输出额外内容。
+
+    问题: {question}
+    可参考的上下文： 
+    ··· 
+    {context}
+    ···
+    简明准确的回答：
+    """,
 
 )
 
@@ -153,8 +165,10 @@ def chat(self, prompt: str, history: List = [], content: str = '', llm_only: boo
         if llm_only:
             prompt = prompt
         else:
-            prompt = PROMPT_TEMPLATE['Xunfei_PROMPT_TEMPLATE'].format(question=prompt, context=content)
+            prompt = PROMPT_TEMPLATE['Xunfei_PROMPT_TEMPLATE2'].format(question=prompt, context=content)
+        prompt=prompt.encode("utf-8", 'ignore').decode('utf-8','ignore')
         print(prompt)
+
         inputs = self.tokenizer.apply_chat_template([{"role": "user", "content": prompt}],
                                                     add_generation_prompt=True,
                                                     tokenize=True,
@@ -163,7 +177,7 @@ def chat(self, prompt: str, history: List = [], content: str = '', llm_only: boo
                                                     )
 
         inputs = inputs.to('cuda')
-        gen_kwargs = {"max_length": 16000, "do_sample": False, "top_k": 1}
+        gen_kwargs = {"max_length": 20000, "do_sample": False, "top_k": 1}
         with torch.no_grad():
             outputs = self.model.generate(**inputs, **gen_kwargs)
             outputs = outputs[:, inputs['input_ids'].shape[1]:]

diff --git a/requirements.txt b/requirements.txt
@@ -1,47 +1,47 @@
-tqdm >= 4.23.4
-hyperopt >= 0.1.1
-h5py >= 2.8.0
-coverage >= 4.3.4
-codecov >= 2.0.15
-pytest >= 3.7.4
-pytest-cov >= 2.4.0
-flake8 == 7.0.0
-flake8_docstrings == 1.7.0
-pydocstyle == 2.1
-openai == 1.33.0
-datasets == 2.16.1
-langchain==0.2.0
-langchain-core==0.2.5
-langchain-text-splitters==0.2.1
-langchain-huggingface==0.0.3
-langchain-openai==0.1.8
-langsmith>=0.1.61
+tqdm
+hyperopt
+h5py
+coverage
+codecov
+pytest
+pytest-cov
+flake8
+flake8_docstrings
+pydocstyle
+openai
+datasets
+langchain
+langchain-core
+langchain-text-splitters
+langchain-huggingface
+langchain-openai
+langsmith
 transformers
 pandas
-nltk == 3.8.1
-sentencepiece==0.2.0
-PyPDF2==3.0.1
-html2text==2024.2.26
-beautifulsoup4==4.12.3
-faiss-cpu==1.8.0
-umap-learn==0.5.5
-sentence_transformers==3.0.0
-threadpoolctl==3.5.0
-PyMuPDF==1.24.5
-hanziconv==0.3.2
-datrie==0.8.2
-xpinyin==0.7.6
-python-pptx==0.6.23
-pdfplumber==0.11.0
-readability==0.3.1
-html_text==0.6.2
-python-docx==1.1.2
-tortoise==0.1.1
-python-magic==0.4.27
-html_text==0.6.2
-readability==0.3.1
+nltk
+sentencepiece
+PyPDF2
+html2text
+beautifulsoup4
+faiss-cpu
+umap-learn
+sentence_transformers
+threadpoolctl
+PyMuPDF
+hanziconv
+datrie
+xpinyin
+python-pptx
+pdfplumber
+readability
+html_text
+python-docx
+tortoise
+python-magic
+html_text
+readability
 PyMuPDF
-hanziconv==0.3.2
+hanziconv
 PyPDF2
 gradio
 loguru