# retriever.py

# -*- coding: utf-8 -*-
"""read_pdf.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1JW_t2RFggrcX473_q2VxsrlHu5jem00W
"""

# !pip install langchain
#
# !pip install pypdfium2
#
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple/ PyMuPDF
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple/ tqdm
# !pip install  -i https://pypi.tuna.tsinghua.edu.cn/simple/ beautifulsoup4
#
# !pip install sentence_transformers faiss-gpu

from sentence_transformers import SentenceTransformer
def embed_sentences(sentences, model):
    """Encode ``sentences`` with ``model`` and return the result unchanged.

    Args:
        sentences: Input handed straight to ``model.encode``.
        model: Any object exposing an ``encode`` method.

    Returns:
        Whatever ``model.encode(sentences)`` returns.
    """
    return model.encode(sentences)
import time
# Sentence-embedding model used for both the corpus chunks and the queries.
model_name = 'BAAI/bge-large-zh'
# Both names hold the same width; presumably the output size of `model_name`
# -- confirm if the checkpoint is ever changed.
n_dim = embedding_dim = 1024
model = SentenceTransformer(model_name, device='cuda')

# print(split_documents[0])


# cost,res = do_faiss_lookup(fast_index,"什么时候应该为车辆打蜡？",model,2)
# print([contents[t].replace('........','') for t in res])

import fitz
from tqdm import tqdm
from bs4 import BeautifulSoup
import re

# Convert a PDF into a single HTML file.
def pdf2html(input_path, html_path):
    """Render every page of a PDF as HTML and write the concatenation to disk.

    The HTML of the first nine pages is also echoed to stdout as a quick
    visual check of the extraction.

    Args:
        input_path: Path of the PDF to read.
        html_path: Path of the HTML file to write (overwritten if it exists).
    """
    page_fragments = []
    # The context manager closes the document even if a page fails to render.
    with fitz.open(input_path) as doc:
        print(doc)
        for idx, page in enumerate(tqdm(doc), start=1):
            page_html = page.get_text('html')
            if idx < 10:
                print(page_html)
            page_fragments.append(page_html)
    print('开始输出html文件')
    # NOTE(review): this function never writes matching opening <html>/<body>
    # tags; the closing tags are kept so the output file stays unchanged.
    page_fragments.append("</body></html>")
    with open(html_path, 'w', encoding='utf-8', newline='') as fp:
        # Join once instead of growing a string page by page.
        fp.write(''.join(page_fragments))

# Parse a local HTML file with BeautifulSoup and append its text to data.txt.
def html2txt(html_path):
    """Extract span text from an HTML file and append it to ``data.txt``.

    For every child of every ``<div>``, the first regex match inside each of
    the child's own children is concatenated into one line. Each line is
    printed and appended to ``data.txt`` in the current working directory.
    Children without any match still produce an empty line.

    ``data.txt`` is opened in append mode, so repeated calls accumulate
    output. The file is now created even when the HTML contains no ``<div>``.

    Args:
        html_path: Path of the HTML file to parse (read as UTF-8).
    """
    # Extraction rule, compiled once instead of once per span.
    span_pattern = re.compile('<span .*?>(.*?)</span>')

    with open(html_path, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file.read(), "html.parser")

    # Open the output once rather than reopening it for every paragraph.
    with open("data.txt", 'a', encoding='utf-8') as text_file:
        for div in soup.find_all('div'):
            for p in div:
                fragments = []
                for span in p:
                    res = span_pattern.findall(str(span))
                    if res:
                        # Only the first match of each child is kept.
                        fragments.append(res[0])
                text = ''.join(fragments)
                print(text)
                text_file.write(text)
                text_file.write('\n')
input_path = r'test.pdf'
html_path = 'input.html'
pdf2html(input_path, html_path)  # PDF -> HTML
# HTML -> data.txt.
# NOTE(review): html2txt appends to data.txt, so re-running this script
# without deleting the file first duplicates the corpus -- confirm intent.
html2txt(html_path)
from langchain.document_loaders import PyPDFium2Loader
from langchain.document_loaders import TextLoader
# Collapse the extracted lines into one string with no line breaks; the
# `with` block closes the file handle, which was previously leaked.
with open('data.txt', 'r', encoding='utf-8') as data_file:
    content = ''.join(t.replace('\r\n', '').replace('\n', '') for t in data_file)

from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
split_documents = text_splitter.split_text(content)
print(len(split_documents))
import numpy as np
import faiss
def do_faiss_lookup(fastIndex, query_text, model, top_k):
    """Return the nearest indexed entries for ``query_text``.

    The query is embedded with ``model.encode``, cast to float32 and
    L2-normalised in place before searching ``fastIndex``.

    Args:
        fastIndex: FAISS index to search.
        query_text: Query to embed.
        model: Encoder exposing ``encode``.
        top_k: Maximum number of neighbours to return.

    Returns:
        ``(scores, indexes)``. ``scores`` is the ``(1, k)`` array returned by
        ``fastIndex.search``; ``indexes`` is the matching 1-D row of
        positions, best match first. ``k`` is ``top_k`` clamped to the number
        of vectors in the index, so both are empty for an empty index.
    """
    # FAISS pads missing results with -1 labels, which callers would treat as
    # a valid Python index; never ask for more neighbours than exist.
    k = min(top_k, fastIndex.ntotal)
    if k <= 0:
        return np.empty((1, 0), dtype='float32'), np.empty((0,), dtype='int64')

    # reshape(1, -1) keeps this working for any embedding width instead of
    # relying on a module-level dimension constant.
    embedding_q = np.reshape(model.encode(query_text), (1, -1))
    # FAISS expects float32 input.
    embedding_q = embedding_q.astype('float32')
    faiss.normalize_L2(embedding_q)
    # search() returns the similarity scores and the positions of the
    # matches, already sorted by closeness.
    matched_em, matched_indexes = fastIndex.search(embedding_q, k)
    return matched_em, matched_indexes[0]


def create_index(contents):
    """Embed ``contents`` and build a FAISS inner-product index over them.

    Embeddings come from ``embed_sentences`` using the module-level
    ``model``. They are cast to float32 and L2-normalised in place before
    being added, so inner-product scores equal cosine similarity.

    Args:
        contents: Non-empty sequence of passages to index.

    Returns:
        ``(fastIndex, sentence_embeddings)``: the populated ``IndexFlatIP``
        and the normalised float32 embedding matrix stored in it.

    Raises:
        ValueError: If ``contents`` is empty. Without this guard the failure
            surfaces later as an opaque indexing error on the embedding shape.
    """
    if len(contents) == 0:
        raise ValueError("create_index() needs at least one passage to index")

    sentence_embeddings = embed_sentences(contents, model).astype('float32')
    # The index width is taken from the embeddings, not hard-coded.
    n_dimensions = sentence_embeddings.shape[1]
    faiss.normalize_L2(sentence_embeddings)
    # Flat (exhaustive) inner-product index.
    fastIndex = faiss.IndexFlatIP(n_dimensions)
    fastIndex.add(sentence_embeddings)
    return fastIndex, sentence_embeddings
# Index a copy of the chunks produced by the text splitter.
contents = list(split_documents)
fast_index, sentence_embeddings = create_index(contents)

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reranker used to re-score retrieved passages; the tokenizer and the model
# are loaded from the same checkpoint name.
_RERANKER_CHECKPOINT = 'BAAI/bge-reranker-large'
rerank_tokenizer = AutoTokenizer.from_pretrained(_RERANKER_CHECKPOINT)
rerank_model = AutoModelForSequenceClassification.from_pretrained(_RERANKER_CHECKPOINT)
# Switch to evaluation mode before scoring.
rerank_model.eval()

def rerank_passages(pairs):
    """Order the passages of ``pairs`` by reranker score, best first.

    Scoring uses the module-level ``rerank_tokenizer`` and ``rerank_model``
    with gradients disabled. Inputs are truncated to 512 tokens.

    Args:
        pairs: Sequence of ``[query, passage]`` pairs.

    Returns:
        The distinct passages sorted by descending score. Identical passage
        strings are collapsed into one entry, so the result may be shorter
        than ``pairs``. An empty ``pairs`` yields ``[]``.
    """
    # The tokenizer cannot batch an empty list; there is nothing to rank.
    if not pairs:
        return []

    with torch.no_grad():
        inputs = rerank_tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
        logits = rerank_model(**inputs, return_dict=True).logits
        # Plain Python floats sort without comparing 0-d tensors.
        scores = logits.view(-1).float().tolist()

    # Keyed by passage text: duplicates collapse to one entry and the last
    # score wins, matching the previous behaviour.
    score_by_passage = {pair[1]: score for pair, score in zip(pairs, scores)}
    return sorted(score_by_passage, key=score_by_passage.get, reverse=True)

import json
import sys
from tqdm import tqdm
# Retrieve the top-5 chunks for every test question, rerank them and save the
# three best passages per question.
with open('test_tianchi.json', 'r', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

result = []
for d in tqdm(test_data):
    # `cost` holds the similarity scores from the lookup; only `res` is used.
    cost, res = do_faiss_lookup(fast_index, d['question'], model, 5)
    # FAISS reports missing neighbours as -1, which would otherwise silently
    # pick the last chunk; skip those labels.
    d['contents'] = [contents[t].replace('........', '') for t in res if t >= 0]
    # Skip the reranker when nothing was retrieved.
    if d['contents']:
        passages = rerank_passages([[d['question'], c] for c in d['contents']])
    else:
        passages = []
    json_data = {
        'question': d['question'],
        'answer_1': passages[0] if passages else '',
        'answer_2': passages[1] if len(passages) > 1 else '',
        'answer_3': passages[2] if len(passages) > 2 else '',
        'contents': passages,
    }
    result.append(json_data)

# The context manager guarantees the output is flushed and closed.
with open('test_rerank_1024_top5.json', 'w', encoding='utf-8') as out_file:
    json.dump(result, out_file, ensure_ascii=False, indent=4)