# -*- coding: utf-8 -*-
"""RAG_with_Llama_2_and_LangChain_update.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/18AcG3w4a8Ovoi5rElV4AVZ5oDFvMtYjm
# RAG with Llama 2 and LangChain
Retrieval-Augmented Generation (RAG) is a technique that combines a retriever and a generative language model to deliver accurate responses. It involves retrieving relevant information from a large corpus and then generating contextually appropriate responses to queries. Here we use a quantized version of the Llama 2 13B LLM with LangChain to perform generative QA with RAG. The notebook has been tested in Google Colab with a T4 GPU; please change the runtime type to T4 GPU before running it.
## Install Packages
"""
!pip install transformers==4.37.2 optimum==1.12.0 --quiet
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ --quiet
!pip install langchain==0.1.9 --quiet
# !pip install chromadb
!pip install sentence_transformers==2.4.0 --quiet
!pip install unstructured --quiet
!pip install pdf2image --quiet
!pip install pdfminer.six==20221105 --quiet
!pip install unstructured-inference --quiet
!pip install faiss-gpu==1.7.2 --quiet
!pip install pikepdf==8.13.0 --quiet
!pip install pypdf==4.0.2 --quiet
!pip install pillow_heif==0.15.0 --quiet
"""## Restart Runtime
## Load Llama 2
We will use the quantized version of the LLAMA 2 13B model from HuggingFace for our RAG task.
"""
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
model_name = "TheBloke/Llama-2-13b-Chat-GPTQ"
model = AutoModelForCausalLM.from_pretrained(model_name,
device_map="auto",
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
gen_cfg = GenerationConfig.from_pretrained(model_name)
gen_cfg.max_new_tokens=512
gen_cfg.temperature=0.0000001 # near-zero temperature (must be > 0 when do_sample=True) for effectively deterministic output
gen_cfg.return_full_text=True
gen_cfg.do_sample=True
gen_cfg.repetition_penalty=1.11
pipe=pipeline(
task="text-generation",
model=model,
tokenizer=tokenizer,
generation_config=gen_cfg
)
llm = HuggingFacePipeline(pipeline=pipe)
"""#### Test LLM with Llama 2 prompt structure and LangChain PromptTemplate"""
from textwrap import fill
from langchain.prompts import PromptTemplate
template = """
<s>[INST] <<SYS>>
You are an AI assistant. You are truthful, unbiased and honest in your response.
If you are unsure about an answer, truthfully say "I don't know"
<</SYS>>
{text} [/INST]
"""
prompt = PromptTemplate(
input_variables=["text"],
template=template,
)
text = "Explain artificial intelligence in a few lines"
result = llm.invoke(prompt.format(text=text))
print(fill(result.strip(), width=100))
# Workaround for a Colab locale issue: force UTF-8 as the preferred encoding
import locale
locale.getpreferredencoding = lambda: "UTF-8"
"""## RAG from PDF Files
### A. Create a vector store for the context/external data
Here, we'll create embedding vectors of the unstructured data loaded from the source and store them in a vector store.
#### Download PDF files
"""
!gdown "https://github.com/muntasirhsn/datasets/raw/main/Solar-System-Wikipedia.pdf" # this is just a pdf print of the Solar System page on Wikipedia!
"""####Load PDF Files
Depending on the type of the source data, we can use the appropriate data loader from LangChain to load the data.
"""
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores.utils import filter_complex_metadata # 'filter_complex_metadata' removes complex metadata that are not in str, int, float or bool format
pdf_loader = UnstructuredPDFLoader("/content/Solar-System-Wikipedia.pdf")
pdf_doc = pdf_loader.load()
updated_pdf_doc = filter_complex_metadata(pdf_doc)
"""#### Spit the document into chunks
Due to the limited size of the context window of an LLM, the data need to be divided into smaller chunks with a text splitter like CharacterTextSplitter or RecursiveCharacterTextSplitter. In this way, the smaller chunks can be fed into the LLM.
"""
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
chunked_pdf_doc = text_splitter.split_documents(updated_pdf_doc)
len(chunked_pdf_doc)
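"""To get a feel for what the splitter produces, we can inspect one of the chunks. This is an optional check; `chunked_pdf_doc` is the list of chunks created in the cell above."""
# Preview the first chunk's text and the metadata attached by the loader
print(fill(chunked_pdf_doc[0].page_content[:300], width=100))
print(chunked_pdf_doc[0].metadata)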
"""#### Create a vector database of the chunked documents with HuggingFace embeddings"""
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()
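"""As a quick illustration of what the embedding model does, we can embed a short string and check the size of the resulting vector. Each chunk will be mapped to a dense vector of this fixed dimensionality before being stored in the vector database. This is an optional check, not required for the rest of the notebook."""
sample_vector = embeddings.embed_query("The Solar System formed about 4.6 billion years ago.")
print(len(sample_vector))  # dimensionality of the embedding vectors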
"""We can either use FAISS or Chroma to create the [Vector Store](https://python.langchain.com/docs/modules/data_connection/vectorstores.html)."""
!pip install chromadb --quiet
# Commented out IPython magic to ensure Python compatibility.
# %%time
# # # Create the vectorized db with FAISS
# # from langchain.vectorstores import FAISS
# # db_pdf = FAISS.from_documents(chunked_pdf_doc, embeddings)
#
# # Create the vectorized db with Chroma
# from langchain.vectorstores import Chroma
# db_pdf = Chroma.from_documents(chunked_pdf_doc, embeddings)
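"""Before wiring the vector store into a QA chain, we can query it directly to see which chunks would be retrieved for a given question. This is an optional sanity check and assumes `db_pdf` was created in the cell above."""
sample_query = "When was the solar system formed?"
retrieved_docs = db_pdf.similarity_search(sample_query, k=3)  # return the 3 most similar chunks
for i, doc in enumerate(retrieved_docs, start=1):
    print(f"--- Retrieved chunk {i} ---")
    print(fill(doc.page_content[:300], width=100))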
"""### B. Use RetrievalQA chain
We instantiate a RetrievalQA chain from LangChain, which takes a retriever, an LLM and a chain_type as input arguments. When the QA chain receives a query, the retriever fetches information relevant to the query from the vector store. The ``chain_type="stuff"`` method stuffs all the retrieved information into the context and makes a single call to the language model, which then generates the response from the retrieved documents. [See information on the LangChain Retriever](https://python.langchain.com/docs/use_cases/question_answering/vector_db_qa).
**LLM prompt structure**
We can also pass in the recommended prompt structure for Llama 2 for the QA. In this way, we can instruct the LLM to use only the available context to answer our question. If it cannot find information relevant to our query in the context, it should **NOT** make up an answer; rather, it should state that it is unable to find relevant information in the context.
"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# from langchain.prompts import PromptTemplate
# from langchain.chains import RetrievalQA
#
# # Use the recommended prompt style for the Llama 2 LLM
# prompt_template = """
# <s>[INST] <<SYS>>
# Use the following context to Answer the question at the end. Do not use any other information. If you can't find the relevant information in the context, just say you don't have enough information to answer the question. Don't try to make up an answer.
#
# <</SYS>>
#
# {context}
#
# Question: {question} [/INST]
# """
#
# prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
# Chain_pdf = RetrievalQA.from_chain_type(
# llm=llm,
# chain_type="stuff",
# # retriever=db.as_retriever(search_type="similarity_score_threshold", search_kwargs={'k': 5, 'score_threshold': 0.8})
# # Similarity Search is the default way to retrieve documents relevant to a query, but we can use MMR by setting search_type = "mmr"
# # k defines how many documents are returned; defaults to 4.
# # score_threshold sets a minimum relevance score for documents returned by the retriever when using the "similarity_score_threshold" search type.
# # return_source_documents=True, # Optional parameter, returns the source documents used to answer the question
# retriever=db_pdf.as_retriever(), # (search_kwargs={'k': 5, 'score_threshold': 0.8}),
# chain_type_kwargs={"prompt": prompt},
# )
# query = "When was the solar system formed?"
# result = Chain_pdf.invoke(query)
# print(fill(result['result'].strip(), width=100))
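"""The chain above uses the retriever with its defaults (plain similarity search, k=4). As noted in the comments, other configurations are possible; the sketch below shows two common alternatives (MMR and a similarity-score threshold). These retrievers are optional and are not used in the rest of the notebook."""
# Maximal Marginal Relevance: balances relevance against diversity of the returned chunks
retriever_mmr = db_pdf.as_retriever(search_type="mmr", search_kwargs={"k": 5})
# Similarity search with a minimum relevance score; chunks scoring below the threshold are dropped
retriever_threshold = db_pdf.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 5, "score_threshold": 0.5},
)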
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "Explain in detail how the solar system was formed."
# result = Chain_pdf.invoke(query)
# print(fill(result['result'].strip(), width=100))
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "What are the planets of the solar system composed of? Give a detailed response."
# result = Chain_pdf.invoke(query)
# print(fill(result['result'].strip(), width=100))
"""### C. Hallucination Check
Hallucination in RAG refers to the generation of content by an LLM that is not grounded in the retrieved knowledge.
Let's test our LLM with a query that is not relevant to the context. The model should respond that it does not have enough information to answer this query.
"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "How does the tranformers architecture work?"
# result = Chain_pdf.invoke(query)
# print(fill(result['result'].strip(), width=100))
"""The model responded as expected. The context provided to it do not contain any information on tranformers architectures. So, it cannot answer this question and do not suffer from hallucination!
## RAG from web pages
####Load the document
"""
from langchain.document_loaders import UnstructuredURLLoader
web_loader = UnstructuredURLLoader(
urls=["https://en.wikipedia.org/wiki/Solar_System"], mode="elements", strategy="fast",
)
web_doc = web_loader.load()
updated_web_doc = filter_complex_metadata(web_doc)
"""####Split the documents into chunks"""
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
chunked_web_doc = text_splitter.split_documents(updated_web_doc)
len(chunked_web_doc)
"""#### Create a vector database of the chunked documents with HuggingFace embeddings"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# # Create the vectorized db with FAISS
# from langchain.vectorstores import FAISS
# db_web = FAISS.from_documents(chunked_web_doc, embeddings)
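"""A FAISS index can also be saved to disk and reloaded later, which avoids re-embedding the documents on every run. This is an optional sketch: the folder name `faiss_index_web` is arbitrary, and depending on the LangChain version, `load_local` may require `allow_dangerous_deserialization=True`."""
db_web.save_local("faiss_index_web")  # persist the index and document store to a local folder
# db_web = FAISS.load_local("faiss_index_web", embeddings)  # reload later with the same embedding model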
"""#### RAG with RetrievalQA"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# Chain_web = RetrievalQA.from_chain_type(
# llm=llm,
# chain_type="stuff",
# retriever=db_web.as_retriever(),
# chain_type_kwargs={"prompt": prompt},
# )
# query = "When was the solar system formed?"
# result = Chain_web.invoke(query)
# print(fill(result['result'].strip(), width=100))
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "Explain in detail how the solar system was formed."
# result = Chain_web.invoke(query)
# print(fill(result['result'].strip(), width=100))
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "What are the planets of the solar system composed of? Give a detailed response."
# result = Chain_web.invoke(query)
# print(fill(result['result'].strip(), width=100))
"""#### Hallucination Check"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "How does the tranformers architecture work?"
# result = Chain_web.invoke(query)
# print(fill(result['result'].strip(), width=100))
"""The model does not suffer from hallucination!"""