# -*- coding: utf-8 -*-
"""RAG_with_Llama_2_and_LangChain_update.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/18AcG3w4a8Ovoi5rElV4AVZ5oDFvMtYjm
# RAG with Llama 2 and LangChain
Retrieval-Augmented Generation (RAG) is a technique that combines a retriever and a generative language model to deliver accurate responses. It involves retrieving relevant information from a large corpus and then generating contextually appropriate responses to queries. Here we use a quantized version of the Llama 2 13B LLM with LangChain to perform generative QA with RAG. The notebook has been tested in Google Colab with a T4 GPU; please change the runtime type to T4 GPU before running it.
## Install Packages
"""
!pip install transformers==4.37.2 optimum==1.12.0 --quiet
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ --quiet
!pip install langchain==0.1.9 --quiet
# !pip install chromadb
!pip install sentence_transformers==2.4.0 --quiet
!pip install unstructured --quiet
!pip install pdf2image --quiet
!pip install pdfminer.six==20221105 --quiet
!pip install unstructured-inference --quiet
!pip install faiss-gpu==1.7.2 --quiet
!pip install pikepdf==8.13.0 --quiet
!pip install pypdf==4.0.2 --quiet
!pip install pillow_heif==0.15.0 --quiet
"""## Restart Runtime
## Load Llama 2
We will use the quantized version of the LLAMA 2 13B model from HuggingFace for our RAG task.
"""
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
model_name = "TheBloke/Llama-2-13b-Chat-GPTQ"
model = AutoModelForCausalLM.from_pretrained(model_name,
device_map="auto",
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
gen_cfg = GenerationConfig.from_pretrained(model_name)
gen_cfg.max_new_tokens=512
gen_cfg.temperature=0.0000001 # near-zero temperature (must be > 0 when do_sample=True) for effectively deterministic output
gen_cfg.return_full_text=True
gen_cfg.do_sample=True
gen_cfg.repetition_penalty=1.11
pipe=pipeline(
task="text-generation",
model=model,
tokenizer=tokenizer,
generation_config=gen_cfg
)
llm = HuggingFacePipeline(pipeline=pipe)
"""#### Test LLM with Llama 2 prompt structure and LangChain PromptTemplate"""
from textwrap import fill
from langchain.prompts import PromptTemplate
template = """
<s>[INST] <<SYS>>
You are an AI assistant. You are truthful, unbiased and honest in your response.
If you are unsure about an answer, truthfully say "I don't know"
<</SYS>>
{text} [/INST]
"""
prompt = PromptTemplate(
input_variables=["text"],
template=template,
)
text = "Explain artificial intelligence in a few lines"
result = llm.invoke(prompt.format(text=text))
print(fill(result.strip(), width=100))
# Workaround for a Colab locale issue: force UTF-8 as the preferred encoding
import locale
locale.getpreferredencoding = lambda: "UTF-8"
"""## RAG from PDF Files
### A. Create a vector store for the context/external data
Here, we'll create embedding vectors of the unstructured data loaded from the source and store them in a vector store.
#### Download PDF files
"""
!gdown "https://github.com/muntasirhsn/datasets/raw/main/Solar-System-Wikipedia.pdf" # this is just a pdf print of the Solar System page on Wikipedia!
"""####Load PDF Files
Depending on the type of the source data, we can use the appropriate data loader from LangChain to load the data.
"""
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores.utils import filter_complex_metadata # 'filter_complex_metadata' removes complex metadata that are not in str, int, float or bool format
pdf_loader = UnstructuredPDFLoader("/content/Solar-System-Wikipedia.pdf")
pdf_doc = pdf_loader.load()
updated_pdf_doc = filter_complex_metadata(pdf_doc)
"""#### Spit the document into chunks
Due to the limited size of the context window of an LLM, the data need to be divided into smaller chunks with a text splitter like CharacterTextSplitter or RecursiveCharacterTextSplitter. In this way, the smaller chunks can be fed into the LLM.
"""
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
chunked_pdf_doc = text_splitter.split_documents(updated_pdf_doc)
len(chunked_pdf_doc)
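"""To get a feel for what the splitter produces, we can inspect one of the chunks. This is an optional check; `chunked_pdf_doc` is the list of chunks created in the cell above."""
# Preview the first chunk's text and the metadata attached by the loader
print(fill(chunked_pdf_doc[0].page_content[:300], width=100))
print(chunked_pdf_doc[0].metadata)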
"""#### Create a vector database of the chunked documents with HuggingFace embeddings"""
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()
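"""As a quick illustration of what the embedding model does, we can embed a short string and check the size of the resulting vector. Each chunk will be mapped to a dense vector of this fixed dimensionality before being stored in the vector database. This is an optional check, not required for the rest of the notebook."""
sample_vector = embeddings.embed_query("The Solar System formed about 4.6 billion years ago.")
print(len(sample_vector))  # dimensionality of the embedding vectors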
"""We can either use FAISS or Chroma to create the [Vector Store](https://python.langchain.com/docs/modules/data_connection/vectorstores.html)."""
!pip install chromadb --quiet
# Commented out IPython magic to ensure Python compatibility.
# %%time
# # # Create the vectorized db with FAISS
# # from langchain.vectorstores import FAISS
# # db_pdf = FAISS.from_documents(chunked_pdf_doc, embeddings)
#
# # Create the vectorized db with Chroma
# from langchain.vectorstores import Chroma
# db_pdf = Chroma.from_documents(chunked_pdf_doc, embeddings)
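"""Before wiring the vector store into a QA chain, we can query it directly to see which chunks would be retrieved for a given question. This is an optional sanity check and assumes `db_pdf` was created in the cell above."""
sample_query = "When was the solar system formed?"
retrieved_docs = db_pdf.similarity_search(sample_query, k=3)  # return the 3 most similar chunks
for i, doc in enumerate(retrieved_docs, start=1):
    print(f"--- Retrieved chunk {i} ---")
    print(fill(doc.page_content[:300], width=100))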
"""### B. Use RetrievalQA chain
We instantiate a RetrievalQA chain from LangChain, which takes a retriever, an LLM and a chain_type as input arguments. When the QA chain receives a query, the retriever fetches information relevant to the query from the vector store. The ``chain_type="stuff"`` method stuffs all the retrieved information into the context and makes a single call to the language model, which then generates the response from the retrieved documents. [See information on the LangChain Retriever](https://python.langchain.com/docs/use_cases/question_answering/vector_db_qa).
**LLM prompt structure**
We can also pass in the recommended prompt structure for Llama 2 for the QA. In this way, we can instruct the LLM to use only the available context to answer our question. If it cannot find information relevant to our query in the context, it should **NOT** make up an answer; rather, it should state that it is unable to find relevant information in the context.
"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# from langchain.prompts import PromptTemplate
# from langchain.chains import RetrievalQA
#
# # Use the recommended prompt style for the Llama 2 LLM
# prompt_template = """
# <s>[INST] <<SYS>>
# Use the following context to Answer the question at the end. Do not use any other information. If you can't find the relevant information in the context, just say you don't have enough information to answer the question. Don't try to make up an answer.
#
# <</SYS>>
#
# {context}
#
# Question: {question} [/INST]
# """
#
# prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
# Chain_pdf = RetrievalQA.from_chain_type(
# llm=llm,
# chain_type="stuff",
# # retriever=db.as_retriever(search_type="similarity_score_threshold", search_kwargs={'k': 5, 'score_threshold': 0.8})
# # Similarity Search is the default way to retrieve documents relevant to a query, but we can use MMR by setting search_type = "mmr"
# # k defines how many documents are returned; defaults to 4.
# # score_threshold sets a minimum relevance score for documents returned by the retriever when using the "similarity_score_threshold" search type.
# # return_source_documents=True, # Optional parameter, returns the source documents used to answer the question
# retriever=db_pdf.as_retriever(), # (search_kwargs={'k': 5, 'score_threshold': 0.8}),
# chain_type_kwargs={"prompt": prompt},
# )
# query = "When was the solar system formed?"
# result = Chain_pdf.invoke(query)
# print(fill(result['result'].strip(), width=100))
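"""The chain above uses the retriever with its defaults (plain similarity search, k=4). As noted in the comments, other configurations are possible; the sketch below shows two common alternatives (MMR and a similarity-score threshold). These retrievers are optional and are not used in the rest of the notebook."""
# Maximal Marginal Relevance: balances relevance against diversity of the returned chunks
retriever_mmr = db_pdf.as_retriever(search_type="mmr", search_kwargs={"k": 5})
# Similarity search with a minimum relevance score; chunks scoring below the threshold are dropped
retriever_threshold = db_pdf.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 5, "score_threshold": 0.5},
)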
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "Explain in detail how the solar system was formed."
# result = Chain_pdf.invoke(query)
# print(fill(result['result'].strip(), width=100))
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "What are the planets of the solar system composed of? Give a detailed response."
# result = Chain_pdf.invoke(query)
# print(fill(result['result'].strip(), width=100))
"""### C. Hallucination Check
Hallucination in RAG refers to the generation of content by an LLM that is not grounded in the retrieved knowledge.
Let's test our LLM with a query that is not relevant to the context. The model should respond that it does not have enough information to answer this query.
"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "How does the tranformers architecture work?"
# result = Chain_pdf.invoke(query)
# print(fill(result['result'].strip(), width=100))
"""The model responded as expected. The context provided to it do not contain any information on tranformers architectures. So, it cannot answer this question and do not suffer from hallucination!
## RAG from web pages
####Load the document
"""
from langchain.document_loaders import UnstructuredURLLoader
web_loader = UnstructuredURLLoader(
urls=["https://en.wikipedia.org/wiki/Solar_System"], mode="elements", strategy="fast",
)
web_doc = web_loader.load()
updated_web_doc = filter_complex_metadata(web_doc)
"""####Split the documents into chunks"""
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
chunked_web_doc = text_splitter.split_documents(updated_web_doc)
len(chunked_web_doc)
"""#### Create a vector database of the chunked documents with HuggingFace embeddings"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# # Create the vectorized db with FAISS
# from langchain.vectorstores import FAISS
# db_web = FAISS.from_documents(chunked_web_doc, embeddings)
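"""A FAISS index can also be saved to disk and reloaded later, which avoids re-embedding the documents on every run. This is an optional sketch: the folder name `faiss_index_web` is arbitrary, and depending on the LangChain version, `load_local` may require `allow_dangerous_deserialization=True`."""
db_web.save_local("faiss_index_web")  # persist the index and document store to a local folder
# db_web = FAISS.load_local("faiss_index_web", embeddings)  # reload later with the same embedding model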
"""#### RAG with RetrievalQA"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# Chain_web = RetrievalQA.from_chain_type(
# llm=llm,
# chain_type="stuff",
# retriever=db_web.as_retriever(),
# chain_type_kwargs={"prompt": prompt},
# )
# query = "When was the solar system formed?"
# result = Chain_web.invoke(query)
# print(fill(result['result'].strip(), width=100))
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "Explain in detail how the solar system was formed."
# result = Chain_web.invoke(query)
# print(fill(result['result'].strip(), width=100))
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "What are the planets of the solar system composed of? Give a detailed response."
# result = Chain_web.invoke(query)
# print(fill(result['result'].strip(), width=100))
"""#### Hallucination Check"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# query = "How does the tranformers architecture work?"
# result = Chain_web.invoke(query)
# print(fill(result['result'].strip(), width=100))
"""The model does not suffer from hallucination!"""