Commit
feat: initial commit of the RAG script and job
cindyli committed Apr 18, 2024
1 parent 217c50f commit a00b7df
Showing 2 changed files with 182 additions and 0 deletions.
26 changes: 26 additions & 0 deletions jobs/mistral/RAG/job_rag.sh
@@ -0,0 +1,26 @@
#!/bin/bash
#SBATCH --job-name=RAG-Mistral-7B-Instruct-v0.2
#SBATCH --time=2-00:00
#SBATCH --nodes=1
#SBATCH --gpus-per-node=v100l:1
#SBATCH --mem=64G
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=4
#SBATCH --account=def-whkchun
#SBATCH --output=%x.o%j

module load StdEnv/2023
module load python/3.11.5

virtualenv --no-download $SLURM_TMPDIR/env
source $SLURM_TMPDIR/env/bin/activate

pip install --upgrade pip

module load StdEnv/2023 rust/1.70.0 arrow/15.0.1 gcc/12.3 cudacore/.12.2.2

pip install --no-index torch transformers tensorflow sentence_transformers accelerate==0.25.0 peft==0.5.0 bitsandbytes==0.42.0 datasets==2.17.0 trl
pip install langchain langchain-community unstructured chromadb

echo "=== Use RAG with Mistral-7b-instruct-1.0 from job $SLURM_JOB_ID on nodes $SLURM_JOB_NODELIST."
python ~/mistral/RAG/rag.py
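
For reference, the job script above would typically be submitted to the SLURM scheduler from a login node; a minimal sketch, assuming the file path used in this commit:

# Submit the RAG job; stdout lands in RAG-Mistral-7B-Instruct-v0.2.o<jobid>
# because of the --output=%x.o%j directive in the script.
sbatch jobs/mistral/RAG/job_rag.sh
# Check the job's status in the queue
squeue -u $USER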
156 changes: 156 additions & 0 deletions jobs/mistral/RAG/rag.py
@@ -0,0 +1,156 @@
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
BitsAndBytesConfig,
pipeline
)

from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain_community.vectorstores import Chroma

# The location of the user document
user_files = ["/home/cindyli/mistral/RAG/data/user_doc.txt"]

# The local directory with the model and the tokenizer
model_dir = "/home/cindyli/projects/ctb-whkchun/s2_bliss_LLMs/Mistral-7B-Instruct-v0.2"

# The local directory with the sentence transformer
sentence_transformer_dir = "/home/cindyli/projects/ctb-whkchun/s2_bliss_LLMs/all-MiniLM-L6-v2"

# DB directory
# db_directory = "chroma_local_db"

# Output directory for RAG results (currently unused in this script)
output_dir = "~/projects/ctb-whkchun/s2_bliss/results-rag-mistral"

################################################################################
# Load user documents, split into small pieces and create embeddings for them
################################################################################

# Load user documents
loader = UnstructuredFileLoader(user_files, mode="elements")
raw_documents = loader.load()
print(f"Loaded documents (first 10 rows):\n{raw_documents[:10]}")

# Split the documents into small chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200)
splitted_documents = text_splitter.split_documents(raw_documents)
print(f"Split documents (first 10 chunks):\n{splitted_documents[:10]}")

# Instantiate the embedding class
embedding_func = HuggingFaceEmbeddings(model_name=sentence_transformer_dir)

# Load into Chroma, the vector database
vectordb = Chroma.from_documents(splitted_documents, embedding_func)

# Query the vector DB with a couple of test questions
queries = [
"What's the name of Elaine's nephew?",
"What schools did Elaine attend?"]

for query in queries:
result = vectordb.similarity_search(query)
print(f"Similarity search in Chroma returns for {query}:\n{result[0].page_content}")

# Create a vector store retriever
retriever = vectordb.as_retriever()

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
load_in_4bit=use_4bit,
bnb_4bit_quant_type=bnb_4bit_quant_type,
bnb_4bit_compute_dtype=compute_dtype,
bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
major, _ = torch.cuda.get_device_capability()
if major >= 8:
print("=" * 80)
print("Your GPU supports bfloat16: accelerate training with bf16=True")
print("=" * 80)

#################################################################
# Load tokenizer and model
#################################################################

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load base model
model = AutoModelForCausalLM.from_pretrained(
model_dir,
local_files_only=True,
quantization_config=bnb_config
)

text_generation_pipeline = pipeline(
model=model,
tokenizer=tokenizer,
task="text-generation",
temperature=0.2,
repetition_penalty=1.1,
return_full_text=True,
max_new_tokens=1000,
)

mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

#################################################################
# Create RAG chain
#################################################################

# Create prompt template
prompt_template = """
### [INST] Instruction: Elaine is an AAC user who expresses herself telegraphically. She is now in a conversation with Jutta. Below is the conversation in the meeting. Please help to convert what Elaine said to first-person sentences. Only respond with converted sentences. Here is context to help:
{context}
### QUESTION:
{question} [/INST]
"""

# Create prompt from prompt template
prompt = PromptTemplate(
input_variables=["context", "question"],
template=prompt_template,
)

# Create llm chain
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

rag_chain = ({"context": retriever, "question": RunnablePassthrough()} | llm_chain)

response = rag_chain.invoke("Jutta: Who would you like to invite to your birthday party?\nElaine: Roy, nephew.")
print(f"RAG chain response:\n{response}")
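
As a possible follow-up (not part of this commit), the Chroma index could be written to the db_directory placeholder commented out near the top of rag.py, so the document embeddings do not have to be rebuilt on every run; a minimal sketch, assuming the persist_directory support in the langchain_community Chroma wrapper:

# Hypothetical extension, not in this commit: persist the Chroma index to disk ...
db_directory = "chroma_local_db"
vectordb = Chroma.from_documents(
    splitted_documents, embedding_func, persist_directory=db_directory
)
# ... and, in a later run, reload it instead of re-embedding the documents.
vectordb = Chroma(persist_directory=db_directory, embedding_function=embedding_func)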
