-
Notifications
You must be signed in to change notification settings - Fork 0
/
ingest.py
164 lines (140 loc) · 6.62 KB
/
ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Character splitters used to split text data in your pdf/csv/etc
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
# Vector Store/Databases
from langchain_community.vectorstores import FAISS, Qdrant
# Embeddings creators
from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
# Document Loaders
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import UnstructuredFileLoader
class FAISSVectorStores:
    """Build and persist a local FAISS vector store from a directory of documents.

    Loads files from ``dataset_path`` (pdf / csv / md, or any other type via
    UnstructuredFileLoader), splits them into overlapping text chunks, embeds
    the chunks with a HuggingFace sentence-transformers model, and saves the
    resulting FAISS index to ``faiss_db_path``.
    """

    def __init__(self,
                 dataset_path="./data",
                 dataset_type="pdf",
                 text_splitter="recursive",
                 embeddings_model='sentence-transformers/all-MiniLM-L6-v2',
                 faiss_db_path="./faiss_vector_store",
                 chunk_size=500,
                 chunk_overlap=50,
                 device="cuda",
                 use_multithreading=True):
        # Source directory and the file type used to pick a document loader.
        self.__dataset_path = dataset_path
        self.__dataset_type = dataset_type
        # "recursive" -> RecursiveCharacterTextSplitter; anything else falls
        # back to the plain CharacterTextSplitter.
        self.__text_splitter = text_splitter
        self.__embeddings_model = embeddings_model
        self.__faiss_db_path = faiss_db_path
        self.__chunk_size = chunk_size
        self.__chunk_overlap = chunk_overlap
        # Device string forwarded to the embeddings model, e.g. "cuda" or "cpu".
        self.__device = device
        self.__use_multithreading = use_multithreading

    def __make_loader(self):
        """Return a DirectoryLoader configured for ``dataset_type``."""
        common = dict(show_progress=True,
                      use_multithreading=self.__use_multithreading)
        if self.__dataset_type == "pdf":
            return DirectoryLoader(self.__dataset_path, glob='*.pdf',
                                   loader_cls=PyPDFLoader, **common)
        if self.__dataset_type == "csv":
            return DirectoryLoader(self.__dataset_path, glob='*.csv',
                                   loader_cls=CSVLoader, **common)
        if self.__dataset_type == "md":
            # Markdown relies on DirectoryLoader's default loader class,
            # matching the original code.
            return DirectoryLoader(self.__dataset_path, glob='*.md', **common)
        # Fallback: let Unstructured handle any other file type.
        return DirectoryLoader(self.__dataset_path, glob='*.*',
                               loader_cls=UnstructuredFileLoader, **common)

    def create_vector_store(self):
        """Load, split, embed and persist the documents into a FAISS index.

        Raises:
            Exception: re-raised if the embeddings model cannot be created or
                the FAISS index cannot be built/saved (after logging it).
        """
        documents = self.__make_loader().load()
        splitter_cls = (RecursiveCharacterTextSplitter
                        if self.__text_splitter == "recursive"
                        else CharacterTextSplitter)
        text_splitter = splitter_cls(chunk_size=self.__chunk_size,
                                     chunk_overlap=self.__chunk_overlap)
        texts = text_splitter.split_documents(documents)
        try:
            embeddings = HuggingFaceEmbeddings(model_name=self.__embeddings_model,
                                               model_kwargs={'device': self.__device})
            print("Embeddings generated successfully!")
        except Exception as e:
            print("Error while generating Embeddings. Error : ", e)
            # BUG FIX: the original swallowed this exception and fell through
            # to FAISS.from_documents with `embeddings` unbound, producing a
            # misleading NameError. Re-raise the real failure instead.
            raise
        try:
            db = FAISS.from_documents(texts, embeddings)
            db.save_local(self.__faiss_db_path)
            print("Embeddings stored to FAISS successfully!")
        except Exception as e:
            # Typo fixed from "Errow" in the original message; the failure is
            # still logged rather than raised, preserving best-effort behavior.
            print("Error while storing vectors to FAISS. Error : ", e)
class QdrantVectorDatabase:
    """Build a Qdrant vector collection from a directory of documents.

    Steps to set up the Qdrant server:
    1. docker pull qdrant/qdrant
    2. docker run -p 6333:6333 -p 6334:6334 -v {PATH_TO_YOUR_DIRECTORY}:/qdrant/storage:z qdrant/qdrant
    """

    def __init__(self,
                 dataset_path="./data",
                 dataset_type="pdf",
                 text_splitter="recursive",
                 embeddings_model='sentence-transformers/all-MiniLM-L6-v2',
                 chunk_size=500,
                 chunk_overlap=50,
                 device="cuda",
                 qdrant_url = "http://localhost:6333",
                 name_of_db = "sample_vector_db",
                 use_multithreading=True):
        # Source directory and the file type used to pick a document loader.
        self.__dataset_path = dataset_path
        self.__dataset_type = dataset_type
        # "recursive" -> RecursiveCharacterTextSplitter; anything else falls
        # back to the plain CharacterTextSplitter.
        self.__text_splitter = text_splitter
        self.__embeddings_model = embeddings_model
        self.__chunk_size = chunk_size
        self.__chunk_overlap = chunk_overlap
        # NOTE(review): `device` is stored but never used in this class
        # (SentenceTransformerEmbeddings below takes only model_name);
        # kept for interface compatibility.
        self.__device = device
        self.__qdrant_url = qdrant_url
        self.__collection_name = name_of_db
        self.__use_multithreading = use_multithreading

    def __make_loader(self):
        """Return a DirectoryLoader configured for ``dataset_type``."""
        common = dict(show_progress=True,
                      use_multithreading=self.__use_multithreading)
        if self.__dataset_type == "pdf":
            return DirectoryLoader(self.__dataset_path, glob='*.pdf',
                                   loader_cls=PyPDFLoader, **common)
        if self.__dataset_type == "csv":
            return DirectoryLoader(self.__dataset_path, glob='*.csv',
                                   loader_cls=CSVLoader, **common)
        if self.__dataset_type == "md":
            # Markdown relies on DirectoryLoader's default loader class,
            # matching the original code.
            return DirectoryLoader(self.__dataset_path, glob='*.md', **common)
        # Fallback: let Unstructured handle any other file type.
        return DirectoryLoader(self.__dataset_path, glob='*.*',
                               loader_cls=UnstructuredFileLoader, **common)

    def create_vector_database(self):
        """Load, split, embed and upload the documents to a Qdrant collection.

        Returns:
            None on success, 0 on failure (the original error-code sentinel,
            kept for backward compatibility).
        """
        documents = self.__make_loader().load()
        splitter_cls = (RecursiveCharacterTextSplitter
                        if self.__text_splitter == "recursive"
                        else CharacterTextSplitter)
        text_splitter = splitter_cls(chunk_size=self.__chunk_size,
                                     chunk_overlap=self.__chunk_overlap)
        texts = text_splitter.split_documents(documents)
        embeddings = SentenceTransformerEmbeddings(model_name=self.__embeddings_model)
        try:
            qdrant = Qdrant.from_documents(
                texts,
                embeddings,
                url = self.__qdrant_url,
                prefer_grpc = False,
                collection_name = self.__collection_name
            )
            print("Vectors created!")
        except Exception as e:
            # Typo fixed from "Counldn't" in the original message.
            print("Couldn't create Qdrant Database!")
            print("Exception : ", e)
            return 0
if __name__ == "__main__":
# Create object of class FAISS Vector Store
faiss_vector_store = FAISSVectorStores()
# Call this function to create and store embeddings
faiss_vector_store.create_vector_store()
# Create object of Qdrant DB class
# qdrant_vector_database = QdrantVectorDatabase()
# Call this function to create and store embeddings
# qdrant_vector_database.create_vector_database()