
Updated notebook #3

Open · wants to merge 12 commits into main
2 changes: 1 addition & 1 deletion LICENSE
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
SOFTWARE.
180 changes: 180 additions & 0 deletions data/QnA/good_question.csv

Large diffs are not rendered by default.

230 changes: 230 additions & 0 deletions data/document_test/chunk_slice.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions index/graph_store.json
@@ -0,0 +1 @@
{"graph_dict": {}}
1 change: 1 addition & 0 deletions index/image__vector_store.json
@@ -0,0 +1 @@
{"embedding_dict": {}, "text_id_to_ref_doc_id": {}, "metadata_dict": {}}
1 change: 1 addition & 0 deletions index/index_store.json

Large diffs are not rendered by default.

162 changes: 162 additions & 0 deletions notebooks/llamaindex-load.ipynb
@@ -0,0 +1,162 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-index==0.9.36\n",
"!pip install asyncpg==0.29.0\n",
"%pip install openai --upgrade\n",
"dbutils.library.restartPython()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.llms import AzureOpenAI\n",
"from llama_index.embeddings import AzureOpenAIEmbedding\n",
"from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
"import logging\n",
"import sys\n",
"from collections.abc import Iterator\n",
"from llama_index import ServiceContext, SimpleDirectoryReader, StorageContext\n",
"from llama_index.indices.vector_store import VectorStoreIndex\n",
"from llama_index.vector_stores import PGVectorStore\n",
"import textwrap\n",
"import openai\n",
"from llama_index import download_loader\n",
"# customize textnode - purpose is to add id to each node\n",
"from llama_index.schema import TextNode\n",
"# customize stages of querying https://docs.llamaindex.ai/en/latest/understanding/querying/querying.html\n",
"from llama_index import get_response_synthesizer\n",
"from llama_index.retrievers import VectorIndexRetriever\n",
"from llama_index.query_engine import RetrieverQueryEngine\n",
"from llama_index.postprocessor import SimilarityPostprocessor\n",
"from llama_index import StorageContext, load_index_from_storage\n",
"\n",
"logging.basicConfig(\n",
" stream=sys.stdout, level=logging.INFO\n",
") # logging.DEBUG for more verbose output\n",
"logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download query engine from Azure Blob Storage Container\n",
"from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient\n",
"\n",
"# Your storage account connection string\n",
"connection_string = \"DefaultEndpointsProtocol=https;AccountName=finessetestblobstorage;AccountKey=;EndpointSuffix=core.windows.net\"\n",
"\n",
"# The name of your container\n",
"container_name = \"llamaindex-v1\"\n",
"\n",
"# The name of the virtual folder you want to list files from\n",
"folder_name = \"index\"\n",
"\n",
"# Initialize the BlobServiceClient\n",
"blob_service_client = BlobServiceClient.from_connection_string(connection_string)\n",
"\n",
"# Get the container client\n",
"container_client = blob_service_client.get_container_client(container_name)\n",
"\n",
"# List all blobs in the specified folder\n",
"blobs_list = container_client.list_blobs(name_starts_with=folder_name)\n",
"\n",
"# List all blobs in the container (at the root)\n",
"blobs_list = container_client.list_blobs()\n",
"\n",
"for blob in blobs_list:\n",
" print(\"Blob name: \" + blob.name)\n",
" blob_name = blob.name\n",
" blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)\n",
" # Download the blob to a local file\n",
" download_file_path = \"./index/\" + blob_name\n",
" with open(download_file_path, \"wb\") as download_file:\n",
" download_file.write(blob_client.download_blob().readall())\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load query engine from local folder, 'index'\n",
"from llama_index import StorageContext, load_index_from_storage\n",
"\n",
"# rebuild storage context\n",
"storage_context = StorageContext.from_defaults(persist_dir=\"./index\")\n",
"\n",
"# load index\n",
"index = load_index_from_storage(storage_context)\n",
"\n",
"# configure retriever for debugging and retrieving metadata \n",
"retriever = VectorIndexRetriever(\n",
" index=index,\n",
" similarity_top_k=15, # get top k documents. 15 is arbitrary\n",
")\n",
"\n",
"# configure response synthesizer\n",
"response_synthesizer = get_response_synthesizer()\n",
"\n",
"# assemble query engine\n",
"query_engine = RetrieverQueryEngine(\n",
" retriever=retriever,\n",
" response_synthesizer=response_synthesizer,\n",
" node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example of query execute\n",
"response = query_engine.query(\"How do I import a cat from France to Canada?\")\n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get top k result into a list, in order of match score\n",
"top_k_result = []\n",
"for i in range(15): # arbitrary 15 because similarity_top_k=15 in this example\n",
" top_k_result.append(response.source_nodes[i])\n",
" \n",
"# get content\n",
"response.source_nodes[0].get_content()\n",
"# get embedding\n",
"response.source_nodes[0].embedding\n",
"# get score\n",
"response.source_nodes[0].get_score()\n",
"# get customized metadata. In this example, this retrieves chunk_id\n",
"response.source_nodes[0].metadata"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
196 changes: 196 additions & 0 deletions notebooks/llamaindex-test.ipynb
@@ -0,0 +1,196 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.llms.azure_openai import AzureOpenAI\n",
"from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding\n",
"from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
"import logging\n",
"import sys\n",
"from collections.abc import Iterator\n",
"from sqlalchemy import make_url, create_engine, MetaData\n",
"from llama_index.core import ServiceContext, SimpleDirectoryReader, StorageContext\n",
"from llama_index.core import VectorStoreIndex\n",
"#from llama_index.vector_stores import PGVectorStore\n",
"import textwrap\n",
"import openai\n",
"\n",
"# customize textnode - purpose is to add id to each node\n",
"#from llama_index.schema import TextNode\n",
"# customize stages of querying https://docs.llamaindex.ai/en/latest/understanding/querying/querying.html\n",
"from llama_index.core import get_response_synthesizer\n",
"from llama_index.core.indices.vector_store.retrievers.retriever import VectorIndexRetriever\n",
"from llama_index.core.query_engine import RetrieverQueryEngine\n",
"from llama_index.core.postprocessor import SimilarityPostprocessor\n",
"import pandas as pd\n",
"\n",
"logging.basicConfig(\n",
" stream=sys.stdout, level=logging.INFO\n",
") # logging.DEBUG for more verbose output\n",
"logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"api_key = \"\"\n",
"azure_endpoint = \"\"\n",
"api_version = \"2023-07-01-preview\"\n",
"\n",
"# create llm and embedding model apis\n",
"\n",
"llm = AzureOpenAI(\n",
" model=\"gpt-4\",\n",
" deployment_name=\"ailab-llm\",\n",
" api_key=api_key,\n",
" azure_endpoint=azure_endpoint,\n",
" api_version=api_version,\n",
")\n",
"\n",
"# You need to deploy your own embedding model as well as your own chat completion model\n",
"embed_model = AzureOpenAIEmbedding(\n",
" model=\"text-embedding-ada-002\",\n",
" deployment_name=\"ada\",\n",
" api_key=api_key,\n",
" azure_endpoint=azure_endpoint,\n",
" api_version=api_version,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import Settings\n",
"\n",
"Settings.llm = llm\n",
"Settings.embed_model = embed_model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"storage_context = StorageContext.from_defaults(persist_dir=\"./index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import StorageContext, load_index_from_storage\n",
"index = load_index_from_storage(storage_context)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# configure retriever for debugging and retrieving metadata \n",
"retriever = VectorIndexRetriever(\n",
" index=index,\n",
" similarity_top_k=15,\n",
")\n",
"# configure response synthesizer\n",
"response_synthesizer = get_response_synthesizer()\n",
"# assemble query engine\n",
"query_engine = RetrieverQueryEngine(\n",
" retriever=retriever,\n",
" response_synthesizer=response_synthesizer,\n",
" node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_q['llamaindex_response'] = ''\n",
"df_q['llamaindex_top_15_doc'] = ''\n",
"df_q['llamaindex_top_15_doc_id'] = ''\n",
"df_q['llamaindex_answer_placement'] = ''\n",
"df_result = df_q.copy()\n",
"def find_string_position(string_list, target_string):\n",
" \"\"\"\n",
" Finds the position of a target string in a list of strings.\n",
"\n",
" Parameters:\n",
" - string_list: A list of strings to search through.\n",
" - target_string: The string to find within the list.\n",
"\n",
" Returns:\n",
" - The index (position) of the target string in the list, or -1 if not found.\n",
" \"\"\"\n",
" for index, string in enumerate(string_list):\n",
" if string == target_string:\n",
" return index\n",
" return -1\n",
"for i in range(len(df_result)):\n",
" print('i', i)\n",
" # query\n",
" response = query_engine.query(df_result.iloc[i]['question'])\n",
" print(response.get_formatted_sources())\n",
" print(\"query was:\", df_result.iloc[i]['question'])\n",
" print(\"answer was:\", response)\n",
" # get top k result into a list, in order of match score\n",
" top_k_result = []\n",
" top_k_result_id = []\n",
" for j in range(15):\n",
" top_k_result.append(response.source_nodes[j])\n",
" top_k_result_id.append(response.source_nodes[j].metadata['id_'])\n",
" #print('top_k_result', top_k_result)\n",
" # get customized metadata\n",
" #response.source_nodes[0].metadata\n",
" df_result.at[i,'llamaindex_response'] = response\n",
" df_result.at[i,'llamaindex_top_15_doc'] = top_k_result\n",
" df_result.at[i,'llamaindex_top_15_doc_id'] = top_k_result_id\n",
" df_result.at[i,'llamaindex_answer_placement'] = find_string_position(top_k_result_id, df_result['chunk_id'].iloc[i])\n",
" #print('df_result-------------------', df_result.iloc[i])\n",
"df = df_result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pd.options.display.max_colwidth = 10000\n",
"df[['question','llamaindex_answer_placement','llamaindex_response']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('./data/good_qna_llamaindex_answer.csv',encoding='utf-8-sig')"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
24 changes: 24 additions & 0 deletions testing.md
@@ -0,0 +1,24 @@
# Testing Documentation

This document provides detailed instructions and procedures for manually testing
the functionality of the LlamaIndex notebooks, ensuring that all features operate
correctly and as expected before deployment or release.

## Test Case: test

**Objective:** Verify test

**Preconditions:**
- [ ] test.

**Test Steps:**
1. test

**Expected Results:**
- [ ] test

**Actual Results:**
- [ ] test

**Pass/Fail Criteria:**
- [ ] test