
Updated notebook #3

Open · wants to merge 12 commits into main
2 changes: 1 addition & 1 deletion LICENSE
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
SOFTWARE.
180 changes: 180 additions & 0 deletions data/QnA/good_question.csv

Large diffs are not rendered by default.

230 changes: 230 additions & 0 deletions data/document_test/chunk_slice.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions index/graph_store.json
@@ -0,0 +1 @@
{"graph_dict": {}}
1 change: 1 addition & 0 deletions index/image__vector_store.json
@@ -0,0 +1 @@
{"embedding_dict": {}, "text_id_to_ref_doc_id": {}, "metadata_dict": {}}
1 change: 1 addition & 0 deletions index/index_store.json

Large diffs are not rendered by default.

162 changes: 162 additions & 0 deletions notebooks/llamaindex-load.ipynb
@@ -0,0 +1,162 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-index==0.9.36\n",
"!pip install asyncpg==0.29.0\n",
"%pip install openai --upgrade\n",
"dbutils.library.restartPython()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.llms import AzureOpenAI\n",
"from llama_index.embeddings import AzureOpenAIEmbedding\n",
"from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
"import logging\n",
"import sys\n",
"from collections.abc import Iterator\n",
"from llama_index import ServiceContext, SimpleDirectoryReader, StorageContext\n",
"from llama_index.indices.vector_store import VectorStoreIndex\n",
"from llama_index.vector_stores import PGVectorStore\n",
"import textwrap\n",
"import openai\n",
"from llama_index import download_loader\n",
"# customize textnode - purpose is to add id to each node\n",
"from llama_index.schema import TextNode\n",
"# customize stages of querying https://docs.llamaindex.ai/en/latest/understanding/querying/querying.html\n",
"from llama_index import get_response_synthesizer\n",
"from llama_index.retrievers import VectorIndexRetriever\n",
"from llama_index.query_engine import RetrieverQueryEngine\n",
"from llama_index.postprocessor import SimilarityPostprocessor\n",
"from llama_index import StorageContext, load_index_from_storage\n",
"\n",
"logging.basicConfig(\n",
" stream=sys.stdout, level=logging.INFO\n",
") # logging.DEBUG for more verbose output\n",
"logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download query engine from Azure Blob Storage Container\n",
"from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient\n",
"\n",
"# Your storage account connection string\n",
"connection_string = \"DefaultEndpointsProtocol=https;AccountName=finessetestblobstorage;AccountKey=;EndpointSuffix=core.windows.net\"\n",
"\n",
"# The name of your container\n",
"container_name = \"llamaindex-v1\"\n",
"\n",
"# The name of the virtual folder you want to list files from\n",
"folder_name = \"index\"\n",
"\n",
"# Initialize the BlobServiceClient\n",
"blob_service_client = BlobServiceClient.from_connection_string(connection_string)\n",
"\n",
"# Get the container client\n",
"container_client = blob_service_client.get_container_client(container_name)\n",
"\n",
"# List all blobs in the specified folder\n",
"blobs_list = container_client.list_blobs(name_starts_with=folder_name)\n",
"\n",
"# List all blobs in the container (at the root)\n",
"blobs_list = container_client.list_blobs()\n",
"\n",
"for blob in blobs_list:\n",
" print(\"Blob name: \" + blob.name)\n",
" blob_name = blob.name\n",
" blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)\n",
" # Download the blob to a local file\n",
" download_file_path = \"./index/\" + blob_name\n",
" with open(download_file_path, \"wb\") as download_file:\n",
" download_file.write(blob_client.download_blob().readall())\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load query engine from local folder, 'index'\n",
"from llama_index import StorageContext, load_index_from_storage\n",
"\n",
"# rebuild storage context\n",
"storage_context = StorageContext.from_defaults(persist_dir=\"./index\")\n",
"\n",
"# load index\n",
"index = load_index_from_storage(storage_context)\n",
"\n",
"# configure retriever for debugging and retrieving metadata \n",
"retriever = VectorIndexRetriever(\n",
" index=index,\n",
" similarity_top_k=15, # get top k documents. 15 is arbitrary\n",
")\n",
"\n",
"# configure response synthesizer\n",
"response_synthesizer = get_response_synthesizer()\n",
"\n",
"# assemble query engine\n",
"query_engine = RetrieverQueryEngine(\n",
" retriever=retriever,\n",
" response_synthesizer=response_synthesizer,\n",
" node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example of query execute\n",
"response = query_engine.query(\"How do I import a cat from France to Canada?\")\n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get top k result into a list, in order of match score\n",
"top_k_result = []\n",
"for i in range(15): # arbitrary 15 because similarity_top_k=15 in this example\n",
" top_k_result.append(response.source_nodes[i])\n",
" \n",
"# get content\n",
"response.source_nodes[0].get_content()\n",
"# get embedding\n",
"response.source_nodes[0].embedding\n",
"# get score\n",
"response.source_nodes[0].get_score()\n",
"# get customized metadata. In this example, this retrieves chunk_id\n",
"response.source_nodes[0].metadata"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
196 changes: 196 additions & 0 deletions notebooks/llamaindex-test.ipynb
@@ -0,0 +1,196 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.llms.azure_openai import AzureOpenAI\n",
"from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding\n",
"from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
"import logging\n",
"import sys\n",
"from collections.abc import Iterator\n",
"from sqlalchemy import make_url, create_engine, MetaData\n",
"from llama_index.core import ServiceContext, SimpleDirectoryReader, StorageContext\n",
"from llama_index.core import VectorStoreIndex\n",
"#from llama_index.vector_stores import PGVectorStore\n",
"import textwrap\n",
"import openai\n",
"\n",
"# customize textnode - purpose is to add id to each node\n",
"#from llama_index.schema import TextNode\n",
"# customize stages of querying https://docs.llamaindex.ai/en/latest/understanding/querying/querying.html\n",
"from llama_index.core import get_response_synthesizer\n",
"from llama_index.core.indices.vector_store.retrievers.retriever import VectorIndexRetriever\n",
"from llama_index.core.query_engine import RetrieverQueryEngine\n",
"from llama_index.core.postprocessor import SimilarityPostprocessor\n",
"import pandas as pd\n",
"\n",
"logging.basicConfig(\n",
" stream=sys.stdout, level=logging.INFO\n",
") # logging.DEBUG for more verbose output\n",
"logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"api_key = \"\"\n",
"azure_endpoint = \"\"\n",
"api_version = \"2023-07-01-preview\"\n",
"\n",
"# create llm and embedding model apis\n",
"\n",
"llm = AzureOpenAI(\n",
" model=\"gpt-4\",\n",
" deployment_name=\"ailab-llm\",\n",
" api_key=api_key,\n",
" azure_endpoint=azure_endpoint,\n",
" api_version=api_version,\n",
")\n",
"\n",
"# You need to deploy your own embedding model as well as your own chat completion model\n",
"embed_model = AzureOpenAIEmbedding(\n",
" model=\"text-embedding-ada-002\",\n",
" deployment_name=\"ada\",\n",
" api_key=api_key,\n",
" azure_endpoint=azure_endpoint,\n",
" api_version=api_version,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import Settings\n",
"\n",
"Settings.llm = llm\n",
"Settings.embed_model = embed_model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"storage_context = StorageContext.from_defaults(persist_dir=\"./index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import StorageContext, load_index_from_storage\n",
"index = load_index_from_storage(storage_context)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# configure retriever for debugging and retrieving metadata \n",
"retriever = VectorIndexRetriever(\n",
" index=index,\n",
" similarity_top_k=15,\n",
")\n",
"# configure response synthesizer\n",
"response_synthesizer = get_response_synthesizer()\n",
"# assemble query engine\n",
"query_engine = RetrieverQueryEngine(\n",
" retriever=retriever,\n",
" response_synthesizer=response_synthesizer,\n",
" node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_q['llamaindex_response'] = ''\n",
"df_q['llamaindex_top_15_doc'] = ''\n",
"df_q['llamaindex_top_15_doc_id'] = ''\n",
"df_q['llamaindex_answer_placement'] = ''\n",
"df_result = df_q.copy()\n",
"def find_string_position(string_list, target_string):\n",
" \"\"\"\n",
" Finds the position of a target string in a list of strings.\n",
"\n",
" Parameters:\n",
" - string_list: A list of strings to search through.\n",
" - target_string: The string to find within the list.\n",
"\n",
" Returns:\n",
" - The index (position) of the target string in the list, or -1 if not found.\n",
" \"\"\"\n",
" for index, string in enumerate(string_list):\n",
" if string == target_string:\n",
" return index\n",
" return -1\n",
"for i in range(len(df_result)):\n",
" print('i', i)\n",
" # query\n",
" response = query_engine.query(df_result.iloc[i]['question'])\n",
" print(response.get_formatted_sources())\n",
" print(\"query was:\", df_result.iloc[i]['question'])\n",
" print(\"answer was:\", response)\n",
" # get top k result into a list, in order of match score\n",
" top_k_result = []\n",
" top_k_result_id = []\n",
" for j in range(15):\n",
" top_k_result.append(response.source_nodes[j])\n",
" top_k_result_id.append(response.source_nodes[j].metadata['id_'])\n",
" #print('top_k_result', top_k_result)\n",
" # get customized metadata\n",
" #response.source_nodes[0].metadata\n",
" df_result.at[i,'llamaindex_response'] = response\n",
" df_result.at[i,'llamaindex_top_15_doc'] = top_k_result\n",
" df_result.at[i,'llamaindex_top_15_doc_id'] = top_k_result_id\n",
" df_result.at[i,'llamaindex_answer_placement'] = find_string_position(top_k_result_id, df_result['chunk_id'].iloc[i])\n",
" #print('df_result-------------------', df_result.iloc[i])\n",
"df = df_result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pd.options.display.max_colwidth = 10000\n",
"df[['question','llamaindex_answer_placement','llamaindex_response']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('./data/good_qna_llamaindex_answer.csv',encoding='utf-8-sig')"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
24 changes: 24 additions & 0 deletions testing.md
@@ -0,0 +1,24 @@
# Testing Documentation

This document provides detailed instructions and procedures for manually testing
the functionality of the LlamaIndex notebooks, ensuring that all features operate
correctly and as expected before deployment or release.

## Test Case: test

**Objective:** Verify test

**Preconditions:**
- [ ] test.

**Test Steps:**
1. test

**Expected Results:**
- [ ] test

**Actual Results:**
- [ ] test

**Pass/Fail Criteria:**
- [ ] test