Incorporate llamaindex search in the accuracy testing #16

Merged
Changes from all commits
23 commits
fb662ee
issue #6: abenassi Google-Search-API
ibrahim-kabir Mar 22, 2024
1e84342
issue #6: Nv7-GitHub googlesearch
ibrahim-kabir Mar 22, 2024
c463607
issue #6: bing search by scrapping
ibrahim-kabir Mar 22, 2024
84f42ba
issue #6: google_search completed
ibrahim-kabir Mar 22, 2024
d3849a2
issue #7: Removed punction mark on md files, added total number of 0,…
ibrahim-kabir Mar 21, 2024
bf895a3
issue #6: google_search completed
ibrahim-kabir Mar 22, 2024
232208a
issue #6: google api incorporation
ibrahim-kabir Mar 22, 2024
17a6c80
issue #6: Bing Search works
ibrahim-kabir Mar 25, 2024
18f1df2
issue #6: Refactoring + Bing Search + Bing Filtered Search
ibrahim-kabir Mar 26, 2024
885f447
issue #6: Refactored + Fix markdown issue on links + Add parsing scri…
ibrahim-kabir Mar 27, 2024
7131cc5
issue #6: typos
ibrahim-kabir Apr 2, 2024
e7fe32f
issue #6: typo
ibrahim-kabir Apr 2, 2024
4b3c473
issue #14: add llamaindex search
ibrahim-kabir Apr 3, 2024
9a5838b
issue #14: add top results table
ibrahim-kabir Apr 3, 2024
8f3e312
issue #14: FINESSE_USAGE new instructions
ibrahim-kabir Apr 3, 2024
bb68ba0
issue #14: update cache_path
ibrahim-kabir Apr 3, 2024
67bb3d0
issue #14: update tests
ibrahim-kabir Apr 3, 2024
d63b6b2
issue #14: if name is main
ibrahim-kabir Apr 4, 2024
0356f49
issue #14: add new line on json file
ibrahim-kabir Apr 4, 2024
78184c3
issue #14: sort imports + output folder in env
ibrahim-kabir Apr 4, 2024
3d883f8
issue #14: no need of on_start func
ibrahim-kabir Apr 5, 2024
6d14fd7
Merge branch 'main' into 14-incorporate-llamaindex-search-in-the-accu…
ibrahim-kabir Apr 11, 2024
b195592
issue #14: fix checks fails + added space after comma
ibrahim-kabir Apr 11, 2024
2 changes: 2 additions & 0 deletions .env.template
@@ -1,2 +1,4 @@
BING_SEARCH_KEY =
BING_ENDPOINT =
CACHE_PATH =
OUTPUT_FOLDER =
3 changes: 2 additions & 1 deletion .gitignore
@@ -41,13 +41,14 @@ keys/
flask_session/

# Ignore local QnA json files
QnA
QnA/

# Ignore output of api-test and from the scripts
output/

# Ignore input of the scripts
input/


# Ignore the cache directory
cache/
33 changes: 31 additions & 2 deletions finesse/FINESSE_USAGE.md
@@ -4,6 +4,35 @@ This tool simplifies the process of comparing different search engines and
assessing their accuracy. It's designed to be straightforward, making it easy to
understand and use.

## Configuration

Before using the Finesse Locust script, make sure to set up the necessary
configuration. Follow the steps below:

1. Create a `.env` file in the root directory of the project if it doesn't
already exist.

2. Copy the contents of the `.env.template` file and paste them into the `.env`
file.

3. Replace the placeholder values in the `.env` file with your actual secrets
and configuration settings. In particular, you will need to provide the
necessary credentials for the Bing Search API.

## Caching

Finesse supports caching to improve performance and reduce costs. If you already
have a cache directory from a previous usage, you can reuse it by placing it in
the `finesse` directory. If you don't have a cache directory, Finesse will
automatically create one for you.

The cache directory is used to store expensive API requests, so they don't need
to be repeated unnecessarily. This can significantly speed up subsequent runs of
the Finesse Locust script.

Make sure that the cache directory has the appropriate read and write
permissions for the user running the script.
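
Conceptually, the cache maps a query to its stored result on disk so the expensive API call runs only once. A stdlib-only sketch of that behaviour (the project itself uses `joblib.Memory`; the decorator, directory name, and `expensive_search` function below are illustrative):

```python
import hashlib
import json
import os

def disk_cache(cache_dir):
    """Memoize a function's results as JSON files keyed by its arguments."""
    os.makedirs(cache_dir, exist_ok=True)
    def decorator(func):
        def wrapper(*args, **kwargs):
            raw = json.dumps([args, kwargs], sort_keys=True).encode()
            path = os.path.join(cache_dir, hashlib.sha256(raw).hexdigest() + ".json")
            if os.path.exists(path):          # cache hit: skip the expensive call
                with open(path) as f:
                    return json.load(f)
            result = func(*args, **kwargs)
            with open(path, "w") as f:        # cache miss: store for next time
                json.dump(result, f)
            return result
        return wrapper
    return decorator

calls = []

@disk_cache("cache/")
def expensive_search(query):
    calls.append(query)                       # stands in for a paid API request
    return [f"https://example.com/{query}"]

expensive_search("inspection")  # performs the "API call" and writes the cache file
expensive_search("inspection")  # answered from cache/ without calling again
```

This is why reusing a cache directory from a previous run saves both time and API costs: any query already on disk never reaches the search API.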

## How it Works

- **Single command:**
@@ -70,7 +99,7 @@ sequenceDiagram
finesse-test-->>User: Display results
```

## Example Command
## Example Command for Locust Script

```cmd
$locust -f finesse/finesse_test.py --engine azure --path finesse/QnA/sorted-2024-02-22/ --host https://finesse.inspection.alpha.canada.ca/api --once
@@ -126,7 +155,7 @@ to read than JSON files.
4. **Worksheet Name**: Specify the name of the worksheet containing the data
using the `--sheet-name` argument. By default, it is set to `'To fill'`.

### Example Command
### Example Command for XLSX Converter Script

```bash
python finesse/scripts/xlsx_converter_json.py --input-folder finesse/scripts/input/ --output-folder finesse/scripts/output/ --file-name Finesse_questions_for_testing.xlsx --sheet-name "To fill"
102 changes: 60 additions & 42 deletions finesse/accuracy_functions.py
@@ -2,12 +2,15 @@
import datetime
import csv
import os
import re

from dotenv import load_dotenv
from collections import namedtuple
import regex as re

from finesse.bing_search import BingSearch
from dotenv import load_dotenv

OUTPUT_FOLDER = "./finesse/output"
load_dotenv()
OUTPUT_FOLDER = os.getenv("OUTPUT_FOLDER","./finesse/output")
AccuracyResult = namedtuple("AccuracyResult", ["position", "total_pages", "score"])

def calculate_accuracy(responses_url: list[str], expected_url: list | str) -> AccuracyResult:
@@ -50,16 +53,36 @@ def calculate_accuracy(responses_url: list[str], expected_url: list | str) -> Ac

return AccuracyResult(position, total_pages, score)

def count_top_results(test_data, num_results, accuracy_type):
"""
Counts the number of correct URLs in the top results based on the specified accuracy type.

Args:
test_data (dict): A dictionary containing the test data.
num_results (int): The number of top results to consider.
accuracy_type (str): The type of accuracy to consider, e.g. "accuracy", "bing_accuracy", or "bing_filtered_accuracy".

Returns:
int: The count of correct URLs in the top results.
"""
count = 0
for key, value in test_data.items():
accuracy = value.get(accuracy_type)
if accuracy > 1.0 - (num_results/100):
count += 1
return count
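
The threshold test `accuracy > 1.0 - (num_results/100)` assumes the score encodes the rank of the expected URL, with 1.0 meaning first place and each later position costing 0.01. A self-contained sketch of that logic (the function body mirrors the diff; the sample scores and file names are hypothetical):

```python
def count_top_results(test_data, num_results, accuracy_type):
    """Count questions whose expected URL ranked within the top num_results."""
    count = 0
    for value in test_data.values():
        accuracy = value.get(accuracy_type)
        # score > 1.0 - num_results/100  <=>  rank <= num_results (assumed encoding)
        if accuracy > 1.0 - (num_results / 100):
            count += 1
    return count

# Hypothetical scores: 1.0 = rank 1, 0.98 = rank 3, 0.90 = rank 11
test_data = {
    "q1.json": {"accuracy": 1.0},
    "q2.json": {"accuracy": 0.98},
    "q3.json": {"accuracy": 0.90},
}

print(count_top_results(test_data, 1, "accuracy"))   # 1 (only the rank-1 hit)
print(count_top_results(test_data, 10, "accuracy"))  # 2 (rank 11 misses the cut)
```

Note the strict inequality: a score of exactly `1.0 - num_results/100` is not counted, which is what places rank 11 outside the top-10 bucket.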

def save_to_markdown(test_data: dict, engine: str):

if not os.path.exists(OUTPUT_FOLDER):
os.makedirs(OUTPUT_FOLDER)
date_string = datetime.datetime.now().strftime("%Y-%m-%d")
file_name = f"test_{engine}_{date_string}.md"
output_file = os.path.join(OUTPUT_FOLDER, file_name)
with open(output_file, "w") as md_file:
md_file.write(f"# Test on the {engine} search engine: {date_string}\n\n")
md_file.write(f"# Test on the {engine.title()} search engine: {date_string}\n\n")
md_file.write("## Test data table\n\n")
md_file.write("| 📄 File | 💬 Question| 🔎 Finesse Accuracy Score | 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |⌛ Finesse Time | ⌛ Bing Time | ⌛ Filtered Bing Time |\n")
md_file.write(f"| 📄 File | 💬 Question| 🔎 {engine.title()} Accuracy Score | 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |⌛ {engine.title()} Time | ⌛ Bing Time | ⌛ Filtered Bing Time |\n")
md_file.write("|---|---|---|---|---|---|---|---|\n")
for key, value in test_data.items():
question = ""
@@ -77,35 +100,36 @@ def save_to_markdown(test_data: dict, engine: str):

time_stats, accuracy_stats, bing_accuracy_stats, bing_time_stats, bing_filtered_accuracy_stats, bing_filtered_time_stats = calculate_statistical_summary(test_data)
md_file.write("## Statistical summary\n\n")
md_file.write("| Statistic\Engine | 🔎 Finesse Accuracy score| 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |⌛ Finesse Time | ⌛ Bing Time | ⌛ Filtered Bing Time |\n")
md_file.write(f"| Statistic\Engine | 🔎 {engine.title()} Accuracy score| 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |⌛ {engine.title()} Time | ⌛ Bing Time | ⌛ Filtered Bing Time |\n")

md_file.write("|---|---|---|---|---|---|---|\n")
for stat in ["Mean", "Median", "Standard Deviation", "Maximum", "Minimum"]:
md_file.write(f"|{stat}| {accuracy_stats.get(stat)}% | {bing_accuracy_stats.get(stat)}% | {bing_filtered_accuracy_stats.get(stat)}% |{time_stats.get(stat)}ms | {bing_time_stats.get(stat)}ms | {bing_filtered_time_stats.get(stat)}ms |\n")

md_file.write("\n## Count of null and top scores\n\n")
md_file.write("| Score\Engine | 🔎 Finesse Accuracy score| 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |\n")
md_file.write("\n## Count of top results\n\n")
md_file.write(f"| Count\Engine | 🔎 {engine.title()} Accuracy score| 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |\n")
md_file.write("|---|---|---|---|\n")
finesse_null, finesse_top = count_null_top_scores({key: value.get("accuracy") for key, value in test_data.items()})
bing_null, bing_top = count_null_top_scores({key: value.get("bing_accuracy") for key, value in test_data.items()})
bing_filtered_null, bing_filtered_top = count_null_top_scores({key: value.get("bing_filtered_accuracy") for key, value in test_data.items()})
finesse_top_1 = count_top_results(test_data, 1, "accuracy")
bing_top_1 = count_top_results(test_data, 1, "bing_accuracy")
bing_filtered_top_1 = count_top_results(test_data, 1, "bing_filtered_accuracy")
md_file.write(f"| 🏆 Top 1 | {finesse_top_1} | {bing_top_1} | {bing_filtered_top_1} |\n")

md_file.write(f"| Null (0%) | {finesse_null} | {bing_null} |{bing_filtered_null} |\n")
md_file.write(f"| Top (100%)| {finesse_top} | {bing_top} |{bing_filtered_top} |\n")
finesse_top_3 = count_top_results(test_data, 3, "accuracy")
bing_top_3 = count_top_results(test_data, 3, "bing_accuracy")
bing_filtered_top_3 = count_top_results(test_data, 3, "bing_filtered_accuracy")
md_file.write(f"| ✅ Top 3 | {finesse_top_3} | {bing_top_3} | {bing_filtered_top_3} |\n")

def count_null_top_scores(accuracy_scores: dict):
"""
Counts the number of null scores and top scores in the given accuracy_scores dictionary.

Args:
accuracy_scores (dict): A dictionary containing accuracy scores.
finesse_top_5 = count_top_results(test_data, 5, "accuracy")
bing_top_5 = count_top_results(test_data, 5, "bing_accuracy")
bing_filtered_top_5 = count_top_results(test_data, 5, "bing_filtered_accuracy")
md_file.write(f"|✅ Top 5 | {finesse_top_5} | {bing_top_5} | {bing_filtered_top_5} |\n")

Returns:
tuple: A tuple containing the count of null scores and top scores, respectively.
"""
null_scores = len([score for score in accuracy_scores.values() if score == 0])
top_scores = len([score for score in accuracy_scores.values() if score == 1])
finesse_top_10 = count_top_results(test_data, 10, "accuracy")
bing_top_10 = count_top_results(test_data, 10, "bing_accuracy")
bing_filtered_top_10 = count_top_results(test_data, 10, "bing_filtered_accuracy")
md_file.write(f"|✅ Top 10 | {finesse_top_10} | {bing_top_10} | {bing_filtered_top_10} |\n")

return null_scores, top_scores
md_file.write(f"| ❌ Not in top 10 | {len(test_data) - finesse_top_10} | {len(test_data) - bing_top_10} | {len(test_data) - bing_filtered_top_10} |\n")

def save_to_csv(test_data: dict, engine: str):
if not os.path.exists(OUTPUT_FOLDER):
@@ -115,7 +139,7 @@ def save_to_csv(test_data: dict, engine: str):
output_file = os.path.join(OUTPUT_FOLDER, file_name)
with open(output_file, "w", newline="") as csv_file:
writer = csv.writer(csv_file)
writer.writerow(["File", "Question", "Finesse Accuracy Score", "Bing Accuracy Score", "Filtered Bing Accuracy Score", "Finesse Time", "Bing Time", "Filtered Bing Time"])
writer.writerow(["File", "Question", "Links", "Finesse Accuracy Score", "Bing Accuracy Score", "Filtered Bing Accuracy Score", "Finesse Time", "Bing Time", "Filtered Bing Time"])
for key, value in test_data.items():
question = ""
if isinstance(value.get("expected_page").get("url"), list):
@@ -126,24 +150,16 @@ def save_to_csv(test_data: dict, engine: str):
question = f"[{value.get('question')}]({value.get('expected_page').get('url')})"
writer.writerow([
key,
question,
f"{int(value.get('accuracy')*100)}%",
f"{int(value.get('bing_accuracy')*100)}%",
f"{int(value.get('bing_filtered_accuracy')*100)}%",
f"{int(value.get('time'))}ms",
f"{int(value.get('bing_time'))}ms",
f"{int(value.get('bing_filtered_time'))}ms"
value.get('question'),
f"{int(value.get('accuracy')*100)}",
f"{int(value.get('bing_accuracy')*100)}",
f"{int(value.get('bing_filtered_accuracy')*100)}",
f"{int(value.get('time'))}",
f"{int(value.get('bing_time'))}",
f"{int(value.get('bing_filtered_time'))}"
])
writer.writerow([])

time_stats, accuracy_stats, bing_accuracy_stats, bing_time_stats, bing_filtered_accuracy_stats, bing_filtered_time_stats = calculate_statistical_summary(test_data)
writer.writerow(["Statistic", "Finesse Accuracy Score", "Bing Accuracy Score", "Filtered Bing Accuracy Score", "Finesse Time", "Bing Time", "Filtered Bing Time"])
writer.writerow(["Mean", f"{accuracy_stats.get('Mean')}%", f"{bing_accuracy_stats.get('Mean')}%", f"{bing_filtered_accuracy_stats.get('Mean')}%", f"{time_stats.get('Mean')}ms", f"{bing_time_stats.get('Mean')}ms", f"{bing_filtered_time_stats.get('Mean')}ms"])
writer.writerow(["Median", f"{accuracy_stats.get('Median')}%", f"{bing_accuracy_stats.get('Median')}%", f"{bing_filtered_accuracy_stats.get('Median')}%", f"{time_stats.get('Median')}ms", f"{bing_time_stats.get('Median')}ms", f"{bing_filtered_time_stats.get('Median')}ms"])
writer.writerow(["Standard Deviation", f"{accuracy_stats.get('Standard Deviation')}%", f"{bing_accuracy_stats.get('Standard Deviation')}%", f"{bing_filtered_accuracy_stats.get('Standard Deviation')}%", f"{time_stats.get('Standard Deviation')}ms", f"{bing_time_stats.get('Standard Deviation')}ms", f"{bing_filtered_time_stats.get('Standard Deviation')}ms"])
writer.writerow(["Maximum", f"{accuracy_stats.get('Maximum')}%", f"{bing_accuracy_stats.get('Maximum')}%", f"{bing_filtered_accuracy_stats.get('Maximum')}%", f"{time_stats.get('Maximum')}ms", f"{bing_time_stats.get('Maximum')}ms", f"{bing_filtered_time_stats.get('Maximum')}ms"])
writer.writerow(["Minimum", f"{accuracy_stats.get('Minimum')}%", f"{bing_accuracy_stats.get('Minimum')}%", f"{bing_filtered_accuracy_stats.get('Minimum')}%", f"{time_stats.get('Minimum')}ms", f"{bing_time_stats.get('Minimum')}ms", f"{bing_filtered_time_stats.get('Minimum')}ms"])

def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict, dict, dict, dict, dict]:
"""
Calculate the statistical summary of the test data.
@@ -211,7 +227,9 @@ def update_dict_bing_data(test_data: dict):
load_dotenv()
endpoint = os.getenv("BING_ENDPOINT")
subscription_key = os.getenv("BING_SEARCH_KEY")
search_engine = BingSearch(endpoint, subscription_key)
cache_path = os.getenv("CACHE_PATH", "finesse/cache/")
search_engine = BingSearch(endpoint, subscription_key, cache_path)

count = 1
for key, value in copy_data.items():
question = value.get("question")
12 changes: 9 additions & 3 deletions finesse/bing_search.py
@@ -1,17 +1,23 @@
from azure.cognitiveservices.search.websearch import WebSearchClient
from msrest.authentication import CognitiveServicesCredentials
import time
import statistics

from azure.cognitiveservices.search.websearch import WebSearchClient
from msrest.authentication import CognitiveServicesCredentials
from joblib import Memory

class BingSearch():
"""
A class for performing web searches using the Bing Search API.
"""

def __init__(self, endpoint, subscription_key):
def __init__(self, endpoint, subscription_key, cache_dir):
self.endpoint = endpoint
self.subscription_key = subscription_key
self.client = WebSearchClient(endpoint=self.endpoint, credentials=CognitiveServicesCredentials(self.subscription_key))
self.client.config.base_url = '{Endpoint}/v7.0' # Temporary change to fix the error. Issue opened https://github.com/Azure/azure-sdk-for-python/issues/34917
self.cache_dir = cache_dir
self.memory = Memory(cache_dir, verbose=0)
self.search_urls = self.memory.cache(self.search_urls, ignore=['self'])

def search_urls(self, query: str, num_results: int = 100) -> tuple[list[str], float]:
"""
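
The diff wraps `search_urls` with `self.memory.cache(self.search_urls, ignore=['self'])` so that the instance object is excluded from the cache key — otherwise two `BingSearch` instances with identical queries would never share cached results. A stdlib-only sketch of the same idea (the project itself uses `joblib.Memory`; the `Search` class and URLs below are hypothetical):

```python
import functools

def cache_ignoring_self(method):
    """Memoize an instance method, keying only on its arguments, not on self."""
    cache = {}
    @functools.wraps(method)
    def wrapper(self, *args, **kwargs):
        key = (args, tuple(sorted(kwargs.items())))  # `self` deliberately excluded
        if key not in cache:
            cache[key] = method(self, *args, **kwargs)
        return cache[key]
    return wrapper

class Search:
    def __init__(self):
        self.calls = 0  # counts real (non-cached) invocations

    @cache_ignoring_self
    def search_urls(self, query):
        self.calls += 1
        return [f"https://example.com/{query}"]

s = Search()
s.search_urls("bees")
s.search_urls("bees")  # cache hit: the method body does not run again
print(s.calls)         # 1
```

`joblib.Memory` applies the same principle but persists results to the cache directory on disk, which is why repeated runs of the Locust script can skip Bing API calls entirely.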
68 changes: 33 additions & 35 deletions finesse/finesse_test.py
@@ -1,9 +1,11 @@
from locust import HttpUser, task, events
from jsonreader import JSONReader
import os
import json

from locust import HttpUser, task, events

from accuracy_functions import save_to_markdown, save_to_csv, calculate_accuracy, update_dict_bing_data
from host import is_host_up
from jsonreader import JSONReader

global_test_data = dict()
settings = dict()
@@ -45,38 +47,34 @@ def search_accuracy(self):
print("Stopping the running test")
self.environment.runner.quit()

if self.engine in ["ai-lab", "azure", "static"]:
question = json_data.get("question")
expected_url = json_data.get("url")
file_name = self.qna_reader.file_name
response_url : list[str] = []
search_url = f"{self.host}/search/{self.engine}?top={self.top}"
data = json.dumps({'query': f'{question}'})
headers = { "Content-Type": "application/json" }
response = self.client.post(search_url, data=data, headers=headers)

if response.status_code == 200:
response_pages = response.json()
for page in response_pages:
response_url.append(page.get("url"))
accuracy_result = calculate_accuracy(response_url, expected_url)
time_taken = round(response.elapsed.total_seconds()*1000,3)
expected_page = json_data.copy()
del expected_page['question']
del expected_page['answer']
global_test_data[file_name] = {
"question": question,
"expected_page": expected_page,
"response_pages": response_pages,
"position": accuracy_result.position,
"total_pages": accuracy_result.total_pages,
"accuracy": accuracy_result.score,
"time": time_taken,
"top": self.top,
}
question = json_data.get("question")
expected_url = json_data.get("url")
file_name = self.qna_reader.file_name
response_url : list[str] = []
search_url = f"{self.host}/search/{self.engine}?top={self.top}"
data = json.dumps({'query': f'{question}'})
headers = { "Content-Type": "application/json" }
response = self.client.post(search_url, data=data, headers=headers)

def on_start(self):
self.qna_reader = JSONReader(self.path)
if response.status_code == 200:
response_pages = response.json()
for page in response_pages:
response_url.append(page.get("url"))
accuracy_result = calculate_accuracy(response_url, expected_url)
time_taken = round(response.elapsed.total_seconds()*1000, 3)
expected_page = json_data.copy()
del expected_page['question']
del expected_page['answer']
global_test_data[file_name] = {
"question": question,
"expected_page": expected_page,
"response_pages": response_pages,
"position": accuracy_result.position,
"total_pages": accuracy_result.total_pages,
"accuracy": accuracy_result.score,
"time": time_taken,
"top": self.top,
}

def on_stop(self):
if not global_test_data:
@@ -94,14 +92,14 @@ def __init__(self, *args, **kwargs):
settings["once"] = self.once
settings["top"] = self.top
settings["path"] = self.path

self.qna_reader = JSONReader(self.path)

@events.quit.add_listener
def quit(**_kwargs):
print("Search accuracy test completed")
print("Starting bing search test")
update_dict_bing_data(global_test_data)
if settings.get("format") == "md":
save_to_markdown(global_test_data, "azure")
save_to_markdown(global_test_data, settings.get("engine"))
elif settings.get("format") == "csv":
save_to_csv(global_test_data, settings.get("engine"))