Merge pull request #9 from ai-cfia/6-compare-finesses-score-against-g…

…oogles Incorporate Public Search Engine Comparison
ai-cfia · Apr 11, 2024 · 24965cb · 24965cb
2 parents 72ba3fb + e10be07
commit 24965cb
Show file tree

Hide file tree

Showing 11 changed files with 420 additions and 89 deletions.
diff --git a/.env.template b/.env.template
@@ -0,0 +1,2 @@
+BING_SEARCH_KEY =
+BING_ENDPOINT =
diff --git a/.gitignore b/.gitignore
@@ -43,5 +43,11 @@ flask_session/
 # Ignore local QnA json files
 QnA
 
-# Ignore output of api-test
-output
+# Ignore output of api-test and from the scripts
+output/
+
+# Ignore input of the scripts
+input/
+
+# Ignore the cache directory
+cache/
diff --git a/finesse/FINESSE_USAGE.md b/finesse/FINESSE_USAGE.md
@@ -1,8 +1,8 @@
 # How to use the Finesse Locust script
 
 This tool simplifies the process of comparing different search engines and
-assessing their accuracy. It's designed to be straightforward, making it easy
-to understand and use.
+assessing their accuracy. It's designed to be straightforward, making it easy to
+understand and use.
 
 ## How it Works
 
@@ -16,8 +16,8 @@ to understand and use.
       - `static`: Static search engine
       - `llamaindex`: LlamaIndex search engine
     - `--path [directory path]`: Point to the directory with files structured
-    - `--host [API URL]`: Point to the finesse-backend URL
-      with JSON files with the following properties:
+    - `--host [API URL]`: Point to the finesse-backend URL with JSON files with
+      the following properties:
       - `score`: The score of the page.
       - `crawl_id`: The unique identifier associated with the crawl table.
       - `chunk_id`: The unique identifier of the chunk.
@@ -43,7 +43,8 @@ to understand and use.
 - **Round trip time**
   - Measure round trip time of each request
 - **Summary statistical value**
-  - Measure the average, median, standard deviation, minimum and maximal accuracy scores and round trip time
+  - Measure the average, median, standard deviation, minimum and maximal
+    accuracy scores and round trip time
 
 ## Diagram
 
@@ -100,3 +101,49 @@ Accuracy statistical summary:
 
 This example shows the CLI output of the tool, analyzing search results from
 Azure Search and providing an accuracy score for Finesse.
+
+## Scripts
+
+### XLSX Converter to JSON 📄
+
+This script converts data from an Excel file (.xlsx) into JSON format. It is
+used for questions created by non-developers, because Excel files are easier
+to read than JSON files.
+
+### Usage
+
+1. **Input Excel File**: Place the Excel file containing the data in the
+   specified input folder (`--input-folder`). By default, the input folder is
+   set to `'finesse/scripts/input/'`.
+
+2. **Output Folder**: Specify the folder where the resulting JSON files will be
+   saved using the `--output-folder` argument. By default, the output folder is
+   set to `'finesse/scripts/output/'`.
+
+3. **Input File Name**: Provide the name of the input Excel file using the
+   `--file-name` argument.
+
+4. **Worksheet Name**: Specify the name of the worksheet containing the data
+   using the `--sheet-name` argument. By default, it is set to `'To fill'`.
+
+### Example Command
+
+```bash
+python finesse/scripts/xlsx_converter_json.py --input-folder finesse/scripts/input/ --output-folder finesse/scripts/output/ --file-name Finesse_questions_for_testing.xlsx --sheet-name "To fill"
+```
+
+Replace `Finesse_questions_for_testing.xlsx` with the actual name of your input
+Excel file and `"To fill"` with the name of the worksheet containing the data.
+
+### Output
+
+The script generates individual JSON files for each row of data in the specified
+output folder. Each JSON file contains the following fields:
+
+- `question`: The question extracted from the Excel file.
+- `answer`: The answer extracted from the Excel file.
+- `title`: The title(s) extracted from specified columns in the Excel file.
+- `url`: The URL(s) extracted from specified columns in the Excel file.
+
+Upon completion, the script prints "Conversion completed successfully!" to
+indicate that the conversion process is finished.
diff --git a/finesse/accuracy_functions.py b/finesse/accuracy_functions.py
diff --git a/finesse/bing_search.py b/finesse/bing_search.py
@@ -0,0 +1,42 @@
+from azure.cognitiveservices.search.websearch import WebSearchClient
+from msrest.authentication import CognitiveServicesCredentials
+import time
+import statistics
+class BingSearch():
+    """
+    A class for performing web searches using the Bing Search API.
+    """
+
+    def __init__(self, endpoint, subscription_key):
+        self.endpoint = endpoint
+        self.subscription_key = subscription_key
+        self.client = WebSearchClient(endpoint=self.endpoint, credentials=CognitiveServicesCredentials(self.subscription_key))
+        self.client.config.base_url = '{Endpoint}/v7.0' # Temporary change to fix the error. Issue opened https://github.com/Azure/azure-sdk-for-python/issues/34917
+
+    def search_urls(self, query: str, num_results: int = 100) -> tuple[list[str], float]:
+        """
+        Search for URLs using the Bing Search API.
+
+        Args:
+            query (str): The search query.
+            num_results (int, optional): The number of results to retrieve. Defaults to 100.
+
+        Returns:
+            tuple[list[str], float]: A tuple containing a list of URLs and the average elapsed time for the search.
+        """
+        urls = []
+        elapsed_time = []
+        offset = 0
+        # Limit of 50 results per query and Bing Search return less than 50 web results
+        while len(urls) < num_results:
+            start_time = time.time()
+            web_data = self.client.web.search(query=query, market="en-ca", count=50, response_filter=["Webpages"], offset=offset)
+            elapsed_time.append(time.time() - start_time)
+            if hasattr(web_data, 'web_pages') and web_data.web_pages is not None:
+                urls.extend([item.url for item in web_data.web_pages.value])
+            try:
+                offset += len([item.url for item in web_data.web_pages.value])
+            except AttributeError:
+                break
+        urls = urls[:num_results]
+        return urls, statistics.mean(elapsed_time) * 1000
diff --git a/finesse/finesse_test.py b/finesse/finesse_test.py
@@ -2,9 +2,11 @@
 from jsonreader import JSONReader
 import os
 import json
-from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy
+from accuracy_functions import save_to_markdown, save_to_csv, calculate_accuracy, update_dict_bing_data
 from host import is_host_up
 
+global_test_data = dict()
+settings = dict()
 class NoTestDataError(Exception):
     """Raised when all requests have failed and there is no test data"""
 
@@ -58,39 +60,48 @@ def search_accuracy(self):
                 for page in response_pages:
                     response_url.append(page.get("url"))
                 accuracy_result = calculate_accuracy(response_url, expected_url)
-                time_taken = round(response.elapsed.microseconds/1000,3)
-
+                time_taken = round(response.elapsed.total_seconds()*1000,3)
                 expected_page = json_data.copy()
                 del expected_page['question']
                 del expected_page['answer']
-                self.qna_results[file_name] = {
+                global_test_data[file_name] = {
                     "question": question,
                     "expected_page": expected_page,
                     "response_pages": response_pages,
                     "position": accuracy_result.position,
                     "total_pages": accuracy_result.total_pages,
                     "accuracy": accuracy_result.score,
                     "time": time_taken,
+                    "top": self.top,
                 }
 
     def on_start(self):
         self.qna_reader = JSONReader(self.path)
-        self.qna_results = dict()
 
     def on_stop(self):
-        if not self.qna_results:
+        if not global_test_data:
             raise NoTestDataError
 
-        log_data(self.qna_results)
-        if self.format == "md":
-            save_to_markdown(self.qna_results, self.engine)
-        elif self.format == "csv":
-            save_to_csv(self.qna_results, self.engine)
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.path = self.environment.parsed_options.path
         self.engine = self.environment.parsed_options.engine
         self.format = self.environment.parsed_options.format
         self.once = self.environment.parsed_options.once
         self.top = self.environment.parsed_options.top
+        settings["engine"] = self.engine
+        settings["format"] = self.format
+        settings["once"] = self.once
+        settings["top"] = self.top
+        settings["path"] = self.path
+
+
+@events.quit.add_listener
+def quit(**_kwargs):
+    print("Search accuracy test completed")
+    print("Starting bing search test")
+    update_dict_bing_data(global_test_data)
+    if settings.get("format") == "md":
+        save_to_markdown(global_test_data, "azure")
+    elif settings.get("format") == "csv":
+        save_to_csv(global_test_data, settings.get("engine"))
diff --git a/finesse/jsonreader.py b/finesse/jsonreader.py
@@ -1,13 +1,14 @@
 import json
 from typing import Iterator
 import os
-
+from natsort import natsorted
 class JSONReader(Iterator):
     "Read test data from JSON files using an iterator"
 
     def __init__(self, directory):
         self.directory = directory
-        self.file_list = sorted([f for f in os.listdir(directory) if f.endswith('.json')])
+        self.file_list = natsorted([f for f in os.listdir(directory) if f.endswith('.json')])
+
         if not self.file_list:
             raise FileNotFoundError(f"No JSON files found in the directory '{directory}'")
         self.current_file_index = 0

diff --git a/finesse/scripts/xlsx_converter_json.py b/finesse/scripts/xlsx_converter_json.py
@@ -0,0 +1,56 @@
+import openpyxl
+import os
+import json
+import argparse
+
+parser = argparse.ArgumentParser(description='XLSX Converter to JSON')
+parser.add_argument('--input-folder', dest='input_folder', default='finesse/scripts/input/', help='Path to the input folder')
+parser.add_argument('--output-folder', dest='output_folder', default='finesse/scripts/output/', help='Path to the output folder')
+parser.add_argument('--file-name', dest='file_name', help='Name of the input file')
+parser.add_argument('--sheet-name', dest='sheet_name', default='To fill', help='Name of the worksheet')
+
+args = parser.parse_args()
+
+INPUT_FOLDER = args.input_folder
+OUTPUT_FOLDER = args.output_folder
+FILE_NAME = args.file_name
+SHEET_NAME = args.sheet_name
+FILE_PATH = os.path.join(INPUT_FOLDER, FILE_NAME)
+
+workbook = openpyxl.load_workbook(FILE_PATH)
+worksheet = workbook.active
+count = 1
+
+for row in range(5, worksheet.max_row + 1):
+    question = worksheet.cell(row=row, column=2).value
+    if question is None:
+        continue
+
+    answer = worksheet.cell(row=row, column=3).value
+
+    titles = []
+    links = []
+    for col in range(5, 10):
+        title = worksheet.cell(row=row, column=col).value
+        link = worksheet.cell(row=row, column=col).hyperlink
+        if title:
+            titles.append(title)
+        if link:
+            links.append(link.target)
+
+    data = {
+        'question': question or "",
+        'answer': answer or "",
+        'title': titles[0] if len(titles) == 1 else titles or "",
+        'url': links[0] if len(links) == 1 else links or ""
+    }
+
+    # Enregistrement du fichier JSON
+    output_file = os.path.join(OUTPUT_FOLDER, f'question_{count}.json')
+    if not os.path.exists(OUTPUT_FOLDER):
+        os.makedirs(OUTPUT_FOLDER)
+    with open(output_file, 'w', encoding='utf-8') as json_file:
+        json.dump(data, json_file, ensure_ascii=False, indent=4)
+    count += 1
+
+print("Conversion completed successfully!")
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,7 @@
 locust
 regex
+python-dotenv
+azure-cognitiveservices-search-websearch
+msrest
+openpyxl
+natsort
diff --git a/tests/test_accuracy_functions.py b/tests/test_accuracy_functions.py
@@ -16,5 +16,23 @@ def test_calculate_accuracy(self):
         self.assertEqual(result.total_pages, 4)
         self.assertEqual(result.score, 0.75)
 
+    def test_calculate_accuracy_multiple_expected_urls(self):
+        responses_url = [
+            "https://inspection.canada.ca/exporting-food-plants-or-animals/food-exports/food-specific-export-requirements/meat/crfpcp/eng/1434119937443/1434120400252",
+            "https://inspection.canada.ca/protection-des-vegetaux/especes-envahissantes/directives/date/d-08-04/fra/1323752901318/1323753612811",
+            "https://inspection.canada.ca/varietes-vegetales/vegetaux-a-caracteres-nouveaux/demandeurs/directive-94-08/documents-sur-la-biologie/lens-culinaris-medikus-lentille-/fra/1330978380871/1330978449837",
+            "https://inspection.canada.ca/protection-des-vegetaux/especes-envahissantes/directives/date/d-96-15/fra/1323854808025/1323854941807"
+        ]
+        expected_urls = [
+            "https://inspection.canada.ca/animal-health/terrestrial-animals/exports/pets/brunei-darussalam/eng/1475849543824/1475849672294",
+            "https://inspection.canada.ca/animal-health/terrestrial-animals/exports/pets/eu-commercial-/instructions/eng/1447782811647/1447782887583",
+            "https://inspection.canada.ca/protection-des-vegetaux/especes-envahissantes/directives/date/d-96-15/fra/1323854808025/1323854941807",
+            "https://inspection.canada.ca/varietes-vegetales/vegetaux-a-caracteres-nouveaux/demandeurs/directive-94-08/documents-sur-la-biologie/lens-culinaris-medikus-lentille-/fra/1330978380871/1330978449837"
+        ]
+        result = calculate_accuracy(responses_url, expected_urls)
+        self.assertEqual(result.position, 2)
+        self.assertEqual(result.total_pages, 4)
+        self.assertEqual(result.score, 0.5)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_bing_search.py b/tests/test_bing_search.py
@@ -0,0 +1,22 @@
+import unittest
+from finesse.bing_search import BingSearch
+from dotenv import load_dotenv
+import os
+class TestBingSearch(unittest.TestCase):
+    def test_search_urls(self):
+        load_dotenv()
+        endpoint = os.getenv("BING_ENDPOINT")
+        subscription_key = os.getenv("BING_SEARCH_KEY")
+        bing_search = BingSearch(endpoint, subscription_key)
+
+        query = "Canadian Food Inspection Agency"
+        num_results = 100
+
+        urls, elapsed_time = bing_search.search_urls(query, num_results)
+
+        self.assertEqual(len(urls), num_results)
+        self.assertTrue(all(url.startswith("http") for url in urls))
+        self.assertIsInstance(elapsed_time, float)
+
+# Run the tests in this module when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()