issue #6: Refactored + Fix markdown issue on links + Add parsing script to the repo + Review csv function + Sort files by number
ibrahim-kabir committed Mar 27, 2024
1 parent 478660d commit 43698da
Showing 10 changed files with 254 additions and 45 deletions.
7 changes: 5 additions & 2 deletions .gitignore
@@ -43,5 +43,8 @@ flask_session/
# Ignore local QnA json files
QnA

# Ignore output of api-test
output
# Ignore output of api-test and from the scripts
output/

# Ignore input of the scripts
input/
57 changes: 52 additions & 5 deletions finesse/FINESSE_USAGE.md
@@ -1,8 +1,8 @@
# How to use the Finesse Locust script

This tool simplifies the process of comparing different search engines and
assessing their accuracy. It's designed to be straightforward, making it easy
to understand and use.
assessing their accuracy. It's designed to be straightforward, making it easy to
understand and use.

## How it Works

@@ -16,8 +16,8 @@ to understand and use.
- `static`: Static search engine
- `llamaindex`: LlamaIndex search engine
- `--path [directory path]`: Point to the directory with files structured
- `--host [API URL]`: Point to the finesse-backend URL
with JSON files with the following properties:
- `--host [API URL]`: Point to the finesse-backend URL with JSON files with
the following properties:
- `score`: The score of the page.
- `crawl_id`: The unique identifier associated with the crawl table.
- `chunk_id`: The unique identifier of the chunk.
@@ -43,7 +43,8 @@ to understand and use.
- **Round trip time**
- Measure round trip time of each request
- **Summary statistical value**
- Measure the average, median, standard deviation, minimum and maximal accuracy scores and round trip time
- Measure the average, median, standard deviation, minimum and maximal
accuracy scores and round trip time
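
As a rough, illustrative sketch (not the project's actual code), these summary
statistics can be computed with Python's standard `statistics` module:

```python
# Rough sketch of the summary statistics described above; the sample
# round-trip times are made-up values for illustration only.
import statistics

round_trip_times_ms = [350, 420, 390, 510, 280]

summary = {
    "Mean": statistics.mean(round_trip_times_ms),
    "Median": statistics.median(round_trip_times_ms),
    "Standard Deviation": statistics.stdev(round_trip_times_ms),
    "Maximum": max(round_trip_times_ms),
    "Minimum": min(round_trip_times_ms),
}
print(summary)
```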

## Diagram

@@ -100,3 +101,49 @@ Accuracy statistical summary:

This example shows the CLI output of the tool, analyzing search results from
Azure Search and providing an accuracy score for Finesse.

## Scripts

### XLSX Converter to JSON 📄

This script converts data from an Excel file (.xlsx) into JSON format. It is
used for questions created by non-developers, since an Excel file is more
readable than a JSON file.

### Usage

1. **Input Excel File**: Place the Excel file containing the data in the
specified input folder (`--input-folder`). By default, the input folder is
set to `'finesse/scripts/input/'`.

2. **Output Folder**: Specify the folder where the resulting JSON files will be
saved using the `--output-folder` argument. By default, the output folder is
set to `'finesse/scripts/output/'`.

3. **Input File Name**: Provide the name of the input Excel file using the
   `--file-name` argument.

4. **Worksheet Name**: Specify the name of the worksheet containing the data
using the `--sheet-name` argument. By default, it is set to `'To fill'`.

### Example Command

```bash
python finesse/scripts/xlsx_converter_json.py --input-folder finesse/scripts/input/ --output-folder finesse/scripts/output/ --file-name Finesse_questions_for_testing.xlsx --sheet-name "To fill"
```

Replace `Finesse_questions_for_testing.xlsx` with the actual name of your input
Excel file and `"To fill"` with the name of the worksheet containing the data.

### Output

The script generates an individual JSON file for each row of data and saves it
in the specified output folder. Each JSON file contains the following fields:

- `question`: The question extracted from the Excel file.
- `answer`: The answer extracted from the Excel file.
- `title`: The title(s) extracted from specified columns in the Excel file.
- `url`: The URL(s) extracted from specified columns in the Excel file.

Upon completion, the script prints "Conversion terminée !" (Conversion
completed!) to indicate that the conversion process is finished.
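
The converter itself lives at `finesse/scripts/xlsx_converter_json.py`. As a
hedged sketch of the approach described above (the column names, the output
file naming, and the use of pandas are illustrative assumptions, not the real
script), it might look roughly like this:

```python
# Hedged sketch of an XLSX-to-JSON converter along the lines described above.
# The column names ("Question", "Answer", "Title", "URL"), the output file
# naming, and the use of pandas are assumptions for illustration; the real
# script in finesse/scripts/xlsx_converter_json.py may differ.
import argparse
import json
import os

import pandas as pd


def convert(input_folder: str, output_folder: str, file_name: str, sheet_name: str) -> None:
    df = pd.read_excel(os.path.join(input_folder, file_name), sheet_name=sheet_name)
    os.makedirs(output_folder, exist_ok=True)
    for idx, row in df.iterrows():
        data = {
            "question": row.get("Question"),
            "answer": row.get("Answer"),
            "title": row.get("Title"),
            "url": row.get("URL"),
        }
        # One JSON file per worksheet row
        output_path = os.path.join(output_folder, f"question_{idx + 1}.json")
        with open(output_path, "w", encoding="utf-8") as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)
    print("Conversion terminée !")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert an Excel worksheet into per-row JSON files.")
    parser.add_argument("--input-folder", default="finesse/scripts/input/")
    parser.add_argument("--output-folder", default="finesse/scripts/output/")
    parser.add_argument("--file-name", required=True)
    parser.add_argument("--sheet-name", default="To fill")
    args = parser.parse_args()
    convert(args.input_folder, args.output_folder, args.file_name, args.sheet_name)
```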
110 changes: 88 additions & 22 deletions finesse/accuracy_functions.py
@@ -10,17 +10,37 @@
OUTPUT_FOLDER = "./finesse/output"
AccuracyResult = namedtuple("AccuracyResult", ["position", "total_pages", "score"])

def calculate_accuracy(responses_url: list[str], expected_url: str) -> AccuracyResult:
def calculate_accuracy(responses_url: list[str], expected_url: list | str) -> AccuracyResult:
"""
Calculates the accuracy of the responses by comparing the URLs of the responses with the expected URL.
Args:
responses_url (list[str]): A list of URLs representing the responses.
expected_url (list[str] | str): The expected URL or a list of expected URLs.
Returns:
AccuracyResult: An object containing the position, total pages, and score of the accuracy calculation.
"""
position: int = 0
total_pages: int = len(responses_url)
score: float = 0.0
expected_number = int(re.findall(r'/(\d+)/', expected_url)[0])
expected_number = []

PATTERN = r'/(\d+)/'
if isinstance(expected_url, list):
for url in expected_url:
if url.startswith("https://inspection.canada.ca"):
number = int(re.findall(PATTERN, url)[0])
expected_number.append(number)
elif isinstance(expected_url, str) and expected_url.startswith("https://inspection.canada.ca"):
number = int(re.findall(PATTERN, expected_url)[0])
expected_number.append(number)

for idx, response_url in enumerate(responses_url):
if response_url.startswith("https://inspection.canada.ca"):
try:
response_number = int(re.findall(r'/(\d+)/', response_url)[0])
if response_number == expected_number:
response_number = int(re.findall(PATTERN, response_url)[0])
if response_number in expected_number:
position = idx
score = 1 - (position / total_pages)
score= round(score, 2)
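
For context, a hedged usage sketch of the updated function follows; the URLs
and the import path are illustrative assumptions rather than code from the
repository:

```python
# Hedged usage sketch of the updated calculate_accuracy; the URLs and the
# import path are illustrative assumptions, not code from the repository.
from finesse.accuracy_functions import calculate_accuracy

responses = [
    "https://inspection.canada.ca/some-topic/eng/1111111111111/2222222222222",
    "https://inspection.canada.ca/other-topic/eng/3333333333333/4444444444444",
]
# expected_url may now be a list of acceptable URLs instead of a single string
expected = ["https://inspection.canada.ca/other-topic/eng/3333333333333/4444444444444"]

result = calculate_accuracy(responses, expected)
# The match is at index 1 of 2 responses, so the score is 1 - (1 / 2) = 0.5
print(result.position, result.total_pages, result.score)  # 1 2 0.5
```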
@@ -42,7 +62,15 @@ def save_to_markdown(test_data: dict, engine: str):
md_file.write("| 📄 File | 💬 Question| 🔎 Finesse Accuracy Score | 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |⌛ Finesse Time | ⌛ Bing Time | ⌛ Filtered Bing Time |\n")
md_file.write("|---|---|---|---|---|---|---|---|\n")
for key, value in test_data.items():
md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {int(value.get('accuracy')*100)}% | {int(value.get('bing_accuracy')*100)}% |{int(value.get('bing_filtered_accuracy')*100)}% |{int(value.get('time'))}ms | {int(value.get('bing_time'))}ms | {int(value.get('bing_filtered_time'))}ms |\n")
question = ""
if isinstance(value.get("expected_page").get("url"), list):
question = f"{value.get('question')} "
for index, url in enumerate(value.get("expected_page").get("url")):
question += f"\| [Link{index+1}]({url}) "
question += "\|"
else:
question = f"[{value.get('question')}]({value.get('expected_page').get('url')})"
md_file.write(f"| {key} | {question} | {int(value.get('accuracy')*100)}% | {int(value.get('bing_accuracy')*100)}% |{int(value.get('bing_filtered_accuracy')*100)}% |{int(value.get('time'))}ms | {int(value.get('bing_time'))}ms | {int(value.get('bing_filtered_time'))}ms |\n")
md_file.write("\n")
md_file.write(f"Tested on {len(test_data)} files.\n\n")

@@ -64,6 +92,15 @@ def save_to_markdown(test_data: dict, engine: str):
md_file.write(f"| Top (100%)| {finesse_top} | {bing_top} |{bing_filtered_top} |\n")

def count_null_top_scores(accuracy_scores: dict):
"""
Counts the number of null scores and top scores in the given accuracy_scores dictionary.
Args:
accuracy_scores (dict): A dictionary containing accuracy scores.
Returns:
tuple: A tuple containing the count of null scores and top scores, respectively.
"""
null_scores = len([score for score in accuracy_scores.values() if score == 0])
top_scores = len([score for score in accuracy_scores.values() if score == 1])

@@ -77,25 +114,52 @@ def save_to_csv(test_data: dict, engine: str):
output_file = os.path.join(OUTPUT_FOLDER, file_name)
with open(output_file, "w", newline="") as csv_file:
writer = csv.writer(csv_file)
writer.writerow(["File", "Question", "Accuracy Score", "Time"])
writer.writerow(["File", "Question", "Finesse Accuracy Score", "Bing Accuracy Score", "Filtered Bing Accuracy Score", "Finesse Time", "Bing Time", "Filtered Bing Time"])
for key, value in test_data.items():
question = ""
if isinstance(value.get("expected_page").get("url"), list):
question = f"{value.get('question')} "
for index, url in enumerate(value.get("expected_page").get("url")):
question += f"[{index+1}]({url}) "
else:
question = f"[{value.get('question')}]({value.get('expected_page').get('url')})"
writer.writerow([
key,
value.get("question"),
f"{value.get('accuracy')}",
f"{int(value.get('time'))}"
question,
f"{int(value.get('accuracy')*100)}%",
f"{int(value.get('bing_accuracy')*100)}%",
f"{int(value.get('bing_filtered_accuracy')*100)}%",
f"{int(value.get('time'))}ms",
f"{int(value.get('bing_time'))}ms",
f"{int(value.get('bing_filtered_time'))}ms"
])
writer.writerow([])

time_stats, accuracy_stats, bing_stats = calculate_statistical_summary(test_data)
writer.writerow(["Statistic", "Time", "Accuracy Score"])
writer.writerow(["Mean", f"{int(time_stats.get('Mean'))}", f"{int(accuracy_stats.get('Mean'))}"])
writer.writerow(["Median", f"{int(time_stats.get('Median'))}", f"{int(accuracy_stats.get('Median'))}"])
writer.writerow(["Standard Deviation", f"{int(time_stats.get('Standard Deviation'))}", f"{int(accuracy_stats.get('Standard Deviation'))}"])
writer.writerow(["Maximum", f"{int(time_stats.get('Maximum'))}", f"{int(accuracy_stats.get('Maximum'))}"])
writer.writerow(["Minimum", f"{int(time_stats.get('Minimum'))}", f"{int(accuracy_stats.get('Minimum'))}"])
time_stats, accuracy_stats, bing_accuracy_stats, bing_time_stats, bing_filtered_accuracy_stats, bing_filtered_time_stats = calculate_statistical_summary(test_data)
writer.writerow(["Statistic", "Finesse Accuracy Score", "Bing Accuracy Score", "Filtered Bing Accuracy Score", "Finesse Time", "Bing Time", "Filtered Bing Time"])
writer.writerow(["Mean", f"{accuracy_stats.get('Mean')}%", f"{bing_accuracy_stats.get('Mean')}%", f"{bing_filtered_accuracy_stats.get('Mean')}%", f"{time_stats.get('Mean')}ms", f"{bing_time_stats.get('Mean')}ms", f"{bing_filtered_time_stats.get('Mean')}ms"])
writer.writerow(["Median", f"{accuracy_stats.get('Median')}%", f"{bing_accuracy_stats.get('Median')}%", f"{bing_filtered_accuracy_stats.get('Median')}%", f"{time_stats.get('Median')}ms", f"{bing_time_stats.get('Median')}ms", f"{bing_filtered_time_stats.get('Median')}ms"])
writer.writerow(["Standard Deviation", f"{accuracy_stats.get('Standard Deviation')}%", f"{bing_accuracy_stats.get('Standard Deviation')}%", f"{bing_filtered_accuracy_stats.get('Standard Deviation')}%", f"{time_stats.get('Standard Deviation')}ms", f"{bing_time_stats.get('Standard Deviation')}ms", f"{bing_filtered_time_stats.get('Standard Deviation')}ms"])
writer.writerow(["Maximum", f"{accuracy_stats.get('Maximum')}%", f"{bing_accuracy_stats.get('Maximum')}%", f"{bing_filtered_accuracy_stats.get('Maximum')}%", f"{time_stats.get('Maximum')}ms", f"{bing_time_stats.get('Maximum')}ms", f"{bing_filtered_time_stats.get('Maximum')}ms"])
writer.writerow(["Minimum", f"{accuracy_stats.get('Minimum')}%", f"{bing_accuracy_stats.get('Minimum')}%", f"{bing_filtered_accuracy_stats.get('Minimum')}%", f"{time_stats.get('Minimum')}ms", f"{bing_time_stats.get('Minimum')}ms", f"{bing_filtered_time_stats.get('Minimum')}ms"])

def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict, dict, dict, dict, dict]:
"""
Calculate the statistical summary of the test data.
Args:
test_data (dict): A dictionary containing the test data.
Returns:
tuple[dict, dict, dict, dict, dict, dict]: A tuple containing the statistical summary for different metrics.
The tuple contains the following dictionaries:
- time_stats: Statistical summary for the 'time' metric.
- accuracy_stats: Statistical summary for the 'accuracy' metric.
- bing_accuracy_stats: Statistical summary for the 'bing_accuracy' metric.
- bing_times_stats: Statistical summary for the 'bing_times' metric.
- bing_filtered_accuracy_stats: Statistical summary for the 'bing_filtered_accuracy' metric.
- bing_filtered_times_stats: Statistical summary for the 'bing_filtered_times' metric.
"""
def calculate_stats(data: list) -> dict:
stats = {
"Mean": statistics.mean(data),
@@ -142,30 +206,32 @@ def update_dict_bing_data(test_data: dict):
Args:
test_data (dict): The dictionary containing the test data.
"""
copy_data = test_data.copy()
load_dotenv()
endpoint = os.getenv("BING_ENDPOINT")
subscription_key = os.getenv("BING_SEARCH_KEY")
search_engine = BingSearch(endpoint, subscription_key)
count = 1
for key, value in test_data.items():
for key, value in copy_data.items():
question = value.get("question")
expected_url = value.get("expected_page").get("url")
top = value.get("top")
response_url, time_elapsed = search_engine.search_urls(question, top)
accuracy_result = calculate_accuracy(response_url, expected_url)
value["bing_accuracy"] = accuracy_result.score
value["bing_time"] = time_elapsed
test_data[key]["bing_accuracy"] = accuracy_result.score
test_data[key]["bing_time"] = time_elapsed
print(f"{count} files are done")
count += 1

print("Second Bing Search Test")
count = 1
for key, value in test_data.items():
for key, value in copy_data.items():
question = f"site:inspection.canada.ca {value.get('question')}"
expected_url = value.get("expected_page").get("url")
top = value.get("top")
response_url, time_elapsed = search_engine.search_urls(question, top)
accuracy_result = calculate_accuracy(response_url, expected_url)
value["bing_filtered_accuracy"] = accuracy_result.score
value["bing_filtered_time"] = time_elapsed
test_data[key]["bing_filtered_accuracy"] = accuracy_result.score
test_data[key]["bing_filtered_time"] = time_elapsed
print(f"{count} files are done")
count += 1
5 changes: 4 additions & 1 deletion finesse/bing_search.py
@@ -34,6 +34,9 @@ def search_urls(self, query: str, num_results: int = 100) -> tuple[list[str], fl
elapsed_time.append(time.time() - start_time)
if hasattr(web_data, 'web_pages') and web_data.web_pages is not None:
urls.extend([item.url for item in web_data.web_pages.value])
offset += len([item.url for item in web_data.web_pages.value])
try:
offset += len([item.url for item in web_data.web_pages.value])
except AttributeError:
break
urls = urls[:num_results]
return urls, statistics.mean(elapsed_time) * 1000
23 changes: 14 additions & 9 deletions finesse/finesse_test.py
@@ -6,6 +6,7 @@
from host import is_host_up

global_test_data = dict()
settings = dict()
class NoTestDataError(Exception):
"""Raised when all requests have failed and there is no test data"""

@@ -60,7 +61,6 @@ def search_accuracy(self):
response_url.append(page.get("url"))
accuracy_result = calculate_accuracy(response_url, expected_url)
time_taken = round(response.elapsed.total_seconds()*1000,3)

expected_page = json_data.copy()
del expected_page['question']
del expected_page['answer']
@@ -72,7 +72,7 @@
"total_pages": accuracy_result.total_pages,
"accuracy": accuracy_result.score,
"time": time_taken,
"top": self.top
"top": self.top,
}

def on_start(self):
@@ -89,14 +89,19 @@ def __init__(self, *args, **kwargs):
self.format = self.environment.parsed_options.format
self.once = self.environment.parsed_options.once
self.top = self.environment.parsed_options.top
settings["engine"] = self.engine
settings["format"] = self.format
settings["once"] = self.once
settings["top"] = self.top
settings["path"] = self.path


@events.quitting.add_listener
def quitting(environment, **_kwargs):
@events.quit.add_listener
def quit(**_kwargs):
print("Search accuracy test completed")
print("Starting bing search test")

update_dict_bing_data(global_test_data)
if environment.parsed_options.format == "md":
save_to_markdown(global_test_data, environment.parsed_options.engine)
elif environment.parsed_options.format == "csv":
save_to_csv(global_test_data, environment.parsed_options.engine)
if settings.get("format") == "md":
save_to_markdown(global_test_data, "azure")
elif settings.get("format") == "csv":
save_to_csv(global_test_data, settings.get("engine"))
4 changes: 2 additions & 2 deletions finesse/jsonreader.py
@@ -1,13 +1,13 @@
import json
from typing import Iterator
import os

from natsort import natsorted
class JSONReader(Iterator):
"Read test data from JSON files using an iterator"

def __init__(self, directory):
self.directory = directory
self.file_list = sorted([f for f in os.listdir(directory) if f.endswith('.json')])
self.file_list = natsorted([f for f in os.listdir(directory) if f.endswith('.json')])
if not self.file_list:
raise FileNotFoundError(f"No JSON files found in the directory '{directory}'")
self.current_file_index = 0
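
The switch from `sorted` to `natsorted` implements the "Sort files by number"
part of this commit. A quick illustration of the difference, using hypothetical
file names:

```python
# Why natsorted instead of sorted: lexicographic sorting puts "10" before "2".
# File names below are hypothetical.
from natsort import natsorted

files = ["question_10.json", "question_2.json", "question_1.json"]

print(sorted(files))     # ['question_1.json', 'question_10.json', 'question_2.json']
print(natsorted(files))  # ['question_1.json', 'question_2.json', 'question_10.json']
```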
