From cdc3f35a4ab58dc394ce2e398468267f595d8a8a Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Fri, 22 Mar 2024 09:39:33 -0400 Subject: [PATCH 01/15] issue #6: abenassi Google-Search-API --- finesse/google_search.py | 8 ++++++++ requirements.txt | 1 + tests/test_google_search.py | 13 +++++++++++++ 3 files changed, 22 insertions(+) create mode 100644 finesse/google_search.py create mode 100644 tests/test_google_search.py diff --git a/finesse/google_search.py b/finesse/google_search.py new file mode 100644 index 0000000..6ed6322 --- /dev/null +++ b/finesse/google_search.py @@ -0,0 +1,8 @@ +from googleapi import google + +def get_google_search_urls(query: str, num_results: int = 100) -> list[str]: + links = [] + search_results = google.search(query, num_results) + print(search_results[0].google_link ) + links.append(search_results[0].google_link ) + return links diff --git a/requirements.txt b/requirements.txt index ff525a0..cbdda38 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ locust regex +git+https://github.com/abenassi/Google-Search-API diff --git a/tests/test_google_search.py b/tests/test_google_search.py new file mode 100644 index 0000000..901d99d --- /dev/null +++ b/tests/test_google_search.py @@ -0,0 +1,13 @@ +import unittest +from finesse.google_search import get_google_search_urls + +class TestGoogleSearch(unittest.TestCase): + def test_get_google_search_urls(self): + query = "Canada Food Inspection Agency" + num_results = 10 + urls = get_google_search_urls(query, num_results) + self.assertEqual(len(urls), num_results) + self.assertTrue(all(url.startswith("http") for url in urls)) + +if __name__ == "__main__": + unittest.main() From 699e5ca939224b176d25034fc95853b5703d65a4 Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Fri, 22 Mar 2024 10:04:03 -0400 Subject: [PATCH 02/15] issue #6: Nv7-GitHub googlesearch --- finesse/google_search.py | 7 +++---- requirements.txt | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git 
a/finesse/google_search.py b/finesse/google_search.py index 6ed6322..eb9f5ed 100644 --- a/finesse/google_search.py +++ b/finesse/google_search.py @@ -1,8 +1,7 @@ -from googleapi import google +from googlesearch import search def get_google_search_urls(query: str, num_results: int = 100) -> list[str]: links = [] - search_results = google.search(query, num_results) - print(search_results[0].google_link ) - links.append(search_results[0].google_link ) + for url in search(query, num_results, sleep_interval=1): + links.append(url) return links diff --git a/requirements.txt b/requirements.txt index cbdda38..62475fc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ locust regex -git+https://github.com/abenassi/Google-Search-API +googlesearch-python From 7e5557bbb424c069b2e3b1e15230c71112f46a3f Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Fri, 22 Mar 2024 11:39:55 -0400 Subject: [PATCH 03/15] issue #6: bing search by scrapping --- finesse/bing_search.py | 44 +++++++++++++++++++ finesse/google_search.py | 7 --- ...t_google_search.py => test_bing_search.py} | 6 +-- 3 files changed, 47 insertions(+), 10 deletions(-) create mode 100644 finesse/bing_search.py delete mode 100644 finesse/google_search.py rename tests/{test_google_search.py => test_bing_search.py} (65%) diff --git a/finesse/bing_search.py b/finesse/bing_search.py new file mode 100644 index 0000000..1876709 --- /dev/null +++ b/finesse/bing_search.py @@ -0,0 +1,44 @@ +import requests +import random + +def get_bing_search_urls(query: str, num_results: int = 100) -> list[str]: + urls = [] + headers = {'User-Agent': get_useragent()} + cookies = get_cookies() + url = f"https://www.google.com/search?q={query}&num={num_results}" + res = requests.get(url, headers=headers, cookies=cookies) + if res.status_code == 200: + urls.append(res.url) + else: + raise requests.exceptions.HTTPError(res.status_code, res.url) + return urls + +def get_cookies(): + """ + Generates cookies to avoid getting blocked 
during search. + Returns: + dict: A dictionary containing the cookies. + + Raises: + requests.exceptions.HTTPError: If the response status code is not 200. + """ + trend_url = 'https://youtube.com' + response = requests.get(trend_url) + if response.status_code == 200: + return response.cookies.get_dict() + else: + raise requests.exceptions.HTTPError(f'Cookies raised {response.status_code}') + + + +def get_useragent(): + USERAGENT_LIST = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0' + ] + return random.choice(USERAGENT_LIST) diff --git a/finesse/google_search.py b/finesse/google_search.py deleted file mode 100644 index eb9f5ed..0000000 --- a/finesse/google_search.py +++ /dev/null @@ -1,7 +0,0 @@ -from googlesearch import search - -def get_google_search_urls(query: str, num_results: int = 100) -> list[str]: - links = [] - for url in search(query, num_results, sleep_interval=1): - links.append(url) - return links diff --git a/tests/test_google_search.py b/tests/test_bing_search.py similarity index 65% rename from tests/test_google_search.py rename to tests/test_bing_search.py index 901d99d..6342b8a 100644 --- a/tests/test_google_search.py +++ b/tests/test_bing_search.py @@ -1,11 +1,11 @@ import unittest -from finesse.google_search import 
get_google_search_urls +from finesse.bing_search import get_bing_search_urls -class TestGoogleSearch(unittest.TestCase): +class TestBingSearch(unittest.TestCase): def test_get_google_search_urls(self): query = "Canada Food Inspection Agency" num_results = 10 - urls = get_google_search_urls(query, num_results) + urls = get_bing_search_urls(query, num_results) self.assertEqual(len(urls), num_results) self.assertTrue(all(url.startswith("http") for url in urls)) From b9275f2055bc9dc24784155454eca4f27edeb959 Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Fri, 22 Mar 2024 12:45:03 -0400 Subject: [PATCH 04/15] issue #6: google_search completed --- finesse/accuracy_functions.py | 13 +++--- finesse/bing_search.py | 44 ------------------- finesse/finesse_test.py | 4 ++ finesse/google_search.py | 18 ++++++++ ...t_bing_search.py => test_google_search.py} | 6 +-- 5 files changed, 32 insertions(+), 53 deletions(-) delete mode 100644 finesse/bing_search.py create mode 100644 finesse/google_search.py rename tests/{test_bing_search.py => test_google_search.py} (65%) diff --git a/finesse/accuracy_functions.py b/finesse/accuracy_functions.py index f55542f..84de5da 100644 --- a/finesse/accuracy_functions.py +++ b/finesse/accuracy_functions.py @@ -15,12 +15,13 @@ def calculate_accuracy(responses_url: list[str], expected_url: str) -> AccuracyR expected_number = int(re.findall(r'/(\d+)/', expected_url)[0]) for idx, response_url in enumerate(responses_url): - response_number = int(re.findall(r'/(\d+)/', response_url)[0]) - if response_number == expected_number: - position = idx - score = 1 - (position / total_pages) - score= round(score, 2) - break + if response_url.startswith("https://inspection.canada.ca"): + response_number = int(re.findall(r'/(\d+)/', response_url)[0]) + if response_number == expected_number: + position = idx + score = 1 - (position / total_pages) + score= round(score, 2) + break return AccuracyResult(position, total_pages, score) diff --git 
a/finesse/bing_search.py b/finesse/bing_search.py deleted file mode 100644 index 1876709..0000000 --- a/finesse/bing_search.py +++ /dev/null @@ -1,44 +0,0 @@ -import requests -import random - -def get_bing_search_urls(query: str, num_results: int = 100) -> list[str]: - urls = [] - headers = {'User-Agent': get_useragent()} - cookies = get_cookies() - url = f"https://www.google.com/search?q={query}&num={num_results}" - res = requests.get(url, headers=headers, cookies=cookies) - if res.status_code == 200: - urls.append(res.url) - else: - raise requests.exceptions.HTTPError(res.status_code, res.url) - return urls - -def get_cookies(): - """ - Generates cookies to avoid getting blocked during search. - Returns: - dict: A dictionary containing the cookies. - - Raises: - requests.exceptions.HTTPError: If the response status code is not 200. - """ - trend_url = 'https://youtube.com' - response = requests.get(trend_url) - if response.status_code == 200: - return response.cookies.get_dict() - else: - raise requests.exceptions.HTTPError(f'Cookies raised {response.status_code}') - - - -def get_useragent(): - USERAGENT_LIST = [ - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0' - ] - return random.choice(USERAGENT_LIST) diff --git a/finesse/finesse_test.py 
b/finesse/finesse_test.py index 46f200d..aab472e 100644 --- a/finesse/finesse_test.py +++ b/finesse/finesse_test.py @@ -4,6 +4,7 @@ import json from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy from host import is_host_up +from google_search import get_google_search_urls class NoTestDataError(Exception): """Raised when all requests have failed and there is no test data""" @@ -54,6 +55,8 @@ def search_accuracy(self): for page in response_pages: response_url.append(page.get("url")) accuracy_result = calculate_accuracy(response_url, expected_url) + google_response_url = get_google_search_urls(question) + google_accuracy_result = calculate_accuracy(google_response_url, expected_url) time_taken = round(response.elapsed.microseconds/1000,3) expected_page = json_data.copy() @@ -66,6 +69,7 @@ def search_accuracy(self): "position": accuracy_result.position, "total_pages": accuracy_result.total_pages, "accuracy": accuracy_result.score, + "google_accuracy": google_accuracy_result, "time": time_taken, } diff --git a/finesse/google_search.py b/finesse/google_search.py new file mode 100644 index 0000000..3abd404 --- /dev/null +++ b/finesse/google_search.py @@ -0,0 +1,18 @@ +from googlesearch import search + +def get_google_search_urls(query: str, num_results: int = 100) -> list[str]: + """ + Retrieves a list of Google search result URLs for the given query. + + Args: + query (str): The search query. + num_results (int, optional): The number of search results to retrieve. Defaults to 100. + + Returns: + list[str]: A list of URLs representing the search results. 
+ """ + num_results -= 2 # 2 extra urls are added by googlesearch library + links = [] + for url in search(query, num_results, sleep_interval=1): + links.append(url) + return links diff --git a/tests/test_bing_search.py b/tests/test_google_search.py similarity index 65% rename from tests/test_bing_search.py rename to tests/test_google_search.py index 6342b8a..901d99d 100644 --- a/tests/test_bing_search.py +++ b/tests/test_google_search.py @@ -1,11 +1,11 @@ import unittest -from finesse.bing_search import get_bing_search_urls +from finesse.google_search import get_google_search_urls -class TestBingSearch(unittest.TestCase): +class TestGoogleSearch(unittest.TestCase): def test_get_google_search_urls(self): query = "Canada Food Inspection Agency" num_results = 10 - urls = get_bing_search_urls(query, num_results) + urls = get_google_search_urls(query, num_results) self.assertEqual(len(urls), num_results) self.assertTrue(all(url.startswith("http") for url in urls)) From 678518b031934a3622b324f0e9fb7393b2aa9893 Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Thu, 21 Mar 2024 17:32:20 -0400 Subject: [PATCH 05/15] issue #7: Removed punction mark on md files, added total number of 0, better rounding, sorted json files --- finesse/FINESSE_USAGE.md | 2 +- finesse/accuracy_functions.py | 33 +++++++++++++++++---------------- finesse/finesse_test.py | 4 ++++ finesse/jsonreader.py | 2 +- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/finesse/FINESSE_USAGE.md b/finesse/FINESSE_USAGE.md index fb58071..ad0b242 100644 --- a/finesse/FINESSE_USAGE.md +++ b/finesse/FINESSE_USAGE.md @@ -72,7 +72,7 @@ sequenceDiagram ## Example Command ```cmd -$locust -f finesse/finesse_test.py --engine azure --path finesse/QnA/good_question --host https://finesse-guidance.ninebasetwo.xyz/api --once +$locust -f finesse/finesse_test.py --engine azure --path finesse/QnA/sorted-2024-02-22/ --host https://finesse.inspection.alpha.canada.ca/api --once Searching with Azure Search... 
File: qna_2023-12-08_36.json diff --git a/finesse/accuracy_functions.py b/finesse/accuracy_functions.py index 84de5da..e0bcccf 100644 --- a/finesse/accuracy_functions.py +++ b/finesse/accuracy_functions.py @@ -37,7 +37,7 @@ def save_to_markdown(test_data: dict, engine: str): md_file.write("| πŸ“„ File | πŸ’¬ Question | πŸ“ Accuracy Score | βŒ› Time |\n") md_file.write("|--------------------|-------------------------------------------------------------------------------------------------------------------------|----------------|----------|\n") for key, value in test_data.items(): - md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')})' | {value.get('accuracy')*100:.1f}% | {value.get('time')}ms |\n") + md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100:.0f}% | {int(value.get('time'))}ms |\n") md_file.write("\n") md_file.write(f"Tested on {len(test_data)} files.\n\n") @@ -45,11 +45,12 @@ def save_to_markdown(test_data: dict, engine: str): md_file.write("## Statistical summary\n\n") md_file.write("| Statistic | Time | Accuracy score|\n") md_file.write("|-----------------------|------------|---------|\n") - md_file.write(f"|Mean| {time_stats.get('Mean')}ms | {accuracy_stats.get('Mean')*100}% |\n") - md_file.write(f"|Median| {time_stats.get('Median')}ms | {accuracy_stats.get('Median')*100}% |\n") - md_file.write(f"|Standard Deviation| {time_stats.get('Standard Deviation')}ms | {accuracy_stats.get('Standard Deviation')*100}% |\n") - md_file.write(f"|Maximum| {time_stats.get('Maximum')}ms | {accuracy_stats.get('Maximum')*100}% |\n") - md_file.write(f"|Minimum| {time_stats.get('Minimum')}ms | {accuracy_stats.get('Minimum')*100}% |\n") + md_file.write(f"|Mean| {int(time_stats.get('Mean'))}ms | {int(accuracy_stats.get('Mean')*100)}% |\n") + md_file.write(f"|Median| {int(time_stats.get('Median'))}ms | {int(accuracy_stats.get('Median')*100)}% |\n") + 
md_file.write(f"|Standard Deviation| {int(time_stats.get('Standard Deviation'))}ms | {int(accuracy_stats.get('Standard Deviation')*100)}% |\n") + md_file.write(f"|Maximum| {int(time_stats.get('Maximum'))}ms | {int(accuracy_stats.get('Maximum')*100)}% |\n") + md_file.write(f"|Minimum| {int(time_stats.get('Minimum'))}ms | {int(accuracy_stats.get('Minimum')*100)}% |\n") + md_file.write(f"\nThere are a total of {len([result.get('accuracy') for result in test_data.values() if result.get('accuracy') == 0])} null scores\n") def save_to_csv(test_data: dict, engine: str): if not os.path.exists(OUTPUT_FOLDER): @@ -65,35 +66,35 @@ def save_to_csv(test_data: dict, engine: str): key, value.get("question"), f"{value.get('accuracy')}", - f"{value.get('time')}" + f"{int(value.get('time'))}" ]) writer.writerow([]) time_stats, accuracy_stats = calculate_statistical_summary(test_data) writer.writerow(["Statistic", "Time", "Accuracy Score"]) - writer.writerow(["Mean", f"{time_stats.get('Mean')}", f"{accuracy_stats.get('Mean')}"]) - writer.writerow(["Median", f"{time_stats.get('Median')}", f"{accuracy_stats.get('Median')}"]) - writer.writerow(["Standard Deviation", f"{time_stats.get('Standard Deviation')}", f"{accuracy_stats.get('Standard Deviation')}"]) - writer.writerow(["Maximum", f"{time_stats.get('Maximum')}", f"{accuracy_stats.get('Maximum')}"]) - writer.writerow(["Minimum", f"{time_stats.get('Minimum')}", f"{accuracy_stats.get('Minimum')}"]) + writer.writerow(["Mean", f"{int(time_stats.get('Mean'))}", f"{int(accuracy_stats.get('Mean'))}"]) + writer.writerow(["Median", f"{int(time_stats.get('Median'))}", f"{int(accuracy_stats.get('Median'))}"]) + writer.writerow(["Standard Deviation", f"{int(time_stats.get('Standard Deviation'))}", f"{int(accuracy_stats.get('Standard Deviation'))}"]) + writer.writerow(["Maximum", f"{int(time_stats.get('Maximum'))}", f"{int(accuracy_stats.get('Maximum'))}"]) + writer.writerow(["Minimum", f"{int(time_stats.get('Minimum'))}", 
f"{int(accuracy_stats.get('Minimum'))}"]) def log_data(test_data: dict): for key, value in test_data.items(): print("File:", key) print("Question:", value.get("question")) print("Expected URL:", value.get("expected_page").get("url")) - print(f'Accuracy Score: {value.get("accuracy")*100}%') - print(f'Time: {value.get("time")}ms') + print(f'Accuracy Score: {int(value.get("accuracy")*100)}%') + print(f'Time: {int(value.get("time"))}ms') print() time_stats, accuracy_stats = calculate_statistical_summary(test_data) print("---") print(f"Tested on {len(test_data)} files.") print("Time statistical summary:", end="\n ") for key,value in time_stats.items(): - print(f"{key}:{value},", end=' ') + print(f"{key}:{int(value)},", end=' ') print("\nAccuracy statistical summary:", end="\n ") for key,value in accuracy_stats.items(): - print(f"{key}:{value*100}%,", end=' ') + print(f"{key}:{int(value*100)}%,", end=' ') print("\n---") diff --git a/finesse/finesse_test.py b/finesse/finesse_test.py index aab472e..2670632 100644 --- a/finesse/finesse_test.py +++ b/finesse/finesse_test.py @@ -30,11 +30,15 @@ class FinesseUser(HttpUser): def search_accuracy(self): try: json_data = next(self.qna_reader) + while json_data.get("skip") == True: + json_data = next(self.qna_reader) except StopIteration: if not self.once: # Reset variables self.on_start() json_data = next(self.qna_reader) + while json_data.get("skip") == True: + json_data = next(self.qna_reader) print("Restarting the running test") else: print("Stopping the running test") diff --git a/finesse/jsonreader.py b/finesse/jsonreader.py index 2c4c23f..8f1a8ef 100644 --- a/finesse/jsonreader.py +++ b/finesse/jsonreader.py @@ -7,7 +7,7 @@ class JSONReader(Iterator): def __init__(self, directory): self.directory = directory - self.file_list = [f for f in os.listdir(directory) if f.endswith('.json')] + self.file_list = sorted([f for f in os.listdir(directory) if f.endswith('.json')]) if not self.file_list: raise FileNotFoundError(f"No JSON 
files found in the directory '{directory}'") self.current_file_index = 0 From a7156cf81013aa5f12b5c2e0a067ab1b068f1166 Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Thu, 21 Mar 2024 17:37:52 -0400 Subject: [PATCH 06/15] issue #7: fix ruff check --- finesse/finesse_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finesse/finesse_test.py b/finesse/finesse_test.py index 2670632..ecfc8ba 100644 --- a/finesse/finesse_test.py +++ b/finesse/finesse_test.py @@ -30,14 +30,14 @@ class FinesseUser(HttpUser): def search_accuracy(self): try: json_data = next(self.qna_reader) - while json_data.get("skip") == True: + while json_data.get("skip") is True: json_data = next(self.qna_reader) except StopIteration: if not self.once: # Reset variables self.on_start() json_data = next(self.qna_reader) - while json_data.get("skip") == True: + while json_data.get("skip") is True: json_data = next(self.qna_reader) print("Restarting the running test") else: From c78d984fbd6beb5a16b9079557ad60b2783a0ced Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Fri, 22 Mar 2024 12:45:03 -0400 Subject: [PATCH 07/15] issue #6: google_search completed --- finesse/accuracy_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finesse/accuracy_functions.py b/finesse/accuracy_functions.py index e0bcccf..ea5217c 100644 --- a/finesse/accuracy_functions.py +++ b/finesse/accuracy_functions.py @@ -37,7 +37,7 @@ def save_to_markdown(test_data: dict, engine: str): md_file.write("| πŸ“„ File | πŸ’¬ Question | πŸ“ Accuracy Score | βŒ› Time |\n") md_file.write("|--------------------|-------------------------------------------------------------------------------------------------------------------------|----------------|----------|\n") for key, value in test_data.items(): - md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100:.0f}% | {int(value.get('time'))}ms |\n") + md_file.write(f"| {key} 
| [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100}% | {int(value.get('time'))}ms |\n") md_file.write("\n") md_file.write(f"Tested on {len(test_data)} files.\n\n") From 3b2dff028e5332060380f1bc12cc9ce3332e45b2 Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Fri, 22 Mar 2024 16:11:23 -0400 Subject: [PATCH 08/15] issue #6: google api incorporation --- .env.template | 2 ++ finesse/accuracy_functions.py | 47 +++++++++++++++++++++++++++-------- finesse/finesse_test.py | 11 ++++---- finesse/google_search.py | 25 ++++++++++++++----- requirements.txt | 3 ++- tests/test_google_search.py | 6 ++--- 6 files changed, 69 insertions(+), 25 deletions(-) create mode 100644 .env.template diff --git a/.env.template b/.env.template new file mode 100644 index 0000000..1fc0389 --- /dev/null +++ b/.env.template @@ -0,0 +1,2 @@ +GOOGLE_API_KEY = +GOOGLE_CSE_ID = diff --git a/finesse/accuracy_functions.py b/finesse/accuracy_functions.py index ea5217c..341422d 100644 --- a/finesse/accuracy_functions.py +++ b/finesse/accuracy_functions.py @@ -4,6 +4,7 @@ import os from collections import namedtuple import regex as re +from finesse.google_search import search_google_urls OUTPUT_FOLDER = "./finesse/output" AccuracyResult = namedtuple("AccuracyResult", ["position", "total_pages", "score"]) @@ -34,22 +35,22 @@ def save_to_markdown(test_data: dict, engine: str): with open(output_file, "w") as md_file: md_file.write(f"# Test on the {engine} search engine: {date_string}\n\n") md_file.write("## Test data table\n\n") - md_file.write("| πŸ“„ File | πŸ’¬ Question | πŸ“ Accuracy Score | βŒ› Time |\n") + md_file.write("| πŸ“„ File | πŸ’¬ Question | πŸ“ Accuracy Score | 🌐 Google Score |βŒ› Time |\n") md_file.write("|--------------------|-------------------------------------------------------------------------------------------------------------------------|----------------|----------|\n") for key, value in test_data.items(): - md_file.write(f"| {key} | 
[{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100}% | {int(value.get('time'))}ms |\n") + md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100}% | {value.get('google_accuracy')*100}% |{int(value.get('time'))}ms |\n") md_file.write("\n") md_file.write(f"Tested on {len(test_data)} files.\n\n") - time_stats, accuracy_stats = calculate_statistical_summary(test_data) + time_stats, accuracy_stats, google_stats = calculate_statistical_summary(test_data) md_file.write("## Statistical summary\n\n") - md_file.write("| Statistic | Time | Accuracy score|\n") + md_file.write("| Statistic | βŒ› Time | πŸ“ Accuracy score| 🌐 Google Score |\n") md_file.write("|-----------------------|------------|---------|\n") - md_file.write(f"|Mean| {int(time_stats.get('Mean'))}ms | {int(accuracy_stats.get('Mean')*100)}% |\n") - md_file.write(f"|Median| {int(time_stats.get('Median'))}ms | {int(accuracy_stats.get('Median')*100)}% |\n") - md_file.write(f"|Standard Deviation| {int(time_stats.get('Standard Deviation'))}ms | {int(accuracy_stats.get('Standard Deviation')*100)}% |\n") - md_file.write(f"|Maximum| {int(time_stats.get('Maximum'))}ms | {int(accuracy_stats.get('Maximum')*100)}% |\n") - md_file.write(f"|Minimum| {int(time_stats.get('Minimum'))}ms | {int(accuracy_stats.get('Minimum')*100)}% |\n") + md_file.write(f"|Mean| {int(time_stats.get('Mean'))}ms | {int(accuracy_stats.get('Mean')*100)}% |{int(google_stats.get('Mean')*100)}% |\n") + md_file.write(f"|Median| {int(time_stats.get('Median'))}ms | {int(accuracy_stats.get('Median')*100)}% | {int(google_stats.get('Median')*100)}% |\n") + md_file.write(f"|Standard Deviation| {int(time_stats.get('Standard Deviation'))}ms | {int(accuracy_stats.get('Standard Deviation')*100)}% | {int(google_stats.get('Standard Deviation')*100)}% |\n") + md_file.write(f"|Maximum| {int(time_stats.get('Maximum'))}ms | 
{int(accuracy_stats.get('Maximum')*100)}% | {int(google_stats.get('Maximum')*100)}% |\n") + md_file.write(f"|Minimum| {int(time_stats.get('Minimum'))}ms | {int(accuracy_stats.get('Minimum')*100)}% | {int(google_stats.get('Minimum')*100)}% |\n") md_file.write(f"\nThere are a total of {len([result.get('accuracy') for result in test_data.values() if result.get('accuracy') == 0])} null scores\n") def save_to_csv(test_data: dict, engine: str): @@ -101,6 +102,7 @@ def log_data(test_data: dict): def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict]: times = [result.get("time") for result in test_data.values()] accuracies = [result.get("accuracy") for result in test_data.values()] + google = [result.get("google_accuracy") for result in test_data.values()] time_stats = { "Mean": round(statistics.mean(times), 3), "Median": round(statistics.median(times), 3), @@ -115,4 +117,29 @@ def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict]: "Maximum": round(max(accuracies), 2), "Minimum": round(min(accuracies), 2), } - return time_stats, accuracy_stats + google_stats= { + "Mean": round(statistics.mean(google), 2), + "Median": round(statistics.median(google), 2), + "Standard Deviation": round(statistics.stdev(google), 2), + "Maximum": round(max(google), 2), + "Minimum": round(min(google), 2), + } + return time_stats, accuracy_stats, google_stats + +def update_dict_google_data(test_data: dict): + """ + Updates the given test_data dictionary with the Google accuracy results. + + Args: + test_data (dict): The dictionary containing the test data. 
+ """ + count = 0 + for key, value in test_data.items(): + question = value.get("question") + expected_url = value.get("expected_page").get("url") + top = value.get("top") + google_response_url = search_google_urls(question, top) + google_accuracy_result = calculate_accuracy(google_response_url, expected_url) + value["google_accuracy"] = google_accuracy_result.score + count += 1 + print(f"{count} file is done") diff --git a/finesse/finesse_test.py b/finesse/finesse_test.py index ecfc8ba..2cacb64 100644 --- a/finesse/finesse_test.py +++ b/finesse/finesse_test.py @@ -2,9 +2,8 @@ from jsonreader import JSONReader import os import json -from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy +from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy, update_dict_google_data from host import is_host_up -from google_search import get_google_search_urls class NoTestDataError(Exception): """Raised when all requests have failed and there is no test data""" @@ -59,8 +58,6 @@ def search_accuracy(self): for page in response_pages: response_url.append(page.get("url")) accuracy_result = calculate_accuracy(response_url, expected_url) - google_response_url = get_google_search_urls(question) - google_accuracy_result = calculate_accuracy(google_response_url, expected_url) time_taken = round(response.elapsed.microseconds/1000,3) expected_page = json_data.copy() @@ -73,8 +70,8 @@ def search_accuracy(self): "position": accuracy_result.position, "total_pages": accuracy_result.total_pages, "accuracy": accuracy_result.score, - "google_accuracy": google_accuracy_result, "time": time_taken, + "top": self.top } def on_start(self): @@ -85,6 +82,10 @@ def on_stop(self): if not self.qna_results: raise NoTestDataError + print("Search accuracy test completed") + print("Starting google search test") + + update_dict_google_data(self.qna_results) log_data(self.qna_results) if self.format == "md": save_to_markdown(self.qna_results, 
self.engine) diff --git a/finesse/google_search.py b/finesse/google_search.py index 3abd404..05271bf 100644 --- a/finesse/google_search.py +++ b/finesse/google_search.py @@ -1,8 +1,15 @@ -from googlesearch import search +from googleapiclient.discovery import build +from dotenv import load_dotenv +import os -def get_google_search_urls(query: str, num_results: int = 100) -> list[str]: +def google_search(search_term, api_key, cse_id, **kwargs): + service = build("customsearch", "v1", developerKey=api_key) + res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute() + return res['items'] + +def search_google_urls(query: str, num_results: int = 100) -> list[str]: """ - Retrieves a list of Google search result URLs for the given query. + Retrieves a list of Google search result URLs for the given query using the Google API. Args: query (str): The search query. @@ -10,9 +17,15 @@ def get_google_search_urls(query: str, num_results: int = 100) -> list[str]: Returns: list[str]: A list of URLs representing the search results. + + Raises: + Exception: If the request limit is exceeded (error 429 Too Many Requests). 
""" - num_results -= 2 # 2 extra urls are added by googlesearch library + load_dotenv() links = [] - for url in search(query, num_results, sleep_interval=1): - links.append(url) + api_key = os.getenv("GOOGLE_API_KEY") + cse_id = os.getenv("GOOGLE_CSE_ID") + results = google_search(query, api_key, cse_id, start=11) + for item in results: + links.append(item['link']) return links diff --git a/requirements.txt b/requirements.txt index 62475fc..8acc2a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ locust regex -googlesearch-python +google-api-python-client +python-dotenv diff --git a/tests/test_google_search.py b/tests/test_google_search.py index 901d99d..0285042 100644 --- a/tests/test_google_search.py +++ b/tests/test_google_search.py @@ -1,11 +1,11 @@ import unittest -from finesse.google_search import get_google_search_urls +from finesse.google_search import search_google_urls class TestGoogleSearch(unittest.TestCase): def test_get_google_search_urls(self): query = "Canada Food Inspection Agency" - num_results = 10 - urls = get_google_search_urls(query, num_results) + num_results = 100 + urls = search_google_urls(query, num_results) self.assertEqual(len(urls), num_results) self.assertTrue(all(url.startswith("http") for url in urls)) From 508151b24187517c1f924bbf89df103cec4a43fa Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Mon, 25 Mar 2024 12:28:09 -0400 Subject: [PATCH 09/15] issue #6: Bing Search works --- .env.template | 4 +-- finesse/accuracy_functions.py | 4 +-- finesse/bing_search.py | 28 +++++++++++++++++ finesse/google_search.py | 31 ------------------- requirements.txt | 1 - ...t_google_search.py => test_bing_search.py} | 8 ++--- 6 files changed, 36 insertions(+), 40 deletions(-) create mode 100644 finesse/bing_search.py delete mode 100644 finesse/google_search.py rename tests/{test_google_search.py => test_bing_search.py} (57%) diff --git a/.env.template b/.env.template index 1fc0389..3c3d4c2 100644 --- a/.env.template +++ 
b/.env.template @@ -1,2 +1,2 @@ -GOOGLE_API_KEY = -GOOGLE_CSE_ID = +BING_SEARCH_KEY = +BING_ENDPOINT = diff --git a/finesse/accuracy_functions.py b/finesse/accuracy_functions.py index 341422d..9d95b2e 100644 --- a/finesse/accuracy_functions.py +++ b/finesse/accuracy_functions.py @@ -4,7 +4,7 @@ import os from collections import namedtuple import regex as re -from finesse.google_search import search_google_urls +from finesse.bing_search import search_bing_urls OUTPUT_FOLDER = "./finesse/output" AccuracyResult = namedtuple("AccuracyResult", ["position", "total_pages", "score"]) @@ -138,7 +138,7 @@ def update_dict_google_data(test_data: dict): question = value.get("question") expected_url = value.get("expected_page").get("url") top = value.get("top") - google_response_url = search_google_urls(question, top) + google_response_url = search_bing_urls(question, top) google_accuracy_result = calculate_accuracy(google_response_url, expected_url) value["google_accuracy"] = google_accuracy_result.score count += 1 diff --git a/finesse/bing_search.py b/finesse/bing_search.py new file mode 100644 index 0000000..1013780 --- /dev/null +++ b/finesse/bing_search.py @@ -0,0 +1,28 @@ + +import os +from pprint import pprint +import requests +from dotenv import load_dotenv +import os + +def search_bing_urls(query: str, num_results: int = 100) -> list[str]: + load_dotenv() + urls = [] + endpoint = os.getenv("BING_ENDPOINT") + "/v7.0/search" + subscription_key = os.getenv("BING_SEARCH_KEY") + mkt = 'en-US' + params = { 'q': query, 'mkt': mkt, 'count': 50 } + headers = { 'Ocp-Apim-Subscription-Key': subscription_key } + # Call the API + try: + response = requests.get(endpoint, headers=headers, params=params) + response.raise_for_status() + + print("\nHeaders:\n") + print(response.headers) + + print("\nJSON Response:\n") + pprint(response.json()) + + except Exception as ex: + raise ex diff --git a/finesse/google_search.py b/finesse/google_search.py deleted file mode 100644 index 
05271bf..0000000 --- a/finesse/google_search.py +++ /dev/null @@ -1,31 +0,0 @@ -from googleapiclient.discovery import build -from dotenv import load_dotenv -import os - -def google_search(search_term, api_key, cse_id, **kwargs): - service = build("customsearch", "v1", developerKey=api_key) - res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute() - return res['items'] - -def search_google_urls(query: str, num_results: int = 100) -> list[str]: - """ - Retrieves a list of Google search result URLs for the given query using the Google API. - - Args: - query (str): The search query. - num_results (int, optional): The number of search results to retrieve. Defaults to 100. - - Returns: - list[str]: A list of URLs representing the search results. - - Raises: - Exception: If the request limit is exceeded (error 429 Too Many Requests). - """ - load_dotenv() - links = [] - api_key = os.getenv("GOOGLE_API_KEY") - cse_id = os.getenv("GOOGLE_CSE_ID") - results = google_search(query, api_key, cse_id, start=11) - for item in results: - links.append(item['link']) - return links diff --git a/requirements.txt b/requirements.txt index 8acc2a0..71f973f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ locust regex -google-api-python-client python-dotenv diff --git a/tests/test_google_search.py b/tests/test_bing_search.py similarity index 57% rename from tests/test_google_search.py rename to tests/test_bing_search.py index 0285042..a5cc355 100644 --- a/tests/test_google_search.py +++ b/tests/test_bing_search.py @@ -1,11 +1,11 @@ import unittest -from finesse.google_search import search_google_urls +from finesse.bing_search import search_bing_urls -class TestGoogleSearch(unittest.TestCase): - def test_get_google_search_urls(self): +class TestBingSearch(unittest.TestCase): + def test_get_bing_search_urls(self): query = "Canada Food Inspection Agency" num_results = 100 - urls = search_google_urls(query, num_results) + urls = search_bing_urls(query, 
num_results) self.assertEqual(len(urls), num_results) self.assertTrue(all(url.startswith("http") for url in urls)) From 478660d42c615aa11b8e4a4dc65e9cfbfbbcbbc7 Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Tue, 26 Mar 2024 15:50:38 -0400 Subject: [PATCH 10/15] issue #6: Refactoring + Bing Search + Bing Filtered Search --- finesse/accuracy_functions.py | 170 ++++++++++++++++++++-------------- finesse/bing_search.py | 59 +++++++----- finesse/finesse_test.py | 31 ++++--- requirements.txt | 2 + 4 files changed, 151 insertions(+), 111 deletions(-) diff --git a/finesse/accuracy_functions.py b/finesse/accuracy_functions.py index 9d95b2e..1ab3ab9 100644 --- a/finesse/accuracy_functions.py +++ b/finesse/accuracy_functions.py @@ -4,7 +4,8 @@ import os from collections import namedtuple import regex as re -from finesse.bing_search import search_bing_urls +from finesse.bing_search import BingSearch +from dotenv import load_dotenv OUTPUT_FOLDER = "./finesse/output" AccuracyResult = namedtuple("AccuracyResult", ["position", "total_pages", "score"]) @@ -17,12 +18,15 @@ def calculate_accuracy(responses_url: list[str], expected_url: str) -> AccuracyR for idx, response_url in enumerate(responses_url): if response_url.startswith("https://inspection.canada.ca"): - response_number = int(re.findall(r'/(\d+)/', response_url)[0]) - if response_number == expected_number: - position = idx - score = 1 - (position / total_pages) - score= round(score, 2) - break + try: + response_number = int(re.findall(r'/(\d+)/', response_url)[0]) + if response_number == expected_number: + position = idx + score = 1 - (position / total_pages) + score= round(score, 2) + break + except IndexError: + pass return AccuracyResult(position, total_pages, score) @@ -35,23 +39,35 @@ def save_to_markdown(test_data: dict, engine: str): with open(output_file, "w") as md_file: md_file.write(f"# Test on the {engine} search engine: {date_string}\n\n") md_file.write("## Test data table\n\n") - md_file.write("| πŸ“„ 
File | πŸ’¬ Question | πŸ“ Accuracy Score | 🌐 Google Score |βŒ› Time |\n") - md_file.write("|--------------------|-------------------------------------------------------------------------------------------------------------------------|----------------|----------|\n") + md_file.write("| πŸ“„ File | πŸ’¬ Question| πŸ”Ž Finesse Accuracy Score | 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |βŒ› Finesse Time | βŒ› Bing Time | βŒ› Filtered Bing Time |\n") + md_file.write("|---|---|---|---|---|---|---|---|\n") for key, value in test_data.items(): - md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100}% | {value.get('google_accuracy')*100}% |{int(value.get('time'))}ms |\n") + md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {int(value.get('accuracy')*100)}% | {int(value.get('bing_accuracy')*100)}% |{int(value.get('bing_filtered_accuracy')*100)}% |{int(value.get('time'))}ms | {int(value.get('bing_time'))}ms | {int(value.get('bing_filtered_time'))}ms |\n") md_file.write("\n") md_file.write(f"Tested on {len(test_data)} files.\n\n") - time_stats, accuracy_stats, google_stats = calculate_statistical_summary(test_data) + time_stats, accuracy_stats, bing_accuracy_stats, bing_time_stats, bing_filtered_accuracy_stats, bing_filtered_time_stats = calculate_statistical_summary(test_data) md_file.write("## Statistical summary\n\n") - md_file.write("| Statistic | βŒ› Time | πŸ“ Accuracy score| 🌐 Google Score |\n") - md_file.write("|-----------------------|------------|---------|\n") - md_file.write(f"|Mean| {int(time_stats.get('Mean'))}ms | {int(accuracy_stats.get('Mean')*100)}% |{int(google_stats.get('Mean')*100)}% |\n") - md_file.write(f"|Median| {int(time_stats.get('Median'))}ms | {int(accuracy_stats.get('Median')*100)}% | {int(google_stats.get('Median')*100)}% |\n") - md_file.write(f"|Standard Deviation| {int(time_stats.get('Standard Deviation'))}ms | 
{int(accuracy_stats.get('Standard Deviation')*100)}% | {int(google_stats.get('Standard Deviation')*100)}% |\n") - md_file.write(f"|Maximum| {int(time_stats.get('Maximum'))}ms | {int(accuracy_stats.get('Maximum')*100)}% | {int(google_stats.get('Maximum')*100)}% |\n") - md_file.write(f"|Minimum| {int(time_stats.get('Minimum'))}ms | {int(accuracy_stats.get('Minimum')*100)}% | {int(google_stats.get('Minimum')*100)}% |\n") - md_file.write(f"\nThere are a total of {len([result.get('accuracy') for result in test_data.values() if result.get('accuracy') == 0])} null scores\n") + md_file.write("| Statistic\Engine | πŸ”Ž Finesse Accuracy score| 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |βŒ› Finesse Time | βŒ› Bing Time | βŒ› Filtered Bing Time |\n") + md_file.write("|---|---|---|---|---|---|---|\n") + for stat in ["Mean", "Median", "Standard Deviation", "Maximum", "Minimum"]: + md_file.write(f"|{stat}| {accuracy_stats.get(stat)}% | {bing_accuracy_stats.get(stat)}% | {bing_filtered_accuracy_stats.get(stat)}% |{time_stats.get(stat)}ms | {bing_time_stats.get(stat)}ms | {bing_filtered_time_stats.get(stat)}ms |\n") + + md_file.write("\n## Count of null and top scores\n\n") + md_file.write("| Score\Engine | πŸ”Ž Finesse Accuracy score| 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |\n") + md_file.write("|---|---|---|---|\n") + finesse_null, finesse_top = count_null_top_scores({key: value.get("accuracy") for key, value in test_data.items()}) + bing_null, bing_top = count_null_top_scores({key: value.get("bing_accuracy") for key, value in test_data.items()}) + bing_filtered_null, bing_filtered_top = count_null_top_scores({key: value.get("bing_filtered_accuracy") for key, value in test_data.items()}) + + md_file.write(f"| Null (0%) | {finesse_null} | {bing_null} |{bing_filtered_null} |\n") + md_file.write(f"| Top (100%)| {finesse_top} | {bing_top} |{bing_filtered_top} |\n") + +def count_null_top_scores(accuracy_scores: dict): + null_scores = len([score for score 
in accuracy_scores.values() if score == 0]) + top_scores = len([score for score in accuracy_scores.values() if score == 1]) + + return null_scores, top_scores def save_to_csv(test_data: dict, engine: str): if not os.path.exists(OUTPUT_FOLDER): @@ -71,7 +87,7 @@ def save_to_csv(test_data: dict, engine: str): ]) writer.writerow([]) - time_stats, accuracy_stats = calculate_statistical_summary(test_data) + time_stats, accuracy_stats, bing_stats = calculate_statistical_summary(test_data) writer.writerow(["Statistic", "Time", "Accuracy Score"]) writer.writerow(["Mean", f"{int(time_stats.get('Mean'))}", f"{int(accuracy_stats.get('Mean'))}"]) writer.writerow(["Median", f"{int(time_stats.get('Median'))}", f"{int(accuracy_stats.get('Median'))}"]) @@ -79,67 +95,77 @@ def save_to_csv(test_data: dict, engine: str): writer.writerow(["Maximum", f"{int(time_stats.get('Maximum'))}", f"{int(accuracy_stats.get('Maximum'))}"]) writer.writerow(["Minimum", f"{int(time_stats.get('Minimum'))}", f"{int(accuracy_stats.get('Minimum'))}"]) -def log_data(test_data: dict): - for key, value in test_data.items(): - print("File:", key) - print("Question:", value.get("question")) - print("Expected URL:", value.get("expected_page").get("url")) - print(f'Accuracy Score: {int(value.get("accuracy")*100)}%') - print(f'Time: {int(value.get("time"))}ms') - print() - time_stats, accuracy_stats = calculate_statistical_summary(test_data) - print("---") - print(f"Tested on {len(test_data)} files.") - print("Time statistical summary:", end="\n ") - for key,value in time_stats.items(): - print(f"{key}:{int(value)},", end=' ') - print("\nAccuracy statistical summary:", end="\n ") - for key,value in accuracy_stats.items(): - print(f"{key}:{int(value*100)}%,", end=' ') - print("\n---") - - -def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict]: +def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict, dict, dict, dict, dict]: + def calculate_stats(data: list) -> dict: + stats = { + 
"Mean": statistics.mean(data), + "Median": statistics.median(data), + "Standard Deviation": statistics.stdev(data), + "Maximum": max(data), + "Minimum": min(data), + } + return stats + + def round_values(stats: dict) -> dict: + return {key: int(round(value, 3)) for key, value in stats.items()} + + def convert_to_percentage(stats: dict) -> dict: + return {key: int(round(value * 100, 2)) for key, value in stats.items()} + times = [result.get("time") for result in test_data.values()] accuracies = [result.get("accuracy") for result in test_data.values()] - google = [result.get("google_accuracy") for result in test_data.values()] - time_stats = { - "Mean": round(statistics.mean(times), 3), - "Median": round(statistics.median(times), 3), - "Standard Deviation": round(statistics.stdev(times), 3), - "Maximum": round(max(times), 3), - "Minimum": round(min(times), 3), - } - accuracy_stats = { - "Mean": round(statistics.mean(accuracies), 2), - "Median": round(statistics.median(accuracies), 2), - "Standard Deviation": round(statistics.stdev(accuracies), 2), - "Maximum": round(max(accuracies), 2), - "Minimum": round(min(accuracies), 2), - } - google_stats= { - "Mean": round(statistics.mean(google), 2), - "Median": round(statistics.median(google), 2), - "Standard Deviation": round(statistics.stdev(google), 2), - "Maximum": round(max(google), 2), - "Minimum": round(min(google), 2), - } - return time_stats, accuracy_stats, google_stats - -def update_dict_google_data(test_data: dict): + bing_accuracies = [result.get("bing_accuracy") for result in test_data.values()] + bing_times = [result.get("bing_time") for result in test_data.values()] + bing_filtered_accuracies = [result.get("bing_filtered_accuracy") for result in test_data.values()] + bing_filtered_times = [result.get("bing_filtered_time") for result in test_data.values()] + + time_stats = calculate_stats(times) + accuracy_stats = calculate_stats(accuracies) + bing_accuracy_stats = calculate_stats(bing_accuracies) + 
bing_times_stats = calculate_stats(bing_times) + bing_filtered_accuracy_stats = calculate_stats(bing_filtered_accuracies) + bing_filtered_times_stats = calculate_stats(bing_filtered_times) + + time_stats = round_values(time_stats) + bing_times_stats = round_values(bing_times_stats) + bing_filtered_times_stats = round_values(bing_filtered_times_stats) + bing_accuracy_stats = convert_to_percentage(bing_accuracy_stats) + accuracy_stats = convert_to_percentage(accuracy_stats) + bing_filtered_accuracy_stats = convert_to_percentage(bing_filtered_accuracy_stats) + + return time_stats, accuracy_stats, bing_accuracy_stats, bing_times_stats, bing_filtered_accuracy_stats, bing_filtered_times_stats + +def update_dict_bing_data(test_data: dict): """ - Updates the given test_data dictionary with the Google accuracy results. + Updates the given test_data dictionary with the bing accuracy results. Args: test_data (dict): The dictionary containing the test data. """ - count = 0 + load_dotenv() + endpoint = os.getenv("BING_ENDPOINT") + subscription_key = os.getenv("BING_SEARCH_KEY") + search_engine = BingSearch(endpoint, subscription_key) + count = 1 for key, value in test_data.items(): question = value.get("question") expected_url = value.get("expected_page").get("url") top = value.get("top") - google_response_url = search_bing_urls(question, top) - google_accuracy_result = calculate_accuracy(google_response_url, expected_url) - value["google_accuracy"] = google_accuracy_result.score + response_url, time_elapsed = search_engine.search_urls(question, top) + accuracy_result = calculate_accuracy(response_url, expected_url) + value["bing_accuracy"] = accuracy_result.score + value["bing_time"] = time_elapsed + print(f"{count} files are done") + count += 1 + + count = 1 + for key, value in test_data.items(): + question = f"site:inspection.canada.ca {value.get('question')}" + expected_url = value.get("expected_page").get("url") + top = value.get("top") + response_url, time_elapsed = 
search_engine.search_urls(question, top) + accuracy_result = calculate_accuracy(response_url, expected_url) + value["bing_filtered_accuracy"] = accuracy_result.score + value["bing_filtered_time"] = time_elapsed + print(f"{count} files are done") count += 1 - print(f"{count} file is done") diff --git a/finesse/bing_search.py b/finesse/bing_search.py index 1013780..8c45a6b 100644 --- a/finesse/bing_search.py +++ b/finesse/bing_search.py @@ -1,28 +1,39 @@ +from azure.cognitiveservices.search.websearch import WebSearchClient +from msrest.authentication import CognitiveServicesCredentials +import time +import statistics +class BingSearch(): + """ + A class for performing web searches using the Bing Search API. + """ -import os -from pprint import pprint -import requests -from dotenv import load_dotenv -import os + def __init__(self, endpoint, subscription_key): + self.endpoint = endpoint + self.subscription_key = subscription_key + self.client = WebSearchClient(endpoint=self.endpoint, credentials=CognitiveServicesCredentials(self.subscription_key)) + self.client.config.base_url = '{Endpoint}/v7.0' # Temporary change to fix the error. Issue opened https://github.com/Azure/azure-sdk-for-python/issues/34917 -def search_bing_urls(query: str, num_results: int = 100) -> list[str]: - load_dotenv() - urls = [] - endpoint = os.getenv("BING_ENDPOINT") + "/v7.0/search" - subscription_key = os.getenv("BING_SEARCH_KEY") - mkt = 'en-US' - params = { 'q': query, 'mkt': mkt, 'count': 50 } - headers = { 'Ocp-Apim-Subscription-Key': subscription_key } - # Call the API - try: - response = requests.get(endpoint, headers=headers, params=params) - response.raise_for_status() + def search_urls(self, query: str, num_results: int = 100) -> tuple[list[str], float]: + """ + Search for URLs using the Bing Search API. - print("\nHeaders:\n") - print(response.headers) + Args: + query (str): The search query. + num_results (int, optional): The number of results to retrieve. Defaults to 100. 
- print("\nJSON Response:\n") - pprint(response.json()) - - except Exception as ex: - raise ex + Returns: + tuple[list[str], float]: A tuple containing a list of URLs and the average elapsed time for the search. + """ + urls = [] + elapsed_time = [] + offset = 0 + # Limit of 50 results per query and Bing Search return less than 50 web results + while len(urls) < num_results: + start_time = time.time() + web_data = self.client.web.search(query=query, market="en-ca", count=50, response_filter=["Webpages"], offset=offset) + elapsed_time.append(time.time() - start_time) + if hasattr(web_data, 'web_pages') and web_data.web_pages is not None: + urls.extend([item.url for item in web_data.web_pages.value]) + offset += len([item.url for item in web_data.web_pages.value]) + urls = urls[:num_results] + return urls, statistics.mean(elapsed_time) * 1000 diff --git a/finesse/finesse_test.py b/finesse/finesse_test.py index 2cacb64..89e02fc 100644 --- a/finesse/finesse_test.py +++ b/finesse/finesse_test.py @@ -2,9 +2,10 @@ from jsonreader import JSONReader import os import json -from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy, update_dict_google_data +from accuracy_functions import save_to_markdown, save_to_csv, calculate_accuracy, update_dict_bing_data from host import is_host_up +global_test_data = dict() class NoTestDataError(Exception): """Raised when all requests have failed and there is no test data""" @@ -58,12 +59,12 @@ def search_accuracy(self): for page in response_pages: response_url.append(page.get("url")) accuracy_result = calculate_accuracy(response_url, expected_url) - time_taken = round(response.elapsed.microseconds/1000,3) + time_taken = round(response.elapsed.total_seconds()*1000,3) expected_page = json_data.copy() del expected_page['question'] del expected_page['answer'] - self.qna_results[file_name] = { + global_test_data[file_name] = { "question": question, "expected_page": expected_page, "response_pages": 
response_pages, @@ -76,22 +77,11 @@ def search_accuracy(self): def on_start(self): self.qna_reader = JSONReader(self.path) - self.qna_results = dict() def on_stop(self): - if not self.qna_results: + if not global_test_data: raise NoTestDataError - print("Search accuracy test completed") - print("Starting google search test") - - update_dict_google_data(self.qna_results) - log_data(self.qna_results) - if self.format == "md": - save_to_markdown(self.qna_results, self.engine) - elif self.format == "csv": - save_to_csv(self.qna_results, self.engine) - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.path = self.environment.parsed_options.path @@ -99,3 +89,14 @@ def __init__(self, *args, **kwargs): self.format = self.environment.parsed_options.format self.once = self.environment.parsed_options.once self.top = self.environment.parsed_options.top + +@events.quitting.add_listener +def quitting(environment, **_kwargs): + print("Search accuracy test completed") + print("Starting bing search test") + + update_dict_bing_data(global_test_data) + if environment.parsed_options.format == "md": + save_to_markdown(global_test_data, environment.parsed_options.engine) + elif environment.parsed_options.format == "csv": + save_to_csv(global_test_data, environment.parsed_options.engine) diff --git a/requirements.txt b/requirements.txt index 71f973f..5c18b02 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ locust regex python-dotenv +azure-cognitiveservices-search-websearch +msrest From 43698dad7530f000e3cbb8342580948a2a083df1 Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Wed, 27 Mar 2024 16:52:23 -0400 Subject: [PATCH 11/15] issue #6: Refactored + Fix markdown issue on links + Add parsing script to the repo+ Review csv function+ Sort files by number --- .gitignore | 7 +- finesse/FINESSE_USAGE.md | 57 +++++++++++-- finesse/accuracy_functions.py | 110 ++++++++++++++++++++----- finesse/bing_search.py | 5 +- finesse/finesse_test.py | 23 
++++-- finesse/jsonreader.py | 4 +- finesse/scripts/xlsx_converter_json.py | 56 +++++++++++++ requirements.txt | 2 + tests/test_accuracy_functions.py | 18 ++++ tests/test_bing_search.py | 17 +++- 10 files changed, 254 insertions(+), 45 deletions(-) create mode 100644 finesse/scripts/xlsx_converter_json.py diff --git a/.gitignore b/.gitignore index 12a7043..a9b9779 100644 --- a/.gitignore +++ b/.gitignore @@ -43,5 +43,8 @@ flask_session/ # Ignore local QnA json files QnA -# Ignore output of api-test -output +# Ignore output of api-test and from the scripts +output/ + +# Ignore input of the scripts +input/ diff --git a/finesse/FINESSE_USAGE.md b/finesse/FINESSE_USAGE.md index ad0b242..7c28387 100644 --- a/finesse/FINESSE_USAGE.md +++ b/finesse/FINESSE_USAGE.md @@ -1,8 +1,8 @@ # How to use the Finesse Locust script This tool simplifies the process of comparing different search engines and -assessing their accuracy. It's designed to be straightforward, making it easy -to understand and use. +assessing their accuracy. It's designed to be straightforward, making it easy to +understand and use. ## How it Works @@ -16,8 +16,8 @@ to understand and use. - `static`: Static search engine - `llamaindex`: LlamaIndex search engine - `--path [directory path]`: Point to the directory with files structured - - `--host [API URL]`: Point to the finesse-backend URL - with JSON files with the following properties: + - `--host [API URL]`: Point to the finesse-backend URL with JSON files with + the following properties: - `score`: The score of the page. - `crawl_id`: The unique identifier associated with the crawl table. - `chunk_id`: The unique identifier of the chunk. @@ -43,7 +43,8 @@ to understand and use. 
- **Round trip time** - Measure round trip time of each request - **Summary statistical value** - - Measure the average, median, standard deviation, minimum and maximal accuracy scores and round trip time + - Measure the average, median, standard deviation, minimum and maximal + accuracy scores and round trip time ## Diagram @@ -100,3 +101,49 @@ Accuracy statistical summary: This example shows how the CLI Output of the tool, analyzing search results from Azure Search and providing an accuracy score for Finesse. + +## Scripts + +### XLSX Converter to JSON 📄 + +This script converts data from an Excel file (.xlsx) into JSON format. It is +used for questions written by non-developers, who find an Excel sheet easier +to read than a JSON file. + +### Usage + +1. **Input Excel File**: Place the Excel file containing the data in the + specified input folder (`--input-folder`). By default, the input folder is + set to `'finesse/scripts/input/'`. + +2. **Output Folder**: Specify the folder where the resulting JSON files will be + saved using the `--output-folder` argument. By default, the output folder is + set to `'finesse/scripts/output/'`. + +3. **Input File Name**: Provide the name of the input Excel file using the + `--file-name` argument. + +4. **Worksheet Name**: Specify the name of the worksheet containing the data + using the `--sheet-name` argument. By default, it is set to `'To fill'`. + +### Example Command + +```bash +python finesse/scripts/xlsx_converter_json.py --input-folder finesse/scripts/input/ --output-folder finesse/scripts/output/ --file-name Finesse_questions_for_testing.xlsx --sheet-name "To fill" +``` + +Replace `Finesse_questions_for_testing.xlsx` with the actual name of your input +Excel file and `"To fill"` with the name of the worksheet containing the data. + +### Output + +The script generates individual JSON files for each row of data in the specified +output folder.
Each JSON file contains the following fields: + +- `question`: The question extracted from the Excel file. +- `answer`: The answer extracted from the Excel file. +- `title`: The title(s) extracted from specified columns in the Excel file. +- `url`: The URL(s) extracted from specified columns in the Excel file. + +Upon completion, the script prints "Conversion terminée !" (Conversion +completed!) to indicate that the conversion process is finished. diff --git a/finesse/accuracy_functions.py b/finesse/accuracy_functions.py index 1ab3ab9..05fd423 100644 --- a/finesse/accuracy_functions.py +++ b/finesse/accuracy_functions.py @@ -10,17 +10,37 @@ OUTPUT_FOLDER = "./finesse/output" AccuracyResult = namedtuple("AccuracyResult", ["position", "total_pages", "score"]) -def calculate_accuracy(responses_url: list[str], expected_url: str) -> AccuracyResult: +def calculate_accuracy(responses_url: list[str], expected_url: list | str) -> AccuracyResult: + """ + Calculates the accuracy of the responses by comparing the URLs of the responses with the expected URL. + + Args: + responses_url (list[str]): A list of URLs representing the responses. + expected_url (list[str] | str): The expected URL or a list of expected URLs. + + Returns: + AccuracyResult: An object containing the position, total pages, and score of the accuracy calculation.
+ """ position: int = 0 total_pages: int = len(responses_url) score: float = 0.0 - expected_number = int(re.findall(r'/(\d+)/', expected_url)[0]) + expected_number = [] + + PATTERN = r'/(\d+)/' + if isinstance(expected_url, list): + for url in expected_url: + if url.startswith("https://inspection.canada.ca"): + number = int(re.findall(PATTERN, url)[0]) + expected_number.append(number) + elif isinstance(expected_url, str) and expected_url.startswith("https://inspection.canada.ca"): + number = int(re.findall(PATTERN, expected_url)[0]) + expected_number.append(number) for idx, response_url in enumerate(responses_url): if response_url.startswith("https://inspection.canada.ca"): try: - response_number = int(re.findall(r'/(\d+)/', response_url)[0]) - if response_number == expected_number: + response_number = int(re.findall(PATTERN, response_url)[0]) + if response_number in expected_number: position = idx score = 1 - (position / total_pages) score= round(score, 2) @@ -42,7 +62,15 @@ def save_to_markdown(test_data: dict, engine: str): md_file.write("| πŸ“„ File | πŸ’¬ Question| πŸ”Ž Finesse Accuracy Score | 🌐 Bing Accuracy Score | 🌐 Filtered Bing Accuracy Score |βŒ› Finesse Time | βŒ› Bing Time | βŒ› Filtered Bing Time |\n") md_file.write("|---|---|---|---|---|---|---|---|\n") for key, value in test_data.items(): - md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {int(value.get('accuracy')*100)}% | {int(value.get('bing_accuracy')*100)}% |{int(value.get('bing_filtered_accuracy')*100)}% |{int(value.get('time'))}ms | {int(value.get('bing_time'))}ms | {int(value.get('bing_filtered_time'))}ms |\n") + question = "" + if isinstance(value.get("expected_page").get("url"), list): + question = f"{value.get('question')} " + for index, url in enumerate(value.get("expected_page").get("url")): + question += f"\| [Link{index+1}]({url}) " + question += "\|" + else: + question = 
f"[{value.get('question')}]({value.get('expected_page').get('url')})" + md_file.write(f"| {key} | {question} | {int(value.get('accuracy')*100)}% | {int(value.get('bing_accuracy')*100)}% |{int(value.get('bing_filtered_accuracy')*100)}% |{int(value.get('time'))}ms | {int(value.get('bing_time'))}ms | {int(value.get('bing_filtered_time'))}ms |\n") md_file.write("\n") md_file.write(f"Tested on {len(test_data)} files.\n\n") @@ -64,6 +92,15 @@ def save_to_markdown(test_data: dict, engine: str): md_file.write(f"| Top (100%)| {finesse_top} | {bing_top} |{bing_filtered_top} |\n") def count_null_top_scores(accuracy_scores: dict): + """ + Counts the number of null scores and top scores in the given accuracy_scores dictionary. + + Args: + accuracy_scores (dict): A dictionary containing accuracy scores. + + Returns: + tuple: A tuple containing the count of null scores and top scores, respectively. + """ null_scores = len([score for score in accuracy_scores.values() if score == 0]) top_scores = len([score for score in accuracy_scores.values() if score == 1]) @@ -77,25 +114,52 @@ def save_to_csv(test_data: dict, engine: str): output_file = os.path.join(OUTPUT_FOLDER, file_name) with open(output_file, "w", newline="") as csv_file: writer = csv.writer(csv_file) - writer.writerow(["File", "Question", "Accuracy Score", "Time"]) + writer.writerow(["File", "Question", "Finesse Accuracy Score", "Bing Accuracy Score", "Filtered Bing Accuracy Score", "Finesse Time", "Bing Time", "Filtered Bing Time"]) for key, value in test_data.items(): + question = "" + if isinstance(value.get("expected_page").get("url"), list): + question = f"{value.get('question')} " + for index, url in enumerate(value.get("expected_page").get("url")): + question += f"[{index+1}]({url}) " + else: + question = f"[{value.get('question')}]({value.get('expected_page').get('url')})" writer.writerow([ key, - value.get("question"), - f"{value.get('accuracy')}", - f"{int(value.get('time'))}" + question, + 
f"{int(value.get('accuracy')*100)}%", + f"{int(value.get('bing_accuracy')*100)}%", + f"{int(value.get('bing_filtered_accuracy')*100)}%", + f"{int(value.get('time'))}ms", + f"{int(value.get('bing_time'))}ms", + f"{int(value.get('bing_filtered_time'))}ms" ]) writer.writerow([]) - time_stats, accuracy_stats, bing_stats = calculate_statistical_summary(test_data) - writer.writerow(["Statistic", "Time", "Accuracy Score"]) - writer.writerow(["Mean", f"{int(time_stats.get('Mean'))}", f"{int(accuracy_stats.get('Mean'))}"]) - writer.writerow(["Median", f"{int(time_stats.get('Median'))}", f"{int(accuracy_stats.get('Median'))}"]) - writer.writerow(["Standard Deviation", f"{int(time_stats.get('Standard Deviation'))}", f"{int(accuracy_stats.get('Standard Deviation'))}"]) - writer.writerow(["Maximum", f"{int(time_stats.get('Maximum'))}", f"{int(accuracy_stats.get('Maximum'))}"]) - writer.writerow(["Minimum", f"{int(time_stats.get('Minimum'))}", f"{int(accuracy_stats.get('Minimum'))}"]) + time_stats, accuracy_stats, bing_accuracy_stats, bing_time_stats, bing_filtered_accuracy_stats, bing_filtered_time_stats = calculate_statistical_summary(test_data) + writer.writerow(["Statistic", "Finesse Accuracy Score", "Bing Accuracy Score", "Filtered Bing Accuracy Score", "Finesse Time", "Bing Time", "Filtered Bing Time"]) + writer.writerow(["Mean", f"{accuracy_stats.get('Mean')}%", f"{bing_accuracy_stats.get('Mean')}%", f"{bing_filtered_accuracy_stats.get('Mean')}%", f"{time_stats.get('Mean')}ms", f"{bing_time_stats.get('Mean')}ms", f"{bing_filtered_time_stats.get('Mean')}ms"]) + writer.writerow(["Median", f"{accuracy_stats.get('Median')}%", f"{bing_accuracy_stats.get('Median')}%", f"{bing_filtered_accuracy_stats.get('Median')}%", f"{time_stats.get('Median')}ms", f"{bing_time_stats.get('Median')}ms", f"{bing_filtered_time_stats.get('Median')}ms"]) + writer.writerow(["Standard Deviation", f"{accuracy_stats.get('Standard Deviation')}%", f"{bing_accuracy_stats.get('Standard Deviation')}%", 
f"{bing_filtered_accuracy_stats.get('Standard Deviation')}%", f"{time_stats.get('Standard Deviation')}ms", f"{bing_time_stats.get('Standard Deviation')}ms", f"{bing_filtered_time_stats.get('Standard Deviation')}ms"]) + writer.writerow(["Maximum", f"{accuracy_stats.get('Maximum')}%", f"{bing_accuracy_stats.get('Maximum')}%", f"{bing_filtered_accuracy_stats.get('Maximum')}%", f"{time_stats.get('Maximum')}ms", f"{bing_time_stats.get('Maximum')}ms", f"{bing_filtered_time_stats.get('Maximum')}ms"]) + writer.writerow(["Minimum", f"{accuracy_stats.get('Minimum')}%", f"{bing_accuracy_stats.get('Minimum')}%", f"{bing_filtered_accuracy_stats.get('Minimum')}%", f"{time_stats.get('Minimum')}ms", f"{bing_time_stats.get('Minimum')}ms", f"{bing_filtered_time_stats.get('Minimum')}ms"]) def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict, dict, dict, dict, dict]: + """ + Calculate the statistical summary of the test data. + + Args: + test_data (dict): A dictionary containing the test data. + + Returns: + tuple[dict, dict, dict, dict, dict, dict]: A tuple containing the statistical summary for different metrics. + The tuple contains the following dictionaries: + - time_stats: Statistical summary for the 'time' metric. + - accuracy_stats: Statistical summary for the 'accuracy' metric. + - bing_accuracy_stats: Statistical summary for the 'bing_accuracy' metric. + - bing_times_stats: Statistical summary for the 'bing_times' metric. + - bing_filtered_accuracy_stats: Statistical summary for the 'bing_filtered_accuracy' metric. + - bing_filtered_times_stats: Statistical summary for the 'bing_filtered_times' metric. + """ def calculate_stats(data: list) -> dict: stats = { "Mean": statistics.mean(data), @@ -142,30 +206,32 @@ def update_dict_bing_data(test_data: dict): Args: test_data (dict): The dictionary containing the test data. 
""" + copy_data = test_data.copy() load_dotenv() endpoint = os.getenv("BING_ENDPOINT") subscription_key = os.getenv("BING_SEARCH_KEY") search_engine = BingSearch(endpoint, subscription_key) count = 1 - for key, value in test_data.items(): + for key, value in copy_data.items(): question = value.get("question") expected_url = value.get("expected_page").get("url") top = value.get("top") response_url, time_elapsed = search_engine.search_urls(question, top) accuracy_result = calculate_accuracy(response_url, expected_url) - value["bing_accuracy"] = accuracy_result.score - value["bing_time"] = time_elapsed + test_data[key]["bing_accuracy"] = accuracy_result.score + test_data[key]["bing_time"] = time_elapsed print(f"{count} files are done") count += 1 + print("Second Bing Search Test") count = 1 - for key, value in test_data.items(): + for key, value in copy_data.items(): question = f"site:inspection.canada.ca {value.get('question')}" expected_url = value.get("expected_page").get("url") top = value.get("top") response_url, time_elapsed = search_engine.search_urls(question, top) accuracy_result = calculate_accuracy(response_url, expected_url) - value["bing_filtered_accuracy"] = accuracy_result.score - value["bing_filtered_time"] = time_elapsed + test_data[key]["bing_filtered_accuracy"] = accuracy_result.score + test_data[key]["bing_filtered_time"] = time_elapsed print(f"{count} files are done") count += 1 diff --git a/finesse/bing_search.py b/finesse/bing_search.py index 8c45a6b..a9c5016 100644 --- a/finesse/bing_search.py +++ b/finesse/bing_search.py @@ -34,6 +34,9 @@ def search_urls(self, query: str, num_results: int = 100) -> tuple[list[str], fl elapsed_time.append(time.time() - start_time) if hasattr(web_data, 'web_pages') and web_data.web_pages is not None: urls.extend([item.url for item in web_data.web_pages.value]) - offset += len([item.url for item in web_data.web_pages.value]) + try: + offset += len([item.url for item in web_data.web_pages.value]) + except 
AttributeError: + break urls = urls[:num_results] return urls, statistics.mean(elapsed_time) * 1000 diff --git a/finesse/finesse_test.py b/finesse/finesse_test.py index 89e02fc..8ed9aee 100644 --- a/finesse/finesse_test.py +++ b/finesse/finesse_test.py @@ -6,6 +6,7 @@ from host import is_host_up global_test_data = dict() +settings = dict() class NoTestDataError(Exception): """Raised when all requests have failed and there is no test data""" @@ -60,7 +61,6 @@ def search_accuracy(self): response_url.append(page.get("url")) accuracy_result = calculate_accuracy(response_url, expected_url) time_taken = round(response.elapsed.total_seconds()*1000,3) - expected_page = json_data.copy() del expected_page['question'] del expected_page['answer'] @@ -72,7 +72,7 @@ def search_accuracy(self): "total_pages": accuracy_result.total_pages, "accuracy": accuracy_result.score, "time": time_taken, - "top": self.top + "top": self.top, } def on_start(self): @@ -89,14 +89,19 @@ def __init__(self, *args, **kwargs): self.format = self.environment.parsed_options.format self.once = self.environment.parsed_options.once self.top = self.environment.parsed_options.top + settings["engine"] = self.engine + settings["format"] = self.format + settings["once"] = self.once + settings["top"] = self.top + settings["path"] = self.path + -@events.quitting.add_listener -def quitting(environment, **_kwargs): +@events.quit.add_listener +def quit(**_kwargs): print("Search accuracy test completed") print("Starting bing search test") - update_dict_bing_data(global_test_data) - if environment.parsed_options.format == "md": - save_to_markdown(global_test_data, environment.parsed_options.engine) - elif environment.parsed_options.format == "csv": - save_to_csv(global_test_data, environment.parsed_options.engine) + if settings.get("format") == "md": + save_to_markdown(global_test_data, "azure") + elif settings.get("format") == "csv": + save_to_csv(global_test_data, settings.get("engine")) diff --git 
a/finesse/jsonreader.py b/finesse/jsonreader.py index 8f1a8ef..ee61250 100644 --- a/finesse/jsonreader.py +++ b/finesse/jsonreader.py @@ -1,13 +1,13 @@ import json from typing import Iterator import os - +from natsort import natsorted class JSONReader(Iterator): "Read test data from JSON files using an iterator" def __init__(self, directory): self.directory = directory - self.file_list = sorted([f for f in os.listdir(directory) if f.endswith('.json')]) + self.file_list = natsorted([f for f in os.listdir(directory) if f.endswith('.json')]) if not self.file_list: raise FileNotFoundError(f"No JSON files found in the directory '{directory}'") self.current_file_index = 0 diff --git a/finesse/scripts/xlsx_converter_json.py b/finesse/scripts/xlsx_converter_json.py new file mode 100644 index 0000000..733a14b --- /dev/null +++ b/finesse/scripts/xlsx_converter_json.py @@ -0,0 +1,56 @@ +import openpyxl +import os +import json +import argparse + +parser = argparse.ArgumentParser(description='XLSX Converter to JSON') +parser.add_argument('--input-folder', dest='input_folder', default='finesse/scripts/input/', help='Path to the input folder') +parser.add_argument('--output-folder', dest='output_folder', default='finesse/scripts/output/', help='Path to the output folder') +parser.add_argument('--file-name', dest='file_name', help='Name of the input file') +parser.add_argument('--sheet-name', dest='sheet_name', default='To fill', help='Name of the worksheet') + +args = parser.parse_args() + +INPUT_FOLDER = args.input_folder +OUTPUT_FOLDER = args.output_folder +FILE_NAME = args.file_name +SHEET_NAME = args.sheet_name +FILE_PATH = INPUT_FOLDER+FILE_NAME + +workbook = openpyxl.load_workbook(FILE_PATH) +worksheet = workbook.active +count = 1 + +for row in range(5, worksheet.max_row + 1): + question = worksheet.cell(row=row, column=2).value + if question is None: + continue + + answer = worksheet.cell(row=row, column=3).value + + titles = [] + links = [] + for col in range(5, 10): + 
title = worksheet.cell(row=row, column=col).value + link = worksheet.cell(row=row, column=col).hyperlink + if title: + titles.append(title) + if link: + links.append(link.target) + + data = { + 'question': question or "", + 'answer': answer or "", + 'title': titles[0] if len(titles) == 1 else titles or "", + 'url': links[0] if len(links) == 1 else links or "" + } + + # Enregistrement du fichier JSON + output_file = os.path.join(OUTPUT_FOLDER, f'question_{count}.json') + if not os.path.exists(OUTPUT_FOLDER): + os.makedirs(OUTPUT_FOLDER) + with open(output_file, 'w', encoding='utf-8') as json_file: + json.dump(data, json_file, ensure_ascii=False, indent=4) + count += 1 + +print("Conversion completed successfully!") diff --git a/requirements.txt b/requirements.txt index 5c18b02..90effa8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ regex python-dotenv azure-cognitiveservices-search-websearch msrest +openpyxl +natsort diff --git a/tests/test_accuracy_functions.py b/tests/test_accuracy_functions.py index f50e357..b11fe57 100644 --- a/tests/test_accuracy_functions.py +++ b/tests/test_accuracy_functions.py @@ -16,5 +16,23 @@ def test_calculate_accuracy(self): self.assertEqual(result.total_pages, 4) self.assertEqual(result.score, 0.75) + def test_calculate_accuracy_multiple_expected_urls(self): + responses_url = [ + "https://inspection.canada.ca/exporting-food-plants-or-animals/food-exports/food-specific-export-requirements/meat/crfpcp/eng/1434119937443/1434120400252", + "https://inspection.canada.ca/protection-des-vegetaux/especes-envahissantes/directives/date/d-08-04/fra/1323752901318/1323753612811", + "https://inspection.canada.ca/varietes-vegetales/vegetaux-a-caracteres-nouveaux/demandeurs/directive-94-08/documents-sur-la-biologie/lens-culinaris-medikus-lentille-/fra/1330978380871/1330978449837", + "https://inspection.canada.ca/protection-des-vegetaux/especes-envahissantes/directives/date/d-96-15/fra/1323854808025/1323854941807" + ] + 
expected_urls = [ + "https://inspection.canada.ca/animal-health/terrestrial-animals/exports/pets/brunei-darussalam/eng/1475849543824/1475849672294", + "https://inspection.canada.ca/animal-health/terrestrial-animals/exports/pets/eu-commercial-/instructions/eng/1447782811647/1447782887583", + "https://inspection.canada.ca/protection-des-vegetaux/especes-envahissantes/directives/date/d-96-15/fra/1323854808025/1323854941807", + "https://inspection.canada.ca/varietes-vegetales/vegetaux-a-caracteres-nouveaux/demandeurs/directive-94-08/documents-sur-la-biologie/lens-culinaris-medikus-lentille-/fra/1330978380871/1330978449837" + ] + result = calculate_accuracy(responses_url, expected_urls) + self.assertEqual(result.position, 2) + self.assertEqual(result.total_pages, 4) + self.assertEqual(result.score, 0.5) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_bing_search.py b/tests/test_bing_search.py index a5cc355..83deba9 100644 --- a/tests/test_bing_search.py +++ b/tests/test_bing_search.py @@ -1,13 +1,22 @@ import unittest -from finesse.bing_search import search_bing_urls - +from finesse.bing_search import BingSearch +from dotenv import load_dotenv +import os class TestBingSearch(unittest.TestCase): - def test_get_bing_search_urls(self): + def test_search_urls(self): + load_dotenv() + endpoint = os.getenv("BING_ENDPOINT") + subscription_key = os.getenv("BING_SEARCH_KEY") + bing_search = BingSearch(endpoint, subscription_key) + query = "Canada Food Inspection Agency" num_results = 100 - urls = search_bing_urls(query, num_results) + + urls, elapsed_time = bing_search.search_urls(query, num_results) + self.assertEqual(len(urls), num_results) self.assertTrue(all(url.startswith("http") for url in urls)) + self.assertIsInstance(elapsed_time, float) if __name__ == "__main__": unittest.main() From 91f49fcf4f13e026c971c388692bf0321ecf4590 Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Tue, 2 Apr 2024 10:47:14 -0400 Subject: [PATCH 12/15] issue #6: typos --- 
tests/test_bing_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_bing_search.py b/tests/test_bing_search.py index 83deba9..4c73a7a 100644 --- a/tests/test_bing_search.py +++ b/tests/test_bing_search.py @@ -9,7 +9,7 @@ def test_search_urls(self): subscription_key = os.getenv("BING_SEARCH_KEY") bing_search = BingSearch(endpoint, subscription_key) - query = "Canada Food Inspection Agency" + query = "Canadian Food Inspection Agency" num_results = 100 urls, elapsed_time = bing_search.search_urls(query, num_results) From 77650ddce75273459c6dca830cfe8de612bb0a63 Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Tue, 2 Apr 2024 10:47:33 -0400 Subject: [PATCH 13/15] issue #6: typo --- finesse/FINESSE_USAGE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finesse/FINESSE_USAGE.md b/finesse/FINESSE_USAGE.md index 7c28387..6546ea1 100644 --- a/finesse/FINESSE_USAGE.md +++ b/finesse/FINESSE_USAGE.md @@ -107,8 +107,8 @@ Azure Search and providing an accuracy score for Finesse. ### XLSX Converter to JSON 📄 This script converts data from an Excel file (.xlsx) into JSON format. It is -used for questions generated created by non-developers. It is more readable an -excel than a json file. +used for questions generated created by non-developers. Excel files are easier +to read than JSON files.
### Usage From 7cc22af1dfbc5556c0864a8c6a456ae4e7e90211 Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Wed, 3 Apr 2024 13:44:36 -0400 Subject: [PATCH 14/15] issue #6: ospathjoin on script --- finesse/scripts/xlsx_converter_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finesse/scripts/xlsx_converter_json.py b/finesse/scripts/xlsx_converter_json.py index 733a14b..e455c66 100644 --- a/finesse/scripts/xlsx_converter_json.py +++ b/finesse/scripts/xlsx_converter_json.py @@ -15,7 +15,7 @@ OUTPUT_FOLDER = args.output_folder FILE_NAME = args.file_name SHEET_NAME = args.sheet_name -FILE_PATH = INPUT_FOLDER+FILE_NAME +FILE_PATH = os.path.join(INPUT_FOLDER, FILE_NAME) workbook = openpyxl.load_workbook(FILE_PATH) worksheet = workbook.active From 62b225d3d8a10e8a8830503bb99c9de4ec7fb86b Mon Sep 17 00:00:00 2001 From: Ibrahim Kabir Date: Wed, 3 Apr 2024 13:46:33 -0400 Subject: [PATCH 15/15] issue #6: gitignore cache --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index a9b9779..fba33b9 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,6 @@ output/ # Ignore input of the scripts input/ + +# Ignore the cache directory +cache/