diff --git a/.env.template b/.env.template
new file mode 100644
index 0000000..1fc0389
--- /dev/null
+++ b/.env.template
@@ -0,0 +1,2 @@
+GOOGLE_API_KEY =
+GOOGLE_CSE_ID =
diff --git a/finesse/accuracy_functions.py b/finesse/accuracy_functions.py
index ea5217c..341422d 100644
--- a/finesse/accuracy_functions.py
+++ b/finesse/accuracy_functions.py
@@ -4,6 +4,7 @@ import os
 from collections import namedtuple
 
 import regex as re
+from finesse.google_search import search_google_urls
 
 OUTPUT_FOLDER = "./finesse/output"
 AccuracyResult = namedtuple("AccuracyResult", ["position", "total_pages", "score"])
@@ -34,22 +35,22 @@ def save_to_markdown(test_data: dict, engine: str):
     with open(output_file, "w") as md_file:
         md_file.write(f"# Test on the {engine} search engine: {date_string}\n\n")
         md_file.write("## Test data table\n\n")
-        md_file.write("| 📄 File | 💬 Question | 📏 Accuracy Score | ⌛ Time |\n")
-        md_file.write("|--------------------|-------------------------------------------------------------------------------------------------------------------------|----------------|----------|\n")
+        md_file.write("| 📄 File | 💬 Question | 📏 Accuracy Score | 🌐 Google Score | ⌛ Time |\n")
+        md_file.write("|--------------------|-------------------------------------------------------------------------------------------------------------------------|----------------|-----------------|----------|\n")
         for key, value in test_data.items():
-            md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100}% | {int(value.get('time'))}ms |\n")
+            md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100}% | {value.get('google_accuracy')*100}% | {int(value.get('time'))}ms |\n")
         md_file.write("\n")
         md_file.write(f"Tested on {len(test_data)} files.\n\n")
-        time_stats, accuracy_stats = calculate_statistical_summary(test_data)
+        time_stats, accuracy_stats, google_stats = calculate_statistical_summary(test_data)
         md_file.write("## Statistical summary\n\n")
-        md_file.write("| Statistic | Time | Accuracy score|\n")
-        md_file.write("|-----------------------|------------|---------|\n")
-        md_file.write(f"|Mean| {int(time_stats.get('Mean'))}ms | {int(accuracy_stats.get('Mean')*100)}% |\n")
-        md_file.write(f"|Median| {int(time_stats.get('Median'))}ms | {int(accuracy_stats.get('Median')*100)}% |\n")
-        md_file.write(f"|Standard Deviation| {int(time_stats.get('Standard Deviation'))}ms | {int(accuracy_stats.get('Standard Deviation')*100)}% |\n")
-        md_file.write(f"|Maximum| {int(time_stats.get('Maximum'))}ms | {int(accuracy_stats.get('Maximum')*100)}% |\n")
-        md_file.write(f"|Minimum| {int(time_stats.get('Minimum'))}ms | {int(accuracy_stats.get('Minimum')*100)}% |\n")
+        md_file.write("| Statistic | ⌛ Time | 📏 Accuracy score | 🌐 Google Score |\n")
+        md_file.write("|-----------------------|------------|---------|-----------------|\n")
+        md_file.write(f"|Mean| {int(time_stats.get('Mean'))}ms | {int(accuracy_stats.get('Mean')*100)}% | {int(google_stats.get('Mean')*100)}% |\n")
+        md_file.write(f"|Median| {int(time_stats.get('Median'))}ms | {int(accuracy_stats.get('Median')*100)}% | {int(google_stats.get('Median')*100)}% |\n")
+        md_file.write(f"|Standard Deviation| {int(time_stats.get('Standard Deviation'))}ms | {int(accuracy_stats.get('Standard Deviation')*100)}% | {int(google_stats.get('Standard Deviation')*100)}% |\n")
+        md_file.write(f"|Maximum| {int(time_stats.get('Maximum'))}ms | {int(accuracy_stats.get('Maximum')*100)}% | {int(google_stats.get('Maximum')*100)}% |\n")
+        md_file.write(f"|Minimum| {int(time_stats.get('Minimum'))}ms | {int(accuracy_stats.get('Minimum')*100)}% | {int(google_stats.get('Minimum')*100)}% |\n")
         md_file.write(f"\nThere are a total of {len([result.get('accuracy') for result in test_data.values() if result.get('accuracy') == 0])} null scores\n")
 
 
 def save_to_csv(test_data: dict, engine: str):
@@ -101,6 +102,7 @@ def log_data(test_data: dict):
-def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict]:
+def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict, dict]:
     times = [result.get("time") for result in test_data.values()]
     accuracies = [result.get("accuracy") for result in test_data.values()]
+    google = [result.get("google_accuracy") for result in test_data.values()]
     time_stats = {
         "Mean": round(statistics.mean(times), 3),
         "Median": round(statistics.median(times), 3),
@@ -115,4 +117,29 @@ def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict]:
         "Maximum": round(max(accuracies), 2),
         "Minimum": round(min(accuracies), 2),
     }
-    return time_stats, accuracy_stats
+    google_stats = {
+        "Mean": round(statistics.mean(google), 2),
+        "Median": round(statistics.median(google), 2),
+        "Standard Deviation": round(statistics.stdev(google), 2),
+        "Maximum": round(max(google), 2),
+        "Minimum": round(min(google), 2),
+    }
+    return time_stats, accuracy_stats, google_stats
+
+def update_dict_google_data(test_data: dict):
+    """
+    Updates the given test_data dictionary with the Google search accuracy results.
+
+    Args:
+        test_data (dict): The dictionary containing the test data.
+    """
+    count = 0
+    for key, value in test_data.items():
+        question = value.get("question")
+        expected_url = value.get("expected_page").get("url")
+        top = value.get("top")
+        google_response_url = search_google_urls(question, top)
+        google_accuracy_result = calculate_accuracy(google_response_url, expected_url)
+        value["google_accuracy"] = google_accuracy_result.score
+        count += 1
+        print(f"{count} files done")
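For reference, `update_dict_google_data` expects each `test_data` entry to carry the fields that `finesse_test.py` stores per question (`question`, an `expected_page` dict with a `url`, and the new `top` value), issues one live Google query per entry (so `GOOGLE_API_KEY` and `GOOGLE_CSE_ID` must be set via `.env`), and writes the score back under `google_accuracy`, which `calculate_statistical_summary` and `save_to_markdown` then read. A minimal sketch of that flow follows; the file names, questions, and URLs are made up for illustration, the import path assumes the same package layout the tests use for `finesse.google_search`, and `statistics.stdev` needs at least two entries:

```python
from finesse.accuracy_functions import (
    calculate_statistical_summary,
    update_dict_google_data,
)

# Hypothetical entries shaped like the ones finesse_test.py builds per question.
test_data = {
    "qna_sample_1.json": {
        "question": "How are imported honey products labelled?",
        "expected_page": {"url": "https://inspection.canada.ca/sample-page-1"},
        "top": 100,       # number of results requested from the engine under test
        "accuracy": 0.9,  # score from the engine under test
        "time": 140.0,    # request time in ms
    },
    "qna_sample_2.json": {
        "question": "What is a phytosanitary certificate?",
        "expected_page": {"url": "https://inspection.canada.ca/sample-page-2"},
        "top": 100,
        "accuracy": 0.0,
        "time": 230.0,
    },
}

# Performs one live Google query per entry and adds a "google_accuracy" key.
update_dict_google_data(test_data)

# The summary now unpacks into three dicts: time, accuracy, and Google accuracy.
time_stats, accuracy_stats, google_stats = calculate_statistical_summary(test_data)
print(google_stats["Mean"])
```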
diff --git a/finesse/finesse_test.py b/finesse/finesse_test.py
index f5467c1..3d783e3 100644
--- a/finesse/finesse_test.py
+++ b/finesse/finesse_test.py
@@ -2,9 +2,8 @@
 from jsonreader import JSONReader
 import os
 import json
-from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy
+from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy, update_dict_google_data
 from host import is_host_up
-from google_search import get_google_search_urls
 
 class NoTestDataError(Exception):
     """Raised when all requests have failed and there is no test data"""
@@ -67,8 +66,6 @@ def search_accuracy(self):
             for page in response_pages:
                 response_url.append(page.get("url"))
             accuracy_result = calculate_accuracy(response_url, expected_url)
-            google_response_url = get_google_search_urls(question)
-            google_accuracy_result = calculate_accuracy(google_response_url, expected_url)
             time_taken = round(response.elapsed.microseconds/1000,3)
 
             expected_page = json_data.copy()
@@ -81,8 +78,8 @@
                 "position": accuracy_result.position,
                 "total_pages": accuracy_result.total_pages,
                 "accuracy": accuracy_result.score,
-                "google_accuracy": google_accuracy_result,
                 "time": time_taken,
+                "top": self.top
             }
 
     def on_start(self):
@@ -93,6 +90,10 @@ def on_stop(self):
         if not self.qna_results:
             raise NoTestDataError
 
+        print("Search accuracy test completed")
+        print("Starting Google search test")
+
+        update_dict_google_data(self.qna_results)
         log_data(self.qna_results)
         if self.format == "md":
             save_to_markdown(self.qna_results, self.engine)
diff --git a/finesse/google_search.py b/finesse/google_search.py
index 3abd404..05271bf 100644
--- a/finesse/google_search.py
+++ b/finesse/google_search.py
@@ -1,8 +1,15 @@
-from googlesearch import search
+from googleapiclient.discovery import build
+from dotenv import load_dotenv
+import os
 
-def get_google_search_urls(query: str, num_results: int = 100) -> list[str]:
+def google_search(search_term, api_key, cse_id, **kwargs):
+    service = build("customsearch", "v1", developerKey=api_key)
+    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
+    return res['items']
+
+def search_google_urls(query: str, num_results: int = 100) -> list[str]:
     """
-    Retrieves a list of Google search result URLs for the given query.
+    Retrieves a list of Google search result URLs for the given query using the Google Custom Search API.
 
     Args:
         query (str): The search query.
@@ -10,9 +17,15 @@ def get_google_search_urls(query: str, num_results: int = 100) -> list[str]:
 
     Returns:
         list[str]: A list of URLs representing the search results.
+
+    Raises:
+        Exception: If the request limit is exceeded (error 429 Too Many Requests).
     """
-    num_results -= 2 # 2 extra urls are added by googlesearch library
+    load_dotenv()
     links = []
-    for url in search(query, num_results, sleep_interval=1):
-        links.append(url)
+    api_key = os.getenv("GOOGLE_API_KEY")
+    cse_id = os.getenv("GOOGLE_CSE_ID")
+    results = google_search(query, api_key, cse_id, start=11)
+    for item in results:
+        links.append(item['link'])
     return links
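One behavioural note on `search_google_urls`: a single Custom Search JSON API request returns at most 10 items, and `start=11` asks only for the second page of results, so `num_results` is currently not honoured (the old googlesearch-based helper returned up to `num_results` URLs, and the updated unit test still expects that). If full paging is wanted, a rough sketch is shown below. It is not part of the diff above; the helper name `search_google_urls_paged` is only for illustration, it builds the service directly so it can fall back to an empty list when a page has no `items`, and it assumes the documented limits of 10 results per request and 100 per query:

```python
import os

from dotenv import load_dotenv
from googleapiclient.discovery import build


def search_google_urls_paged(query: str, num_results: int = 100) -> list[str]:
    """Collect up to num_results URLs by paging the Custom Search API, 10 per request."""
    load_dotenv()
    service = build("customsearch", "v1", developerKey=os.getenv("GOOGLE_API_KEY"))
    cse_id = os.getenv("GOOGLE_CSE_ID")

    links: list[str] = []
    start = 1  # 1-based result offset; the API serves at most 100 results per query
    while len(links) < num_results and start <= 91:
        res = service.cse().list(
            q=query,
            cx=cse_id,
            num=min(10, num_results - len(links)),
            start=start,
        ).execute()
        items = res.get("items", [])  # "items" is absent when there are no more results
        if not items:
            break
        links.extend(item["link"] for item in items)
        start += len(items)
    return links
```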
diff --git a/requirements.txt b/requirements.txt
index 62475fc..8acc2a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 locust
 regex
-googlesearch-python
+google-api-python-client
+python-dotenv
diff --git a/tests/test_google_search.py b/tests/test_google_search.py
index 901d99d..0285042 100644
--- a/tests/test_google_search.py
+++ b/tests/test_google_search.py
@@ -1,11 +1,11 @@
 import unittest
-from finesse.google_search import get_google_search_urls
+from finesse.google_search import search_google_urls
 
 
 class TestGoogleSearch(unittest.TestCase):
     def test_get_google_search_urls(self):
         query = "Canada Food Inspection Agency"
-        num_results = 10
-        urls = get_google_search_urls(query, num_results)
+        num_results = 100
+        urls = search_google_urls(query, num_results)
         self.assertEqual(len(urls), num_results)
         self.assertTrue(all(url.startswith("http") for url in urls))
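Two caveats on the updated test: it issues a live Custom Search request (consuming daily quota), and because one request returns at most 10 items while `start=11` skips the first page, asserting `len(urls) == 100` is unlikely to pass as written. For a deterministic unit test, the API call can be stubbed out. The sketch below is only an illustration, not part of the diff: the test class name and example.com URLs are made up, and it assumes `unittest.mock.patch` targeting `finesse.google_search.google_search` (the module-level helper that `search_google_urls` calls):

```python
import unittest
from unittest.mock import patch

from finesse.google_search import search_google_urls


class TestGoogleSearchStubbed(unittest.TestCase):
    @patch("finesse.google_search.google_search")
    def test_search_google_urls_returns_links(self, mock_google_search):
        # search_google_urls only reads the "link" field of each Custom Search item.
        mock_google_search.return_value = [
            {"link": f"https://example.com/page-{i}"} for i in range(10)
        ]

        urls = search_google_urls("Canada Food Inspection Agency")

        mock_google_search.assert_called_once()
        self.assertEqual(urls, [f"https://example.com/page-{i}" for i in range(10)])
        self.assertTrue(all(url.startswith("http") for url in urls))


if __name__ == "__main__":
    unittest.main()
```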