Skip to content

Commit

Permalink
Issue #6: Google API incorporation
Browse files Browse the repository at this point in the history
  • Loading branch information
ibrahim-kabir committed Apr 2, 2024
1 parent bf895a3 commit 232208a
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 25 deletions.
2 changes: 2 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
GOOGLE_API_KEY =
GOOGLE_CSE_ID =
47 changes: 37 additions & 10 deletions finesse/accuracy_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
from collections import namedtuple
import regex as re
from finesse.google_search import search_google_urls

OUTPUT_FOLDER = "./finesse/output"
AccuracyResult = namedtuple("AccuracyResult", ["position", "total_pages", "score"])
Expand Down Expand Up @@ -34,22 +35,22 @@ def save_to_markdown(test_data: dict, engine: str):
with open(output_file, "w") as md_file:
md_file.write(f"# Test on the {engine} search engine: {date_string}\n\n")
md_file.write("## Test data table\n\n")
md_file.write("| 📄 File | 💬 Question | 📏 Accuracy Score | ⌛ Time |\n")
md_file.write("| 📄 File | 💬 Question | 📏 Accuracy Score | 🌐 Google Score |⌛ Time |\n")
md_file.write("|--------------------|-------------------------------------------------------------------------------------------------------------------------|----------------|----------|\n")
for key, value in test_data.items():
md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100}% | {int(value.get('time'))}ms |\n")
md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100}% | {value.get('google_accuracy')*100}% |{int(value.get('time'))}ms |\n")
md_file.write("\n")
md_file.write(f"Tested on {len(test_data)} files.\n\n")

time_stats, accuracy_stats = calculate_statistical_summary(test_data)
time_stats, accuracy_stats, google_stats = calculate_statistical_summary(test_data)
md_file.write("## Statistical summary\n\n")
md_file.write("| Statistic | Time | Accuracy score|\n")
md_file.write("| Statistic | Time | 📏 Accuracy score| 🌐 Google Score |\n")
md_file.write("|-----------------------|------------|---------|\n")
md_file.write(f"|Mean| {int(time_stats.get('Mean'))}ms | {int(accuracy_stats.get('Mean')*100)}% |\n")
md_file.write(f"|Median| {int(time_stats.get('Median'))}ms | {int(accuracy_stats.get('Median')*100)}% |\n")
md_file.write(f"|Standard Deviation| {int(time_stats.get('Standard Deviation'))}ms | {int(accuracy_stats.get('Standard Deviation')*100)}% |\n")
md_file.write(f"|Maximum| {int(time_stats.get('Maximum'))}ms | {int(accuracy_stats.get('Maximum')*100)}% |\n")
md_file.write(f"|Minimum| {int(time_stats.get('Minimum'))}ms | {int(accuracy_stats.get('Minimum')*100)}% |\n")
md_file.write(f"|Mean| {int(time_stats.get('Mean'))}ms | {int(accuracy_stats.get('Mean')*100)}% |{int(google_stats.get('Mean')*100)}% |\n")
md_file.write(f"|Median| {int(time_stats.get('Median'))}ms | {int(accuracy_stats.get('Median')*100)}% | {int(google_stats.get('Median')*100)}% |\n")
md_file.write(f"|Standard Deviation| {int(time_stats.get('Standard Deviation'))}ms | {int(accuracy_stats.get('Standard Deviation')*100)}% | {int(google_stats.get('Standard Deviation')*100)}% |\n")
md_file.write(f"|Maximum| {int(time_stats.get('Maximum'))}ms | {int(accuracy_stats.get('Maximum')*100)}% | {int(google_stats.get('Maximum')*100)}% |\n")
md_file.write(f"|Minimum| {int(time_stats.get('Minimum'))}ms | {int(accuracy_stats.get('Minimum')*100)}% | {int(google_stats.get('Minimum')*100)}% |\n")
md_file.write(f"\nThere are a total of {len([result.get('accuracy') for result in test_data.values() if result.get('accuracy') == 0])} null scores\n")

def save_to_csv(test_data: dict, engine: str):
Expand Down Expand Up @@ -101,6 +102,7 @@ def log_data(test_data: dict):
def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict, dict]:
    """Compute summary statistics for the response times, accuracy scores and
    Google accuracy scores contained in ``test_data``.

    Args:
        test_data (dict): Maps test keys to result dicts carrying at least the
            ``time``, ``accuracy`` and ``google_accuracy`` entries.

    Returns:
        tuple[dict, dict, dict]: ``(time_stats, accuracy_stats, google_stats)``,
        each mapping "Mean", "Median", "Standard Deviation", "Maximum" and
        "Minimum" to rounded values (times to 3 decimals, scores to 2).
    """
    def summarize(values: list, digits: int) -> dict:
        # statistics.stdev needs at least two samples; report 0.0 for a
        # single test entry instead of raising StatisticsError.
        spread = statistics.stdev(values) if len(values) > 1 else 0.0
        return {
            "Mean": round(statistics.mean(values), digits),
            "Median": round(statistics.median(values), digits),
            "Standard Deviation": round(spread, digits),
            "Maximum": round(max(values), digits),
            "Minimum": round(min(values), digits),
        }

    times = [result.get("time") for result in test_data.values()]
    accuracies = [result.get("accuracy") for result in test_data.values()]
    google = [result.get("google_accuracy") for result in test_data.values()]
    return summarize(times, 3), summarize(accuracies, 2), summarize(google, 2)

def update_dict_google_data(test_data: dict):
    """Augment each entry of ``test_data`` with its Google accuracy score.

    For every result, the entry's question is submitted to
    ``search_google_urls`` (limited to the entry's ``top`` count) and the
    returned URLs are scored against the expected page URL with
    ``calculate_accuracy``. The score is stored in place under the
    ``google_accuracy`` key.

    Args:
        test_data (dict): The dictionary containing the test data; mutated
            in place.
    """
    for count, value in enumerate(test_data.values(), start=1):
        question = value.get("question")
        expected_url = value.get("expected_page").get("url")
        google_urls = search_google_urls(question, value.get("top"))
        value["google_accuracy"] = calculate_accuracy(google_urls, expected_url).score
        # Progress feedback: the Google queries are slow (rate limited).
        print(f"{count} file(s) done")
11 changes: 6 additions & 5 deletions finesse/finesse_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
from jsonreader import JSONReader
import os
import json
from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy
from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy, update_dict_google_data
from host import is_host_up
from google_search import get_google_search_urls

class NoTestDataError(Exception):
    """Raised when all requests have failed, leaving no accumulated test data
    to score or summarize."""
Expand Down Expand Up @@ -67,8 +66,6 @@ def search_accuracy(self):
for page in response_pages:
response_url.append(page.get("url"))
accuracy_result = calculate_accuracy(response_url, expected_url)
google_response_url = get_google_search_urls(question)
google_accuracy_result = calculate_accuracy(google_response_url, expected_url)
time_taken = round(response.elapsed.microseconds/1000,3)

expected_page = json_data.copy()
Expand All @@ -81,8 +78,8 @@ def search_accuracy(self):
"position": accuracy_result.position,
"total_pages": accuracy_result.total_pages,
"accuracy": accuracy_result.score,
"google_accuracy": google_accuracy_result,
"time": time_taken,
"top": self.top
}

def on_start(self):
Expand All @@ -93,6 +90,10 @@ def on_stop(self):
if not self.qna_results:
raise NoTestDataError

print("Search accuracy test completed")
print("Starting google search test")

update_dict_google_data(self.qna_results)
log_data(self.qna_results)
if self.format == "md":
save_to_markdown(self.qna_results, self.engine)
Expand Down
25 changes: 19 additions & 6 deletions finesse/google_search.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,31 @@
from googlesearch import search
from googleapiclient.discovery import build
from dotenv import load_dotenv
import os

def get_google_search_urls(query: str, num_results: int = 100) -> list[str]:
def google_search(search_term, api_key, cse_id, **kwargs):
    """Run one query against the Google Custom Search JSON API.

    Args:
        search_term (str): The query string.
        api_key (str): Google API key used to build the service client.
        cse_id (str): Custom Search Engine identifier (the ``cx`` parameter).
        **kwargs: Extra parameters forwarded to ``cse().list`` (e.g.
            ``start``, ``num``).

    Returns:
        list: The ``items`` entries of the API response; empty when the query
        yields no results.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    # The response omits the "items" key entirely when there are zero
    # results, so a plain res['items'] would raise KeyError.
    return res.get('items', [])

def search_google_urls(query: str, num_results: int = 100) -> list[str]:
    """
    Retrieves a list of Google search result URLs for the given query using
    the Google Custom Search JSON API.

    Args:
        query (str): The search query.
        num_results (int, optional): The number of search results to
            retrieve. Defaults to 100 (the API's maximum window). The API
            serves at most 10 results per request, so the query is paginated.
    Returns:
        list[str]: A list of URLs representing the search results.
    Raises:
        Exception: If the request limit is exceeded
            (error 429 Too Many Requests).
    """
    load_dotenv()
    api_key = os.getenv("GOOGLE_API_KEY")
    cse_id = os.getenv("GOOGLE_CSE_ID")
    links: list[str] = []
    # The Custom Search API returns at most 10 items per call and caps the
    # result window at 100; page through with the 1-based `start` parameter.
    start = 1
    while len(links) < num_results and start <= 91:
        page_size = min(10, num_results - len(links))
        items = google_search(query, api_key, cse_id, start=start, num=page_size)
        if not items:
            break
        links.extend(item['link'] for item in items)
        start += len(items)
    return links
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
locust
regex
googlesearch-python
google-api-python-client
python-dotenv
6 changes: 3 additions & 3 deletions tests/test_google_search.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import unittest
from finesse.google_search import get_google_search_urls
from finesse.google_search import search_google_urls

class TestGoogleSearch(unittest.TestCase):
    """Integration test for ``finesse.google_search.search_google_urls``.

    NOTE(review): this hits the live Custom Search API and requires
    GOOGLE_API_KEY and GOOGLE_CSE_ID to be set in the environment.
    """

    def test_search_google_urls(self):
        query = "Canada Food Inspection Agency"
        num_results = 100
        urls = search_google_urls(query, num_results)
        # Exactly num_results links, all of them absolute http(s) URLs.
        self.assertEqual(len(urls), num_results)
        self.assertTrue(all(url.startswith("http") for url in urls))

Expand Down

0 comments on commit 232208a

Please sign in to comment.