Skip to content

Commit

Permalink
Issue #6: Google API incorporation
Browse files Browse the repository at this point in the history
  • Loading branch information
ibrahim-kabir committed Apr 2, 2024
1 parent bf895a3 commit 232208a
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 25 deletions.
2 changes: 2 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
GOOGLE_API_KEY =
GOOGLE_CSE_ID =
47 changes: 37 additions & 10 deletions finesse/accuracy_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
from collections import namedtuple
import regex as re
from finesse.google_search import search_google_urls

OUTPUT_FOLDER = "./finesse/output"
AccuracyResult = namedtuple("AccuracyResult", ["position", "total_pages", "score"])
Expand Down Expand Up @@ -34,22 +35,22 @@ def save_to_markdown(test_data: dict, engine: str):
with open(output_file, "w") as md_file:
md_file.write(f"# Test on the {engine} search engine: {date_string}\n\n")
md_file.write("## Test data table\n\n")
md_file.write("| 📄 File | 💬 Question | 📏 Accuracy Score | ⌛ Time |\n")
md_file.write("| 📄 File | 💬 Question | 📏 Accuracy Score | 🌐 Google Score |⌛ Time |\n")
md_file.write("|--------------------|-------------------------------------------------------------------------------------------------------------------------|----------------|----------|\n")
for key, value in test_data.items():
md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100}% | {int(value.get('time'))}ms |\n")
md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100}% | {value.get('google_accuracy')*100}% |{int(value.get('time'))}ms |\n")
md_file.write("\n")
md_file.write(f"Tested on {len(test_data)} files.\n\n")

time_stats, accuracy_stats = calculate_statistical_summary(test_data)
time_stats, accuracy_stats, google_stats = calculate_statistical_summary(test_data)
md_file.write("## Statistical summary\n\n")
md_file.write("| Statistic | Time | Accuracy score|\n")
md_file.write("| Statistic | Time | 📏 Accuracy score| 🌐 Google Score |\n")
md_file.write("|-----------------------|------------|---------|\n")
md_file.write(f"|Mean| {int(time_stats.get('Mean'))}ms | {int(accuracy_stats.get('Mean')*100)}% |\n")
md_file.write(f"|Median| {int(time_stats.get('Median'))}ms | {int(accuracy_stats.get('Median')*100)}% |\n")
md_file.write(f"|Standard Deviation| {int(time_stats.get('Standard Deviation'))}ms | {int(accuracy_stats.get('Standard Deviation')*100)}% |\n")
md_file.write(f"|Maximum| {int(time_stats.get('Maximum'))}ms | {int(accuracy_stats.get('Maximum')*100)}% |\n")
md_file.write(f"|Minimum| {int(time_stats.get('Minimum'))}ms | {int(accuracy_stats.get('Minimum')*100)}% |\n")
md_file.write(f"|Mean| {int(time_stats.get('Mean'))}ms | {int(accuracy_stats.get('Mean')*100)}% |{int(google_stats.get('Mean')*100)}% |\n")
md_file.write(f"|Median| {int(time_stats.get('Median'))}ms | {int(accuracy_stats.get('Median')*100)}% | {int(google_stats.get('Median')*100)}% |\n")
md_file.write(f"|Standard Deviation| {int(time_stats.get('Standard Deviation'))}ms | {int(accuracy_stats.get('Standard Deviation')*100)}% | {int(google_stats.get('Standard Deviation')*100)}% |\n")
md_file.write(f"|Maximum| {int(time_stats.get('Maximum'))}ms | {int(accuracy_stats.get('Maximum')*100)}% | {int(google_stats.get('Maximum')*100)}% |\n")
md_file.write(f"|Minimum| {int(time_stats.get('Minimum'))}ms | {int(accuracy_stats.get('Minimum')*100)}% | {int(google_stats.get('Minimum')*100)}% |\n")
md_file.write(f"\nThere are a total of {len([result.get('accuracy') for result in test_data.values() if result.get('accuracy') == 0])} null scores\n")

def save_to_csv(test_data: dict, engine: str):
Expand Down Expand Up @@ -101,6 +102,7 @@ def log_data(test_data: dict):
def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict, dict]:
    """Compute summary statistics for the response times, accuracy scores and
    Google accuracy scores contained in ``test_data``.

    Args:
        test_data (dict): Maps test keys to result dicts carrying at least the
            ``time``, ``accuracy`` and ``google_accuracy`` entries.

    Returns:
        tuple[dict, dict, dict]: ``(time_stats, accuracy_stats, google_stats)``,
        each mapping "Mean", "Median", "Standard Deviation", "Maximum" and
        "Minimum" to rounded values (times to 3 decimals, scores to 2).
    """
    def summarize(values: list, digits: int) -> dict:
        # statistics.stdev needs at least two samples; report 0.0 for a
        # single test entry instead of raising StatisticsError.
        spread = statistics.stdev(values) if len(values) > 1 else 0.0
        return {
            "Mean": round(statistics.mean(values), digits),
            "Median": round(statistics.median(values), digits),
            "Standard Deviation": round(spread, digits),
            "Maximum": round(max(values), digits),
            "Minimum": round(min(values), digits),
        }

    times = [result.get("time") for result in test_data.values()]
    accuracies = [result.get("accuracy") for result in test_data.values()]
    google = [result.get("google_accuracy") for result in test_data.values()]
    return summarize(times, 3), summarize(accuracies, 2), summarize(google, 2)

def update_dict_google_data(test_data: dict):
    """Augment each entry of ``test_data`` with its Google accuracy score.

    For every result, the entry's question is submitted to
    ``search_google_urls`` (limited to the entry's ``top`` count) and the
    returned URLs are scored against the expected page URL with
    ``calculate_accuracy``. The score is stored in place under the
    ``google_accuracy`` key.

    Args:
        test_data (dict): The dictionary containing the test data; mutated
            in place.
    """
    for count, value in enumerate(test_data.values(), start=1):
        question = value.get("question")
        expected_url = value.get("expected_page").get("url")
        google_urls = search_google_urls(question, value.get("top"))
        value["google_accuracy"] = calculate_accuracy(google_urls, expected_url).score
        # Progress feedback: the Google queries are slow (rate limited).
        print(f"{count} file(s) done")
11 changes: 6 additions & 5 deletions finesse/finesse_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
from jsonreader import JSONReader
import os
import json
from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy
from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy, update_dict_google_data
from host import is_host_up
from google_search import get_google_search_urls

class NoTestDataError(Exception):
    """Raised when all requests have failed, leaving no accumulated test data
    to score or summarize."""
Expand Down Expand Up @@ -67,8 +66,6 @@ def search_accuracy(self):
for page in response_pages:
response_url.append(page.get("url"))
accuracy_result = calculate_accuracy(response_url, expected_url)
google_response_url = get_google_search_urls(question)
google_accuracy_result = calculate_accuracy(google_response_url, expected_url)
time_taken = round(response.elapsed.microseconds/1000,3)

expected_page = json_data.copy()
Expand All @@ -81,8 +78,8 @@ def search_accuracy(self):
"position": accuracy_result.position,
"total_pages": accuracy_result.total_pages,
"accuracy": accuracy_result.score,
"google_accuracy": google_accuracy_result,
"time": time_taken,
"top": self.top
}

def on_start(self):
Expand All @@ -93,6 +90,10 @@ def on_stop(self):
if not self.qna_results:
raise NoTestDataError

print("Search accuracy test completed")
print("Starting google search test")

update_dict_google_data(self.qna_results)
log_data(self.qna_results)
if self.format == "md":
save_to_markdown(self.qna_results, self.engine)
Expand Down
25 changes: 19 additions & 6 deletions finesse/google_search.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,31 @@
from googlesearch import search
from googleapiclient.discovery import build
from dotenv import load_dotenv
import os

def get_google_search_urls(query: str, num_results: int = 100) -> list[str]:
def google_search(search_term, api_key, cse_id, **kwargs):
    """Run one query against the Google Custom Search JSON API.

    Args:
        search_term (str): The query string.
        api_key (str): Google API key used to build the service client.
        cse_id (str): Custom Search Engine identifier (the ``cx`` parameter).
        **kwargs: Extra parameters forwarded to ``cse().list`` (e.g.
            ``start``, ``num``).

    Returns:
        list: The ``items`` entries of the API response; empty when the query
        yields no results.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    # The response omits the "items" key entirely when there are zero
    # results, so a plain res['items'] would raise KeyError.
    return res.get('items', [])

def search_google_urls(query: str, num_results: int = 100) -> list[str]:
    """
    Retrieves a list of Google search result URLs for the given query using
    the Google Custom Search JSON API.

    Args:
        query (str): The search query.
        num_results (int, optional): The number of search results to
            retrieve. Defaults to 100 (the API's maximum window). The API
            serves at most 10 results per request, so the query is paginated.
    Returns:
        list[str]: A list of URLs representing the search results.
    Raises:
        Exception: If the request limit is exceeded
            (error 429 Too Many Requests).
    """
    load_dotenv()
    api_key = os.getenv("GOOGLE_API_KEY")
    cse_id = os.getenv("GOOGLE_CSE_ID")
    links: list[str] = []
    # The Custom Search API returns at most 10 items per call and caps the
    # result window at 100; page through with the 1-based `start` parameter.
    start = 1
    while len(links) < num_results and start <= 91:
        page_size = min(10, num_results - len(links))
        items = google_search(query, api_key, cse_id, start=start, num=page_size)
        if not items:
            break
        links.extend(item['link'] for item in items)
        start += len(items)
    return links
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
locust
regex
googlesearch-python
google-api-python-client
python-dotenv
6 changes: 3 additions & 3 deletions tests/test_google_search.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import unittest
from finesse.google_search import get_google_search_urls
from finesse.google_search import search_google_urls

class TestGoogleSearch(unittest.TestCase):
    """Integration test for ``finesse.google_search.search_google_urls``.

    NOTE(review): this hits the live Custom Search API and requires
    GOOGLE_API_KEY and GOOGLE_CSE_ID to be set in the environment.
    """

    def test_search_google_urls(self):
        query = "Canada Food Inspection Agency"
        num_results = 100
        urls = search_google_urls(query, num_results)
        # Exactly num_results links, all of them absolute http(s) URLs.
        self.assertEqual(len(urls), num_results)
        self.assertTrue(all(url.startswith("http") for url in urls))

Expand Down

0 comments on commit 232208a

Please sign in to comment.