-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #9 from ai-cfia/6-compare-finesses-score-against-googles
Incorporate Public Search Engine Comparison
- Loading branch information
Showing
11 changed files
with
420 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
BING_SEARCH_KEY = | ||
BING_ENDPOINT = |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from azure.cognitiveservices.search.websearch import WebSearchClient | ||
from msrest.authentication import CognitiveServicesCredentials | ||
import time | ||
import statistics | ||
class BingSearch():
    """
    A class for performing web searches using the Bing Search API.

    The client is created once in ``__init__`` and reused by ``search_urls``,
    which pages through the results until enough URLs have been collected.
    """

    # Number of results requested per API call.
    _PAGE_SIZE = 50

    def __init__(self, endpoint, subscription_key):
        """
        Create the underlying ``WebSearchClient``.

        Args:
            endpoint: The Bing Search endpoint passed to ``WebSearchClient``.
            subscription_key: The key wrapped in ``CognitiveServicesCredentials``.
        """
        self.endpoint = endpoint
        self.subscription_key = subscription_key
        self.client = WebSearchClient(endpoint=self.endpoint, credentials=CognitiveServicesCredentials(self.subscription_key))
        self.client.config.base_url = '{Endpoint}/v7.0'  # Temporary change to fix the error. Issue opened https://github.com/Azure/azure-sdk-for-python/issues/34917

    def search_urls(self, query: str, num_results: int = 100) -> tuple[list[str], float]:
        """
        Search for URLs using the Bing Search API.

        Args:
            query (str): The search query.
            num_results (int, optional): The number of results to retrieve. Defaults to 100.

        Returns:
            tuple[list[str], float]: A tuple containing a list of at most
            ``num_results`` URLs and the average elapsed time per API call, in
            milliseconds. When no call is made (``num_results <= 0``) the
            result is ``([], 0.0)``.
        """
        urls = []
        elapsed_time = []
        offset = 0
        # Limit of 50 results per query and Bing Search return less than 50 web results,
        # so keep paging (advancing the offset) until enough URLs are collected.
        while len(urls) < num_results:
            start_time = time.time()
            web_data = self.client.web.search(query=query, market="en-ca", count=self._PAGE_SIZE, response_filter=["Webpages"], offset=offset)
            elapsed_time.append(time.time() - start_time)

            web_pages = getattr(web_data, 'web_pages', None)
            page_items = getattr(web_pages, 'value', None) or []
            page_urls = [item.url for item in page_items]
            if not page_urls:
                # No (more) web pages: stop instead of re-requesting the same
                # offset forever.
                break

            urls.extend(page_urls)
            offset += len(page_urls)

        if not elapsed_time:
            # The loop never ran, so there is nothing to average.
            return [], 0.0
        return urls[:num_results], statistics.mean(elapsed_time) * 1000
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import openpyxl | ||
import os | ||
import json | ||
import argparse | ||
|
||
# Command-line interface: where to read the workbook from and where to write
# the generated JSON files.
parser = argparse.ArgumentParser(description='XLSX Converter to JSON')
parser.add_argument('--input-folder', dest='input_folder', default='finesse/scripts/input/', help='Path to the input folder')
parser.add_argument('--output-folder', dest='output_folder', default='finesse/scripts/output/', help='Path to the output folder')
# The file name has no sensible default, so make argparse report its absence
# instead of failing later inside os.path.join.
parser.add_argument('--file-name', dest='file_name', required=True, help='Name of the input file')
parser.add_argument('--sheet-name', dest='sheet_name', default='To fill', help='Name of the worksheet')

args = parser.parse_args()

INPUT_FOLDER = args.input_folder
OUTPUT_FOLDER = args.output_folder
FILE_NAME = args.file_name
SHEET_NAME = args.sheet_name
FILE_PATH = os.path.join(INPUT_FOLDER, FILE_NAME)

workbook = openpyxl.load_workbook(FILE_PATH)
# Honour --sheet-name when that sheet exists; otherwise fall back to the
# active sheet.
if SHEET_NAME in workbook.sheetnames:
    worksheet = workbook[SHEET_NAME]
else:
    worksheet = workbook.active
count = 1

# Create the output folder once, up front, rather than checking on every row.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Rows are read starting at row 5; column 2 holds the question and column 3
# the answer.
for row in range(5, worksheet.max_row + 1):
    question = worksheet.cell(row=row, column=2).value
    if question is None:
        # Rows without a question produce no output file.
        continue

    answer = worksheet.cell(row=row, column=3).value

    # Columns 5 to 9 hold the source titles; a cell's hyperlink (if any)
    # supplies the matching URL.
    titles = []
    links = []
    for col in range(5, 10):
        cell = worksheet.cell(row=row, column=col)
        title = cell.value
        link = cell.hyperlink
        if title:
            titles.append(title)
        if link:
            links.append(link.target)

    # A single title/URL is stored as a scalar, several as a list, and none
    # as an empty string.
    data = {
        'question': question or "",
        'answer': answer or "",
        'title': titles[0] if len(titles) == 1 else titles or "",
        'url': links[0] if len(links) == 1 else links or ""
    }

    # Write the JSON file; files are numbered in the order questions appear.
    output_file = os.path.join(OUTPUT_FOLDER, f'question_{count}.json')
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)
    count += 1

print("Conversion completed successfully!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,7 @@ | ||
locust | ||
regex | ||
python-dotenv | ||
azure-cognitiveservices-search-websearch | ||
msrest | ||
openpyxl | ||
natsort |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import unittest | ||
from finesse.bing_search import BingSearch | ||
from dotenv import load_dotenv | ||
import os | ||
class TestBingSearch(unittest.TestCase):
    """Test for ``BingSearch.search_urls`` using credentials from the environment."""

    def test_search_urls(self):
        """
        Run one search and check the number, shape and timing type of the results.

        The endpoint and key are read from ``BING_ENDPOINT`` and
        ``BING_SEARCH_KEY`` (optionally loaded from a ``.env`` file). The test
        is skipped when either is missing, since the search cannot be
        performed without them.
        """
        load_dotenv()
        endpoint = os.getenv("BING_ENDPOINT")
        subscription_key = os.getenv("BING_SEARCH_KEY")
        if not endpoint or not subscription_key:
            self.skipTest("BING_ENDPOINT and BING_SEARCH_KEY must be set to run this test")
        bing_search = BingSearch(endpoint, subscription_key)

        query = "Canadian Food Inspection Agency"
        num_results = 100

        urls, elapsed_time = bing_search.search_urls(query, num_results)

        self.assertEqual(len(urls), num_results)
        self.assertTrue(all(url.startswith("http") for url in urls))
        self.assertIsInstance(elapsed_time, float)


if __name__ == "__main__":
    unittest.main()