diff --git a/finesse/bing_search.py b/finesse/bing_search.py
new file mode 100644
index 0000000..1876709
--- /dev/null
+++ b/finesse/bing_search.py
@@ -0,0 +1,87 @@
+import random
+
+import requests
+
+# requests applies no timeout by default, so a stalled server would block
+# the caller forever; every request below is bounded by this many seconds.
+REQUEST_TIMEOUT = 10
+
+USERAGENT_LIST = (
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
+)
+
+
+def get_bing_search_urls(query: str, num_results: int = 100) -> list[str]:
+    """
+    Requests a search results page for the query and returns its URL.
+
+    NOTE(review): despite the function name, the request is sent to Google's
+    search endpoint, and the returned list holds only the final URL of the
+    results page rather than the individual result links. Confirm the
+    intended engine and add result parsing before relying on the list length.
+
+    Args:
+        query (str): The search terms. Passed through `params` so that
+            characters such as `&`, `#` and `+` are percent-encoded.
+        num_results (int): Value sent as the `num` query parameter.
+
+    Returns:
+        list[str]: A single-element list with the final URL of the response.
+
+    Raises:
+        requests.exceptions.HTTPError: If the cookie request or the search
+            request returns a status code other than 200.
+        requests.exceptions.RequestException: If a request fails or times out.
+    """
+    headers = {'User-Agent': get_useragent()}
+    cookies = get_cookies()
+    res = requests.get(
+        'https://www.google.com/search',
+        params={'q': query, 'num': num_results},
+        headers=headers,
+        cookies=cookies,
+        timeout=REQUEST_TIMEOUT,
+    )
+    if res.status_code != 200:
+        # Pass a message plus the response: positional (status, url) arguments
+        # are interpreted as errno/strerror by the OSError base class.
+        raise requests.exceptions.HTTPError(
+            f'Search raised {res.status_code} for {res.url}', response=res
+        )
+    return [res.url]
+
+
+def get_cookies() -> dict:
+    """
+    Generates cookies to avoid getting blocked during search.
+
+    Returns:
+        dict: A dictionary containing the cookies.
+
+    Raises:
+        requests.exceptions.HTTPError: If the response status code is not 200.
+        requests.exceptions.RequestException: If the request itself fails.
+    """
+    trend_url = 'https://youtube.com'
+    response = requests.get(trend_url, timeout=REQUEST_TIMEOUT)
+    if response.status_code != 200:
+        raise requests.exceptions.HTTPError(
+            f'Cookies raised {response.status_code}', response=response
+        )
+    return response.cookies.get_dict()
+
+
+def get_useragent() -> str:
+    """
+    Picks a browser User-Agent string at random.
+
+    Returns:
+        str: One entry of USERAGENT_LIST.
+    """
+    return random.choice(USERAGENT_LIST)
diff --git a/finesse/google_search.py b/finesse/google_search.py
deleted file mode 100644
index eb9f5ed..0000000
--- a/finesse/google_search.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from googlesearch import search
-
-def get_google_search_urls(query: str, num_results: int = 100) -> list[str]:
-    links = []
-    for url in search(query, num_results, sleep_interval=1):
-        links.append(url)
-    return links
diff --git a/tests/test_google_search.py b/tests/test_bing_search.py
similarity index 65%
rename from tests/test_google_search.py
rename to tests/test_bing_search.py
index 901d99d..6342b8a 100644
--- a/tests/test_google_search.py
+++ b/tests/test_bing_search.py
@@ -1,10 +1,13 @@
 import unittest
-from finesse.google_search import get_google_search_urls
+from finesse.bing_search import get_bing_search_urls
 
-class TestGoogleSearch(unittest.TestCase):
-    def test_get_google_search_urls(self):
+class TestBingSearch(unittest.TestCase):
+    def test_get_bing_search_urls(self):
+        # NOTE(review): this performs live network requests, and the length
+        # assertion below only holds once get_bing_search_urls returns the
+        # individual result links instead of the results-page URL.
         query = "Canada Food Inspection Agency"
         num_results = 10
-        urls = get_google_search_urls(query, num_results)
+        urls = get_bing_search_urls(query, num_results)
         self.assertEqual(len(urls), num_results)
         self.assertTrue(all(url.startswith("http") for url in urls))