Skip to content

Commit

Permalink
issue #6: bing search by scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
ibrahim-kabir committed Mar 22, 2024
1 parent 699e5ca commit 7e5557b
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 10 deletions.
44 changes: 44 additions & 0 deletions finesse/bing_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import random
from urllib.parse import quote

import requests

def get_bing_search_urls(query: str, num_results: int = 100) -> list[str]:
    """Run a web search for *query* and return the URL of the results page.

    Args:
        query: Free-text search terms. The text is percent-encoded before it
            is placed in the query string, so characters such as ``&``,
            ``#`` or ``+`` stay part of the search terms instead of
            corrupting the URL.
        num_results: Value sent as the ``num`` query parameter.

    Returns:
        A one-element list holding the final URL of the response. The
        response body is not parsed, so individual result links are not
        extracted.

    Raises:
        requests.exceptions.HTTPError: If the search request does not
            return status 200.
        requests.exceptions.Timeout: If the server does not answer within
            the timeout.
    """
    # NOTE(review): despite the function name, the request is sent to
    # Google's search endpoint -- confirm which engine is intended.
    headers = {'User-Agent': get_useragent()}
    cookies = get_cookies()
    # safe='' also encodes '/', so the whole query is one opaque value.
    url = f"https://www.google.com/search?q={quote(query, safe='')}&num={num_results}"
    # Without a timeout a stalled connection would block the caller forever.
    res = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    if res.status_code != 200:
        # A single message (plus the response object) avoids OSError's
        # two-argument form, which would render as "[Errno <status>] <url>".
        raise requests.exceptions.HTTPError(
            f"Search request returned {res.status_code} for {res.url}",
            response=res,
        )
    return [res.url]

def get_cookies():
    """
    Generates cookies to avoid getting blocked during search.

    Requests ``https://youtube.com`` and returns the cookies of that
    response as a plain dictionary.

    Returns:
        dict: A dictionary containing the cookies.
    Raises:
        requests.exceptions.HTTPError: If the response status code is not 200.
        requests.exceptions.Timeout: If the server does not answer within
            the timeout.
    """
    trend_url = 'https://youtube.com'
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(trend_url, timeout=10)
    if response.status_code != 200:
        # Attach the response so callers can inspect it from the exception.
        raise requests.exceptions.HTTPError(
            f'Cookies raised {response.status_code}',
            response=response,
        )
    return response.cookies.get_dict()



def get_useragent():
    """Return one browser User-Agent string picked at random.

    The pool holds desktop Firefox, Chrome and Edge identifiers; each call
    draws from it independently with ``random.choice``.
    """
    candidates = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
    )
    picked = random.choice(candidates)
    return picked
7 changes: 0 additions & 7 deletions finesse/google_search.py

This file was deleted.

6 changes: 3 additions & 3 deletions tests/test_google_search.py → tests/test_bing_search.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import unittest
from finesse.google_search import get_google_search_urls
from finesse.bing_search import get_bing_search_urls

class TestGoogleSearch(unittest.TestCase):
class TestBingSearch(unittest.TestCase):
def test_get_google_search_urls(self):
query = "Canada Food Inspection Agency"
num_results = 10
urls = get_google_search_urls(query, num_results)
urls = get_bing_search_urls(query, num_results)
self.assertEqual(len(urls), num_results)
self.assertTrue(all(url.startswith("http") for url in urls))

Expand Down

0 comments on commit 7e5557b

Please sign in to comment.