google_data1.py
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup

def check_and_remove_social_media(url):
    """Return True if the URL should be kept, i.e. it does not point to a social media site."""
    blocked_domains = ('facebook.com', 'youtube.com', 'linkedin.com', 'twitter.com')
    netloc = urlparse(url).netloc
    for domain in blocked_domains:
        if netloc.endswith(domain):
            print(f"Skipping social media URL: {url}")
            return False
    return True

def search_links(query):
    """Run a Google search for the query and return the result URLs, minus social media links."""
    try:
        from googlesearch import search  # provided by the 'google' package
    except ImportError:
        print("No module named 'googlesearch' found")
        return []
    urls = []
    # Fetch the top 4 results; tld/num/stop/pause follow the 'google' package's search() API
    for url in search(query, tld="co.in", num=4, stop=4, pause=2):
        urls.append(url)
    # Drop links to social media sites
    return list(filter(check_and_remove_social_media, urls))

def scrape_p_tags(urls):
    """Download each URL and concatenate the text of all <p> tags."""
    all_text = ""
    for url in urls:
        try:
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')
            # Find all <p> tags and extract their text
            for p_tag in soup.find_all('p'):
                all_text += p_tag.get_text() + "\n"
        except Exception as e:
            print(f"Error scraping {url}: {e}")
    return all_text

def google_Data(query):  # main entry point
    """Search Google for the query and return the combined <p> text of the result pages."""
    urls = search_links(query)
    combined_text = scrape_p_tags(urls)
    return combined_text
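
A minimal usage sketch, assuming the module is run directly as a script; the query string below is only an illustrative example and not part of the original file:

# Usage example (illustrative; the query is a hypothetical placeholder)
if __name__ == "__main__":
    sample_query = "latest developments in renewable energy"
    text = google_Data(sample_query)
    print(text[:500])  # preview the first 500 characters of the scraped text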