-
Notifications
You must be signed in to change notification settings - Fork 0
/
images_links.py
40 lines (38 loc) · 1.92 KB
/
images_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import requests
from bs4 import BeautifulSoup
import scrapper
import sys
# This function get the links of images of rocket engines
def images_links():
engines = scrapper()[1]
print("Getting images links...")
paths = ()
wikipedia_pages = {}
c = 0
for engine in engines:
pic_url = ""
engine_name = engine[0]
goog_search = "https://www.google.com/search?sclient=psy-ab&client=ubuntu&hs=k5b&channel=fs&biw=1366&bih=648&noj=1&q=" + engine_name.replace(" ", "+") + "+rocket+engine+wikipedia"
r = requests.get(goog_search)
soup = BeautifulSoup(r.text, "html.parser")
search_result = soup.find_all("a")
for i in range(len(search_result)):
if "https://en.wikipedia.org/wiki/" in str(search_result[i]):
the_link = search_result[i]
break
if the_link:
wikipedia_link = str(the_link.get('href')).removeprefix("/url?q=").split("&")[0]
wikipedia_pages[engine_name] = wikipedia_link
r = requests.get(wikipedia_link)
soup = BeautifulSoup(r.text, "html.parser")
if soup.find("table", {"class" : "infobox"}):
if soup.find("table", {"class" : "infobox"}).find("td", {"class": "infobox-image"}):
if soup.find("table", {"class" : "infobox"}).find("td", {"class": "infobox-image"}).find("img").get("srcset"):
if len(soup.find("table", {"class" : "infobox"}).find("td", {"class": "infobox-image"}).find("img").get("srcset").split()) > 2 and "Aeon" not in engine_name:
pic_url = "https:" + str(soup.find("table", {"class" : "infobox"}).find("td", {"class": "infobox-image"}).find("img").get("srcset").split()[2])
c += 1
print(f"{int(100 * (c/len(engines)))}% done")
paths += (pic_url, )
print(f"100% done")
return paths, wikipedia_pages
sys.modules[__name__] = images_links