business_card.py
import csv
import time

from bs4 import BeautifulSoup
from selenium import webdriver
# Given a Behance search-results URL, collect the link to every project
# ("ad") cover on that page and print them as a list.

# Behance search results for "advertising business card"
url = "https://www.behance.net/search?tracking_source=typeahead_search_direct&search=advertising++business+card"

# Launch the Chrome browser
options = webdriver.ChromeOptions()
# Disable image loading to speed up page loads (set both the Blink flag and the
# content-settings preference; either one on its own is usually enough).
options.add_argument('--blink-settings=imagesEnabled=false')
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
options.add_argument('--headless=new')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
driver = webdriver.Chrome(options=options)
# Navigate to the search-results page
driver.get(url)
# Scroll to the bottom of the page repeatedly to load all content
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
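
# A bounded variant of the scroll loop above, packaged as a reusable helper.
# This is only a sketch: the max_scrolls cap and the pause default are
# assumptions (not in the original), added to guard against pages whose
# scroll height keeps growing indefinitely. It is defined here but not called.
def scroll_to_bottom(drv, pause=2.0, max_scrolls=50):
    last = drv.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        drv.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new = drv.execute_script("return document.body.scrollHeight")
        if new == last:
            break
        last = new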
# Get the HTML of the fully-loaded page
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
# Find all links to projects on the page
project_links = soup.find_all('a', {'class': "ProjectCoverNeue-coverLink-U39 js-project-cover-image-link js-project-link e2e-ProjectCoverNeue-link"})
# print(project_links)
image_link_list = []
# Loop through each project link and collect its URL
for link in project_links:
    project_url = link["href"]
    image_link_list.append(project_url)
print(image_link_list)
driver.quit()
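
# Optional: persist the collected links with the csv module imported above.
# A minimal sketch; the output filename "business_card_links.csv" is an
# assumption, not part of the original script.
with open("business_card_links.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["project_url"])
    for project_url in image_link_list:
        writer.writerow([project_url])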
# ad_url ="https://www.behance.net/search/projects?field=advertising&"
# ad_class = "ProjectCoverNeue-coverLink-U39 js-project-cover-image-link js-project-link e2e-ProjectCoverNeue-link"
# ad_link_type = "a"
# inner_type = "class"
# all_links_of_images(ad_url,ad_link_type,inner_type, ad_class)
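
# The commented-out call above refers to an all_links_of_images() helper that
# is not defined in this file. Below is a minimal sketch of what such a helper
# could look like, assembled from the logic above; the parameter names follow
# the commented call, but the body is an assumption, not the original code.
def all_links_of_images(url, link_type, inner_type, class_name):
    opts = webdriver.ChromeOptions()
    opts.add_argument('--headless=new')
    opts.add_argument('--blink-settings=imagesEnabled=false')
    drv = webdriver.Chrome(options=opts)
    try:
        drv.get(url)
        last = drv.execute_script("return document.body.scrollHeight")
        while True:
            drv.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new = drv.execute_script("return document.body.scrollHeight")
            if new == last:
                break
            last = new
        page = BeautifulSoup(drv.page_source, "html.parser")
        # e.g. link_type="a", inner_type="class", class_name=ad_class above
        return [el["href"] for el in page.find_all(link_type, {inner_type: class_name})]
    finally:
        drv.quit()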