-
Notifications
You must be signed in to change notification settings - Fork 0
/
appreciation_scrape_links.py
78 lines (58 loc) · 2.5 KB
/
appreciation_scrape_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
import requests
from bs4 import BeautifulSoup
import locale
from readFile import *
import pandas as pd
import numpy as np
#given a web page url, return a list of links to all the {ad_class_name} on that web page.
#usage: return a list of links of all ads on advertising page
def all_links_of_appreciations(url):
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
#num_appreciation = int(soup.findAll('td',{'class':'UserInfo-statColumn-NsR UserInfo-statValue-d3q'})[1].text)
# Launch the Chrome browser
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--headless=new')
options.add_argument('--blink-settings=imagesEnabled=false')
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options)
# driver = webdriver.Chrome()
# Navigate to a web page
requests.get(url)
driver.get(url)
time.sleep(6)
works = driver.find_elements(By.CSS_SELECTOR,'a.AppreciationCover-coverLink-x1o js-project-cover-image-link')
num_works= len(works)
print(num_works)
# # Scroll to the bottom of the page to load all content
# last_height = driver.execute_script("return document.body.scrollHeight")
# while True:
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# time.sleep(2)
# new_height = driver.execute_script("return document.body.scrollHeight")
# if new_height == last_height:
# break
# last_height = new_height
# # Get the HTML of the fully-loaded page
# html = driver.page_source
# driver.close()
# driver.quit()
# soup = BeautifulSoup(html, "html.parser")
# # Find all links to projects on the page
# project_links = soup.find_all(div_type, {inner_type: ad_class_name})
# # print(project_links)
# image_link_list = []
# # Loop through each project link and find the link to the project's image
# for link in project_links:
# project_url = link["href"]
# image_link_list.append(project_url)
# return image_link_list
ad_url ="https://www.behance.net/Vertexer/appreciated"
print(all_links_of_appreciations(ad_url))