# image_scrape.py
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
import requests
from bs4 import BeautifulSoup
import locale
from readFile import *
import pandas as pd
import numpy as np
from csv import writer
import json
import requests
from PIL import Image
from io import BytesIO
import json
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
def all_links_of_images(url: str):
    """Collect the ``src`` URLs of the project images found on a page.

    The page is opened in headless Chrome, scrolled until its height stops
    growing so that lazily-added content is present in the DOM, and the
    resulting HTML is parsed for ``<img>`` elements carrying the CSS class
    ``ImageElement-image-SRv``.

    Args:
        url: Address of the page to load.

    Returns:
        A list of the ``src`` attribute values of the matching images, in
        document order. Images without a ``src`` attribute are skipped.

    Raises:
        Whatever Selenium raises when the browser cannot be started or the
        page cannot be loaded; the browser is shut down in either case.
    """
    options = webdriver.ChromeOptions()
    # Only the image URLs are needed, so tell Chrome not to download the
    # image data itself.
    options.add_argument('--blink-settings=imagesEnabled=false')
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2}
    )
    options.add_argument('--headless=new')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # Keep scrolling to the bottom until the document height no longer
        # changes, i.e. no further content is being appended.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # give the page time to append more content
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        html = driver.page_source
    finally:
        # quit() closes every window and ends the driver process, even when
        # loading or scrolling failed part-way through.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    # The tag name is "img" and the CSS class is the filter; the previous
    # call had the two swapped and therefore never matched anything.
    images = soup.find_all("img", class_="ImageElement-image-SRv")
    return [image["src"] for image in images if image.get("src")]
# Alternative target kept for reference (a Behance search URL):
# "https://www.behance.net/search?tracking_source=typeahead_search_direct&search=advertising++poster/post+card"
# Gallery page whose image links are collected and printed below.
ad_url = "https://www.behance.net/gallery/127057601/Blog-illustrations-for-Twingate"
print(all_links_of_images(url=ad_url))