forked from chuanenlin/shutterscrape
-
Notifications
You must be signed in to change notification settings - Fork 0
/
shutterscrape.py
124 lines (119 loc) · 5.17 KB
/
shutterscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from urllib import urlretrieve
import os
import Tkinter, Tkconstants, tkFileDialog
import time
def videoscrape():
    """Scrape Shutterstock video previews for the current search.

    Reads module-level globals: searchTerm (query string), searchPage
    (number of result pages to visit) and scrape_directory (download
    destination). Downloads each clip's preview <source> URL via
    urlretrieve. Returns None; progress and errors are printed.
    """
    try:
        driver = webdriver.Chrome()
    except Exception as e:
        # Could not start the browser at all - nothing to clean up.
        print(e)
        return
    try:
        driver.maximize_window()
        for page in range(1, searchPage + 1):
            url = "https://www.shutterstock.com/video/search/" + searchTerm + "?page=" + str(page)
            driver.get(url)
            print("Page " + str(page))
            for slot in range(0, 50):
                # Poll until the slot-th preview tile renders; stop entirely
                # when both this slot and the next are absent on the last page
                # (i.e. we have run past the final result).
                while True:
                    container = driver.find_elements_by_xpath(
                        "//div[@data-automation='VideoGrid_video_videoClipPreview_" + str(slot) + "']")
                    if container:
                        break
                    if not driver.find_elements_by_xpath(
                            "//div[@data-automation='VideoGrid_video_videoClipPreview_" + str(slot + 1) + "']") \
                            and page == searchPage:
                        return
                    time.sleep(10)
                    driver.get(url)
                container[0].click()
                # Poll the clip page until the <video> element is present in
                # the rendered DOM (the explicit wait alone is not always
                # enough for the outerHTML snapshot to contain it).
                while True:
                    WebDriverWait(driver, 60).until(ec.visibility_of_element_located(
                        (By.XPATH, "//video[@data-automation='VideoPlayer_video_video']")))
                    video_url = driver.current_url
                    data = driver.execute_script("return document.documentElement.outerHTML")
                    scraper = BeautifulSoup(data, "lxml")
                    video_container = scraper.find_all(
                        "video", {"data-automation": "VideoPlayer_video_video"})
                    if video_container:
                        break
                    time.sleep(10)
                    driver.get(video_url)
                sources = video_container[0].find_all("source")
                # Prefer the second <source> as before, but fall back to the
                # first when a clip exposes only one (the original indexed
                # sources[1] unconditionally and crashed in that case).
                video_src = sources[1].get("src") if len(sources) > 1 else sources[0].get("src")
                name = video_src.rsplit("/", 1)[-1]
                try:
                    urlretrieve(video_src, os.path.join(scrape_directory, os.path.basename(video_src)))
                    print("Scraped " + name)
                except Exception as e:
                    # Best-effort download: report and move on to the next clip.
                    print(e)
                driver.get(url)
    except Exception as e:
        print(e)
    finally:
        # Always release the browser, even on error (the original leaked it).
        driver.quit()
def imagescrape():
    """Scrape Shutterstock image thumbnails for the current search.

    Reads module-level globals: searchTerm (query string), searchPage
    (number of result pages to visit) and scrape_directory (download
    destination). Downloads each thumbnail's src URL via urlretrieve.
    Returns None; progress and errors are printed.
    """
    try:
        driver = webdriver.Chrome()
    except Exception as e:
        # Could not start the browser at all - nothing to clean up.
        print(e)
        return
    try:
        driver.maximize_window()
        for page in range(1, searchPage + 1):
            url = ("https://www.shutterstock.com/search?searchterm=" + searchTerm +
                   "&sort=popular&image_type=all&search_source=base_landing_page&language=en&page=" + str(page))
            driver.get(url)
            data = driver.execute_script("return document.documentElement.outerHTML")
            print("Page " + str(page))
            scraper = BeautifulSoup(data, "lxml")
            img_container = scraper.find_all("div", {"class": "img-wrap"})
            # Iterate over EVERY thumbnail: the original used
            # range(0, len(img_container) - 1) and silently skipped the
            # last image on each page.
            for wrap in img_container:
                img_array = wrap.find_all("img")
                if not img_array:
                    # Defensive: a wrapper div with no <img> inside.
                    continue
                img_src = img_array[0].get("src")
                name = img_src.rsplit("/", 1)[-1]
                try:
                    urlretrieve(img_src, os.path.join(scrape_directory, os.path.basename(img_src)))
                    print("Scraped " + name)
                except Exception as e:
                    # Best-effort download: report and move on.
                    print(e)
    except Exception as e:
        print(e)
    finally:
        # Always release the browser, even on error (the original leaked it
        # whenever an exception fired before driver.close()).
        driver.quit()
print("ShutterScrape v1.1")

def _prompt_positive_int(prompt, error_message):
    """Keep asking until the user enters an integer >= 1.

    Uses raw_input + int() instead of Python 2's input(), which eval()s
    whatever the user types (crashes on non-numeric text and executes
    arbitrary expressions).
    """
    while True:
        try:
            value = int(raw_input(prompt))
        except ValueError:
            print(error_message)
            continue
        if value < 1:
            print(error_message)
            continue
        return value

# Interactive session loop: each pass collects settings, runs one scrape,
# then offers to go again.
while True:
    # 1. Destination directory (re-prompt until one is actually chosen).
    while True:
        print("Please select a directory to save your scraped files.")
        scrape_directory = tkFileDialog.askdirectory()
        if not scrape_directory:
            print("You must select a directory to save your scraped files.")
            continue
        break
    # 2. Scrape mode.
    while True:
        searchMode = raw_input("Search mode ('v' for video or 'i' for image): ")
        if searchMode not in ("v", "i"):
            print("You must select 'v' for video or 'i' for image.")
            continue
        break
    # 3. Search terms, joined with the separator each endpoint expects
    #    ('-' in the video URL path, '+' in the image query string).
    searchCount = _prompt_positive_int("Number of search terms: ",
                                       "You must have at least one search term.")
    if searchCount == 1:
        searchTerm = raw_input("Search term: ")
    else:
        searchTerm = raw_input("Search term 1: ")
        separator = "-" if searchMode == "v" else "+"
        for i in range(1, searchCount):
            searchTerm += separator + raw_input("Search term " + str(i + 1) + ": ")
    # 4. Page count.
    searchPage = _prompt_positive_int("Number of pages to scrape: ",
                                      "You must scrape at least one page.")
    # 5. Run the chosen scraper (globals above are read by both).
    if searchMode == "v":
        videoscrape()
    if searchMode == "i":
        imagescrape()
    print("Scraping complete.")
    restartScrape = raw_input("Keep scraping? ('y' for yes or 'n' for no) ")
    if restartScrape == "n":
        print("Scraping ended.")
        break