-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
95 lines (73 loc) · 3.37 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import json
import os.path
import time
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
import chromedriver_autoinstaller
def scrape_colors_and_save_data_as_json() -> int:
    """Scrape color hex codes and names from https://coolors.co/colors.

    Writes four JSON files into the ``data`` directory next to this file
    (created if missing):
      - colors.json / colors.min.json: ``{"HEX": "NAME", ...}`` mapping
      - colors_array.json / colors_array.min.json:
        ``{"colors": [{"hex": ..., "name": ...}, ...]}``

    Returns:
        The number of distinct colors scraped.
    """
    chromedriver_autoinstaller.install()

    chrome_options = Options()
    for option in (
        "--headless",
        "--disable-gpu",
        "--window-size=1920,1200",
        "--ignore-certificate-errors",
        "--disable-extensions",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        chrome_options.add_argument(option)

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get("https://coolors.co/colors")

        # The cookie-consent banner blocks the page; wait for it, then
        # dismiss it and the promo modal.
        banner_button_xpath = '//*[@id="iubenda-cs-banner"]/div/div/div/div[3]/div[2]/button'
        delay = 2.5
        try:
            WebDriverWait(driver, delay).until(
                EC.presence_of_element_located((By.XPATH, banner_button_xpath)))
            print("The page is ready!")
            # BUG fix: these clicks originally ran unconditionally even after
            # a timeout, raising NoSuchElementException. They now run only
            # when the banner actually appeared.
            driver.find_element(By.XPATH, banner_button_xpath).click()
            driver.find_element(By.XPATH, '//*[@id="modal-fabrizio"]/div/div[2]/div/a').click()
        except TimeoutException:
            print("The page did not load on time.")

        # The color grid is lazily loaded: keep pressing END until the
        # document height stops growing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            ActionChains(driver).send_keys(Keys.END).perform()
            time.sleep(0.25)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        colors = driver.find_elements(By.XPATH, '//*[@id="colors_results"]/div')
        scraped_colors_array = {"colors": []}
        scraped_colors = {}
        for color in colors:
            hex_code_element = color.find_element(By.XPATH, ".//*//span")
            hex_code = hex_code_element.get_attribute('innerHTML')
            # Skip cards whose span holds something other than a bare
            # 6-character hex code (e.g. markup or placeholder text).
            if len(hex_code) > 6:
                continue
            color_name = color.text.upper()
            print(f"hex: {hex_code}, name: {color_name}")
            scraped_colors_array["colors"].append({"hex": hex_code, "name": color_name})
            scraped_colors[hex_code] = color_name
    finally:
        # BUG fix: the original never quit the driver, leaking a headless
        # Chrome process on every run and whenever any step above raised.
        driver.quit()

    # BUG fix: create the output directory if absent (the original crashed
    # with FileNotFoundError on a fresh checkout). Also fixes the swapped
    # local names: the dict dump now clearly goes to colors*.json and the
    # array dump to colors_array*.json.
    path = Path(__file__).parent / "data"
    path.mkdir(exist_ok=True)
    (path / "colors.json").write_text(
        json.dumps(scraped_colors, indent=4, ensure_ascii=False), encoding="utf-8")
    (path / "colors.min.json").write_text(
        json.dumps(scraped_colors, ensure_ascii=False), encoding="utf-8")
    (path / "colors_array.json").write_text(
        json.dumps(scraped_colors_array, indent=4, ensure_ascii=False), encoding="utf-8")
    (path / "colors_array.min.json").write_text(
        json.dumps(scraped_colors_array, ensure_ascii=False), encoding="utf-8")

    return len(scraped_colors)
# 540 colors