-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathkindle_scraper.py
105 lines (82 loc) · 3.42 KB
/
kindle_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
import time
import requests
import os
def get_highlights(email, password, *, timeout=45):
    """Scrape Kindle highlights and notes from the Amazon notebook page.

    Starts a headless Chrome session, signs in at
    ``https://read.amazon.com/kp/notebook`` and clicks through every entry
    of the library list, collecting the non-empty highlight and note texts
    found on the page after each click.

    The Chrome binary and chromedriver locations are read from the
    ``GOOGLE_CHROME_BIN`` and ``CHROMEDRIVER_PATH`` environment variables.
    When a variable is unset, the corresponding setting is not passed on.

    Args:
        email: Text typed into the sign-in e-mail field.
        password: Text typed into the sign-in password field.
        timeout: Seconds to wait for the library section after signing in,
            and for a highlight element after opening each book.

    Returns:
        A dict keyed by book title. Each value is a dict with the keys
        ``"highlights"`` (a list of ``(text, kind)`` tuples, where ``kind``
        is ``"highlight"`` or ``"note"``) and ``"author"``. Entries with the
        same title overwrite one another.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the sign-in
            fields are not present on the loaded page.
        selenium.common.exceptions.TimeoutException: If an awaited element
            does not appear within ``timeout`` seconds.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
    chrome_options.add_argument("--no-sandbox")

    # Only override the browser binary when one is configured; assigning
    # None is rejected by some Selenium releases.
    chrome_bin = os.environ.get("GOOGLE_CHROME_BIN")
    if chrome_bin:
        chrome_options.binary_location = chrome_bin

    # NOTE(review): `executable_path` is kept for compatibility with the
    # Selenium version this file was written against; newer releases expect
    # a `Service` object instead - confirm the pinned version before changing.
    driver_kwargs = {"options": chrome_options}
    driver_path = os.environ.get("CHROMEDRIVER_PATH")
    if driver_path:
        driver_kwargs["executable_path"] = driver_path
    driver = webdriver.Chrome(**driver_kwargs)

    book_xpath = "//div[contains(@class, 'kp-notebook-library-each-book')]"
    annotation_xpath = "//span[@id='highlight' or @id='note']"

    try:
        print("getting driver---------------")
        driver.get('https://read.amazon.com/kp/notebook')
        print(driver.title)

        email_input = driver.find_element(By.XPATH, '//*[@id="ap_email"]')
        pass_input = driver.find_element(By.XPATH, '//*[@id="ap_password"]')
        email_input.send_keys(email)
        pass_input.send_keys(password)
        pass_input.send_keys(Keys.ENTER)
        print("logging in...")

        # The library section appearing is used as the "signed in" signal.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, "library-section"))
        )
        print("logged in")

        print("retrieving books...")
        books = driver.find_elements(By.XPATH, book_xpath)
        book_highlights = {}

        # Iterate by position: the book list is looked up again after every
        # click (presumably because earlier element handles can go stale
        # once the page updates), so a plain `for` over one list won't do.
        index = 0
        while index < len(books):
            book = books[index]
            index += 1

            # Read the element text once; every `.text` access is a
            # round trip to the browser.
            lines = book.text.splitlines()
            if not lines:
                # Nothing to use as a title; skip rather than fail.
                continue
            title = lines[0]
            # The first four characters of the second line are dropped
            # (presumably a "By: " prefix - confirm against the page).
            author = lines[1][4:] if len(lines) > 1 else ""
            print(title)
            print(author)

            book.click()
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[contains(@id, 'highlight')]")
                )
            )

            annotations = []
            for span in driver.find_elements(By.XPATH, annotation_xpath):
                text = span.text
                if text == "":
                    continue
                if span.get_attribute("id") == "highlight":
                    kind = "highlight"
                else:
                    kind = "note"
                annotations.append((text, kind))

            book_highlights[title] = {
                "highlights": annotations,
                "author": author,
            }
            books = driver.find_elements(By.XPATH, book_xpath)

        return book_highlights
    finally:
        # Always shut the browser down, including on errors, so repeated
        # calls do not leave Chrome/chromedriver processes behind.
        driver.quit()