# Import modules we will be using
import configparser
import requests
import mimetypes
import os
import sys
import time
import urllib.request
from urllib.parse import urlparse
from urllib.parse import unquote
from selenium import webdriver
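# Usage: python scrape.py <configFile>
# Starting from comicStartPage, the script follows each page's 'next' button,
# saving the comic image (and, optionally, an HTML page with the author comment)
# into outputPath. All settings come from the config file given as the first argument.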
# Check the status response of the provided link
# Return 'Good' if the status is 200 or 406, return None if given an empty string, otherwise return the error code
def validLinkCheck(link):
    # Reset values to None
    status = None
    requestResult = None
    # If we weren't handed an empty string, check the link out
    if link != "":
        # Try requesting the webpage and see what it returns
        try:
            request = requests.get(link)
            requestResult = request.status_code
        except Exception as errorCode:
            requestResult = errorCode
        # Return 'Good' or an error code based on whether the status code was good or bad
        # Note: in the case of 406 errors, while the page reports itself as 'not acceptable', it is still served and can therefore be scraped (sometimes...? :/)
        if (requestResult == 200) or (requestResult == 406):
            # Nice! The page exists and returned a good code; return 'Good'
            status = "Good"
        else:
            # Trying to access the URL failed for one reason or another; return that reason
            status = str(requestResult)
    return status
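# For example, validLinkCheck("https://example.com/comic/page-1") (a hypothetical URL) yields
# 'Good' for a reachable page, '404' for a missing one, and the exception text if the request itself fails.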
# Replace various characters that would be illegal in a filename (Windows)
def sanitizeString(stringToClean):
    naughtyCharList = ['/','>','<',':','"','|','?','*','\\','#','–','\n','\r']
    replacementChar = "-"
    # Iterate through each naughty character and replace any instances of it in stringToClean
    for elem in naughtyCharList:
        # Check if the character appears in the string
        if elem in stringToClean:
            # Replace every occurrence of the naughty character
            stringToClean = stringToClean.replace(elem, replacementChar)
    return stringToClean
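# For example (a hypothetical title): sanitizeString('Chapter 1: "Start"?') returns 'Chapter 1- -Start--'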
# Check that a config file was provided and exists (if not, exit the script)
if len(sys.argv) < 2 or not os.path.isfile(sys.argv[1]):
    print("Please provide a valid file for the config.\nYou provided: " + str(sys.argv[1:]))
    raise SystemExit(1)
else:
    print("Config file found.")
    configFilePath = sys.argv[1]
# Pull in variables from the config file
config = configparser.RawConfigParser()
config.read(configFilePath)
# General group
outputPath = config.get('General', 'outputPath')
subfolderToggle = config.get('General', 'subfolderToggle')
getComments = config.get('General', 'getComments')
getImage = config.get('General', 'getImage')
imageNameType = config.get('General', 'imageNameType')
runHeadless = config.get('General', 'runHeadless')
buildHTML = config.get('General', 'buildHTML')
# Comic group
comicName = config.get('Comic', 'comicName')
comicStartPage = config.get('Comic', 'comicStartPage')
commentPath = config.get('Comic', 'commentPath')
imageTitlePath = config.get('Comic', 'imageTitlePath')
nextButtonPath = config.get('Comic', 'nextButtonPath')
nextButtonType = config.get('Comic', 'nextButtonType')
imagePath = config.get('Comic', 'imagePath')
initialClick = config.get('Comic', 'initialClick')
print("Config settings imported.")
# If the user requested a subfolder, append it to the current outputPath
if subfolderToggle == "True":
outputPath = os.path.join(outputPath, comicName)
# Check if the desired output directory exists; if not, create it (including any missing parent folders)
if not os.path.isdir(outputPath):
    print("Output directory does not yet exist, creating.")
    os.makedirs(outputPath)
else:
print("Output directory found.")
# Set up the browser we'll be using
driverOptions = webdriver.FirefoxOptions()
driverOptions.set_preference("general.useragent.override", "Not Quite Daily Scraper")
# runHeadless arrives as a string from the config, so compare it against "True"
driverOptions.headless = (runHeadless == "True")
driver = webdriver.Firefox(options=driverOptions)
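# Note: webdriver.Firefox requires Mozilla's geckodriver to be available on the
# system PATH (or otherwise discoverable by Selenium) in order to launch the browser.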
print("Browser created. Beginning to scrape " + comicName)
# Prep the loop variables
comicCommentHTML = ""
endLoop = False
# Attempt to load the comicStartPage page; if it checks out, open it, otherwise skip the loop (ending the script)
pageStatus = validLinkCheck(comicStartPage)
if pageStatus == "Good":
    # Nice! The page exists and returns a good code
    print('\nAccessing start page: ' + comicStartPage)
    # Open it in Gecko to start our parsing
    driver.get(comicStartPage)
else:
    # If we can't reach the start page, let the user know and skip the while loop entirely
    print('\nStart page unavailable: ' + comicStartPage + "\nRequest yielded: " + pageStatus)
    endLoop = True
# If the user requested we perform an initial click (and the start page loaded), do so!
if (initialClick != "") and not endLoop:
    try:
        # First treat the initial-click element as a plain link and navigate to its href
        initialClickContent = driver.find_elements_by_xpath(initialClick)
        initialURL = initialClickContent[0].get_attribute('href')
        print('Clicking initial-click: ' + initialURL)
        driver.get(initialURL)
    except:
        try:
            # If that failed, fall back to physically clicking the element
            initialClickContent[0].click()
            print('Clicking initial-click: ' + driver.current_url)
            # This sleep alleviates a scenario where the click could cycle a page
            # without giving the page time to actually respond,
            # resulting in an infinite cycle of incrementing, blank pages. ().o
            time.sleep(1.5)
        except:
            print(' Initial click failed. Ending.')
            endLoop = True
# Initialize the variables we'll be setting each loop, used to check for repeated errors and to build the 'next page' link
seenURLs = []
mostRecentImageURL = "-"
txtSavePathName = ""
previousTxtSavePath = ""
overwriteCount = 0
consecutiveSkipCount = 0
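# Main scrape loop: each pass saves the current page's content, then tries to
# navigate onward. The loop ends when the same image URL is seen twice in a row,
# no next button is found, the next page fails its link check, navigation loops
# back to an already-seen URL, or 5 consecutive image downloads are skipped.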
while not endLoop:
# Record the URL of the current page, regardless of navigation method
currentPageURL = driver.current_url
# Grab content elements based on the paths provided (skip ones the user has turned off)
if getComments == "True":
comicComment = driver.find_elements_by_xpath(commentPath)
if getImage == "True":
comicImage = driver.find_elements_by_xpath(imagePath)
if imageTitlePath != "":
imageTitle = driver.find_elements_by_xpath(imageTitlePath)
else:
imageTitle = ""
# Extract comment html, the image url, and the next button url out of those elements (skipping ones the user has turned off)
# Also save a copy of what the comment was last time, so we don't write out duplicates
previousCommentHTML = comicCommentHTML
if getComments == "True":
# If there isn't anything in the comment element, just leave it blank
if len(comicComment) < 1:
comicCommentHTML = ""
else:
comicCommentHTML = comicComment[0].get_attribute('innerHTML')
if getImage == "True":
# Try to get the comic image URL
try:
imageLocation = comicImage[0].get_attribute('src')
# This is to address an issue in SmackJeeves comics (they append stuff to the image URL)
imageLocation = imageLocation.replace("/dims/optimize","")
# Get the file extension from the URL
URLpath = urlparse(imageLocation).path
ext = os.path.splitext(URLpath)[1]
# Get the original image name from the URL
urlParts = os.path.splitext(URLpath)[0].split("/")
            originalImageName = urlParts[-1] # (It's the last part when split on '/')
except:
# If we simply can't find the image, set the image-related variables to default values
print(" No image found on this page!")
imageLocation = ""
originalImageName = "EmptyFileName"
ext = ""
    # Attempt to get the title text from the datetime/text/link attributes of the provided element (if unavailable, use originalImageName)
imageTitleText = ""
try:
if (imageTitle[0].get_attribute('datetime') != ""):
imageTitleText = imageTitle[0].get_attribute('datetime')
except:
pass
if (imageTitleText == "") or (imageTitleText is None):
try:
if (imageTitle[0].text != ""):
imageTitleText = imageTitle[0].text
except:
pass
if (imageTitleText == "") or (imageTitleText is None):
try:
if (imageTitle[0].get_attribute('href') != ""):
imageTitleText = imageTitle[0].get_attribute('href')
except:
pass
    # If none of the above worked, fall back to the image filename; tell the user if they requested the title
if ((imageTitleText == "") or (imageTitleText is None)) and (imageNameType == "title"):
print(" Image title not found, substituting original image filename")
imageTitleText = originalImageName
# Name the output image file based on the imageNameType toggle
if imageNameType == "title":
# Set image save name to be the comic title
imgSaveName = imageTitleText
elif imageNameType == "originalFilename":
# Set image save name to original image name (based on URL)
imgSaveName = originalImageName
elif imageNameType == "both":
# Set image save name to both!
imgSaveName = imageTitleText + " - " + originalImageName
    # Let's quickly remove any characters from that title that may cause issues later
imgSaveName = sanitizeString(imgSaveName)
print(" Comic Title: " + imgSaveName)
    # Prior to building the file paths (and resetting them), let's make note of the previous file's name
    # If it exists, make a nice link to it for the HTML page we build later; if not, indicate we are at the current page
if txtSavePathName == "":
nextPageHTML = "CURRENT"
else:
nextPageHTML = "<a href=\"" + txtSavePathName + "\">Next</a>"
# Build the final file path for the image using the output dir, the image name, and the image extension
if getImage == "True":
# Double check that we actually found an image in the page
if imageLocation != "":
imgSavePathFull = os.path.join(outputPath, imgSaveName + ext)
else:
# If we didn't find an image, build an informative save path (to make a blank file)
imgSavePathFull = os.path.join(outputPath, imgSaveName + ".missing")
# Build a similar file path for the text content
txtSavePathFull = os.path.join(outputPath, imgSaveName + ".html")
txtSavePathName = imgSaveName + ".html"
    # Check against the cached imageLocation to see if we successfully found a new image. If not, break
if mostRecentImageURL == imageLocation:
print("\nScript failed to find images twice in a row. We've hit an error!")
break
# If the user requested we get the comic image
if (getImage == "True") and (imageLocation != ""):
print(" Saving: " + imageLocation + "\n To path: " + imgSavePathFull)
        # If the image file we are about to write doesn't already exist...
        if not os.path.isfile(imgSavePathFull):
            # Since we will try to save the image, reset the consecutiveSkipCount to zero
            consecutiveSkipCount = 0
            # Write out the image file with the imgSavePathFull we built and the imageLocation we found
            with open(imgSavePathFull, 'wb') as workingFile:
                response = requests.get(imageLocation, stream=True)
                # If the image response is fine (200), download it
                if response.ok:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        workingFile.write(block)
                    print(" Image saved.")
                # Alternate download method to try if the image response itself also yields a 406 error
                elif response.status_code == 406:
                    r = urllib.request.urlopen(imageLocation)
                    workingFile.write(r.read())
                    print(" Image saved. (despite 406 response)")
                else:
                    print(" Error saving image: " + response.text)
        else:
            print(" Found existing image. Skipped.")
            consecutiveSkipCount += 1
    elif imageLocation == "":
        print(" Placeholder file created in place of missing image!")
        open(imgSavePathFull, 'a').close()
# Clear out javascript warning from comment, if it is present
comicCommentHTML = comicCommentHTML.replace("<noscript>Javascript is required to view this site. Please enable Javascript in your browser and reload this page.</noscript>","")
    # If the user has requested we build a nice HTML page for the output
if (buildHTML == "True"):
        # Build the html string with the image, a link to the page online, the author comment if requested, and a link to the next page (previous page scraped)
        htmlStyle = "<style>body {background-color: #cccccc}</style>"
        htmlNav = "<a href=\"" + currentPageURL + "\">" + imgSaveName + "</a> | " + nextPageHTML
        htmlImg = "<img id=\"comicImage\" src=\"\">"
        htmlTableStart = "<table width=\"70%\" style=\"margin-left:auto;margin-right:auto;\"><tr><td>"
        htmlTableEnd = "</td></tr></table>"
        # Inline javascript that derives the image filename from the page's own filename
        # (the image and its .html page share a name, differing only in extension)
        htmlScript = "\n".join([
            "<script>",
            "nameStr = window.location.href;",
            "nameStr = nameStr.split('/');",
            "nameStr = nameStr.pop();",
            "nameStr = nameStr.replace(\".html\",\"\")",
            "nameStr = decodeURI(nameStr)",
            "nameStr = nameStr + \"" + ext + "\"",
            "document.getElementById(\"comicImage\").src = nameStr;",
            "</script>",
        ])
        # Combine html parts to make the full string
        textStr = "\n".join([
            htmlStyle, "<center>", htmlNav, "<br>", htmlImg, "<br>", htmlNav,
            "<br></center>", htmlTableStart, comicCommentHTML, htmlTableEnd, htmlScript,
        ])
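        # The generated page ends up roughly like this (illustrative; names and extension vary):
        #   <style>body {background-color: #cccccc}</style>
        #   <center>
        #   <a href="https://example.com/comic/page-1">Page Title</a> | <a href="Next Page Title.html">Next</a>
        #   <br>
        #   <img id="comicImage" src="">
        #   ... nav repeated, comment table, then the script that points #comicImage at "Page Title.png" ...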
        # If the file already exists and we haven't overwritten one yet, overwrite it (this makes sure the latest page has a valid "Next" link)
        if os.path.isfile(txtSavePathFull) and (overwriteCount < 1):
            # Write out an html file with the comic title and author comment (and source URL) to the txtSavePathFull we built
            with open(txtSavePathFull, 'w', encoding="utf-8") as workingFile:
                workingFile.write(textStr)
            print(" Found existing page. Overwritten to create link to new pages.")
            overwriteCount += 1
        elif not os.path.isfile(txtSavePathFull):
            # Write out an html file with the comic title and author comment (and source URL) to the txtSavePathFull we built
            with open(txtSavePathFull, 'w', encoding="utf-8") as workingFile:
                workingFile.write(textStr)
            print(" Page saved.")
        else:
            print(" Found existing page. Skipped.")
# Time to try and navigate to the next page!
# Get the element that is the next button
nextButton = driver.find_elements_by_xpath(nextButtonPath)
# If we found no matching (nextbutton) elements, break out of the loop
if len(nextButton) < 1:
print("\nNo next/previous page button found on this page.\nWe've likely hit the end!")
break
# Record the current URL to check against after we attempt to move to the next page (helps detect loops)
    seenURLs.append(driver.current_url)
mostRecentImageURL = imageLocation
# If the nextButtonType is a basic link, parse it out of the element and attempt to navigate there
if nextButtonType == "link":
nextButtonURL = nextButton[0].get_attribute('href')
currentPageURL = nextButtonURL
# First try requesting the webpage and make sure it returns a good status code
# If the attempt itself fails, return that error
pageStatus = validLinkCheck(nextButtonURL)
# If the request was good, let the user know, if it was anything else, pass that error to the user and break
if (pageStatus == "Good"):
# Nice! The page exists and returns a good code
print('\nAccessing page: ' + nextButtonURL)
else:
# If we can't find the current page (or the attempt itself failed), let the user know and break out of the while loop
print('\nNext page unavailable: ' + str(nextButtonURL) + "\n Request yielded: " + pageStatus)
break
# If we didn't break, let's open it in Gecko to start our parsing
driver.get(nextButtonURL)
# If the nextButtonType is javaClick, instead of parsing the next object, just click it
elif nextButtonType == "javaClick":
        nextButton[0].click()
        print('\nAccessing page: ' + driver.current_url)
        # This sleep alleviates a scenario where the javaClick could cycle a page
        # without giving the page time to actually respond,
        # resulting in an infinite cycle of incrementing, blank pages. ().o
        time.sleep(1.5)
# Check against the cached urls to see if we successfully advanced a page. If not, break
if driver.current_url in seenURLs:
print("\nScript not successfully advancing pages (looping).\nWe've likely hit an error or the end/beginning!")
break
# Check how many images we've skipped. If it's over 5, break
if (consecutiveSkipCount >= 5):
print("\nScript has skipped 5 consecutive image downloads.\nWe've likely reached existing archived content!")
break
# Shut down the browser
driver.quit()
print("\nAll available pages scraped. Exiting.")