hathitrustPDF.py
#!/usr/bin/env python3
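"""Download a HathiTrust book page by page and merge the pages into a single PDF."""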
import argparse
import os
import re
import threading
import time
from pathlib import Path
import progressbar
import requests
from PyPDF2 import PdfMerger
from bs4 import BeautifulSoup
class Downloader:
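    """Coordinates a pool of DownloadThread workers that fetch page PDFs into `path`."""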
def __init__(self, max_threads, to_download, bar_, path, args):
self.max_threads = max_threads
self.download_threads = []
self.finished_threads = []
self.to_download = to_download
self.bar = bar_
self.lock = threading.Lock()
self.path = path
self.retries = args.retries
self.verbose = args.verbose
def download(self):
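        """Run downloads with at most `max_threads` concurrent threads.

        Each finishing thread fires a callback that starts a thread for the next
        queued link, so the pool size stays constant until the queue drains.
        """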
        def download_finished(thread_: DownloadThread):  # callback function
            with self.lock:
                self.finished_threads.append(thread_)
                # progressbar's update() takes an absolute value, not an increment
                self.bar.update(len(self.finished_threads))
if len(self.to_download) > 0:
# create new thread with new ID, link, and callback
new_thread = DownloadThread(link_=self.to_download.pop(0), downloader_=self,
callback=download_finished)
self.download_threads.append(new_thread)
new_thread.start()
else:
if len(self.download_threads) == len(self.finished_threads):
if self.verbose:
print("All downloads finished\n")
# Start initial set of threads
for _ in range(min(self.max_threads, len(self.to_download))):
thread = DownloadThread(link_=self.to_download.pop(0), downloader_=self,
callback=download_finished)
self.download_threads.append(thread)
thread.start()
# Wait for all download threads to finish
for thread in self.download_threads:
thread.join()
def download_file(self, link_):
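        """Download a single page PDF, retrying failures and backing off on HTTP 429."""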
retry_count = 0
        match = re.search(r"seq=(\d+)", link_)
        page_number = match.group(1) if match else ""
if self.verbose:
print(f"Downloading page {page_number}")
        path = os.path.join(self.path, f'page{page_number}.pdf')
        # skip complete files only; zero-byte files are failed downloads and must be retried
        if os.path.exists(path) and os.path.getsize(path) > 0:
            if self.verbose:
                print(f"File page{page_number}.pdf already exists. Skipping...")
            return
        try:
            pdf_download = requests.get(link_, stream=True, timeout=300)
            while pdf_download.status_code != 200 and retry_count < self.retries:
                retry_count += 1
                if self.verbose:
                    print(f"Error downloading {link_}: Status code {pdf_download.status_code}. "
                          f"Retrying...{retry_count}/{self.retries}")
                if pdf_download.status_code == 429:
                    # back off when the server rate-limits us
                    if self.verbose:
                        print("Too many requests. Waiting 5 seconds...")
                    time.sleep(5)
                pdf_download = requests.get(link_, stream=True, timeout=300)
        except requests.RequestException as e_:
            if self.verbose:
                print(f"Error downloading {link_}: {e_}. Skipping...")
            return
        if pdf_download.status_code != 200:
            if self.verbose:
                print(f"Error downloading {link_} with status code {pdf_download.status_code}. Skipping...")
            return
try:
with open(path, 'wb') as f:
f.write(pdf_download.content)
except Exception as e_:
if self.verbose:
print("Error writing file " + path)
print(e_)
return
if self.verbose:
print(f"Finished downloading page {page_number}")
class DownloadThread(threading.Thread):
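    """Worker thread that downloads one link, then reports back via `callback`."""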
def __init__(self, link_, downloader_, callback):
threading.Thread.__init__(self)
self.link = link_ # link to download
self.downloader = downloader_ # Downloader object
self.callback = callback # callback function to call when download is finished
def run(self):
self.downloader.download_file(self.link)
self.callback(self)
def check_files_missing(begin, end, path_folder, pdf_list):
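    """Return the page numbers in [begin, end) that are absent or zero bytes on disk."""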
missing_pages = []
for page in range(begin, end):
if os.path.join(path_folder, f"page{page}.pdf") not in pdf_list:
missing_pages.append(page)
    # zero-byte files are failed downloads; count them as missing too
    for file in os.listdir(path_folder):
        if file.endswith(".pdf") and os.path.getsize(os.path.join(path_folder, file)) == 0:
            missing_pages.append(int(file[4:-4]))
return missing_pages
def main():
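    """Parse command-line arguments and run one download per requested link."""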
parser = argparse.ArgumentParser(description='PDF Downloader and Merger')
parser.add_argument('-l', '--link', help='HathiTrust book link')
parser.add_argument('-i', '--input-file', default="", help='File with list of links formatted as link,output_path')
parser.add_argument('-t', '--thread-count', type=int, default=5, help='Number of download threads')
parser.add_argument('-r', '--retries', type=int, default=3, help='Number of retries for failed downloads')
parser.add_argument('-b', '--begin', type=int, default=1, help='First page to download')
parser.add_argument('-e', '--end', type=int, default=0, help='Last page to download')
parser.add_argument('-k', '--keep', action='store_true', help='Keep downloaded pages')
parser.add_argument('-o', '--output-path', default=None, help='Output file path')
parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose mode')
parser.add_argument('-V', '--version', action='version', version='%(prog)s 1.0')
args = parser.parse_args()
if not (args.link or args.input_file):
parser.print_help()
exit(1)
    if args.input_file != "":
        with open(args.input_file, 'r') as f:
            # each non-empty line: link,output_path
            links = dict(line.strip().split(',', 1) for line in f if line.strip())
    else:
        # an empty output defers naming to the book title resolved in download_link()
        links = {args.link: args.output_path or ""}
for link, output in links.items():
download_link(args, link, output.rstrip('\n'))
def download_link(args, link, output):
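    """Resolve a HathiTrust link to a book ID, download its pages, and merge them into `output`."""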
start_time = time.time()
if "babel.hathitrust.org" in link:
id_book = re.findall(r'id=(\w*\.\w*)|$', link)[0]
elif "hdl.handle.net" in link:
        link = link.rstrip('/')
id_book = re.findall(r'.+/(.+)', link)[0]
else:
print(f"{link}: Unknown link format. Please use a link from babel.hathitrust.org or hdl.handle.net")
return
    r = requests.get(link, timeout=300)
soup = BeautifulSoup(r.text, "html.parser")
# Number of the book pages and name
regex = r'HT.params.totalSeq = (\d+)' # https://stackoverflow.com/a/1732454 :)
with open("out.html","w") as f:
f.write(r.text)
match: re.Match = re.search(regex, r.text)
if match is None:
print("Failed to find page count.")
return
pages_book = int(match.group(1))
name_book = soup.find('meta', {'property': 'og:title'})['content']
    # Truncate overly long titles for use as a filename
    if len(name_book) > 55:
        name_book = name_book[:40]
# Remove invalid characters
remove_character = "[],/\\:.;\"'?!*"
name_book = name_book.translate(str.maketrans(remove_character, len(remove_character)*" ")).strip()
    if not output:
        output = name_book + ".pdf"
# Create a new folder
local = os.getcwd()
path_folder = os.path.join(local, "tmp")
Path(path_folder).mkdir(parents=True, exist_ok=True)
# Download pdf file
begin_page = args.begin
    last_page = pages_book + 1 if (args.end == 0 or args.end > pages_book) else args.end + 1
# ProgressBar
    bar = progressbar.ProgressBar(maxval=last_page - begin_page,  # one tick per page to download
                                  widgets=[progressbar.Bar('=', '[', ']'), ' ',
                                           progressbar.Percentage()])
bar.start()
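    # HathiTrust's imgsrv endpoint returns one page per request, addressed by its seq number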
base_link = 'https://babel.hathitrust.org/cgi/imgsrv/download/pdf?id={};orient=0;size=100;seq={};attachment=0'
links = [base_link.format(id_book, actual_page) for actual_page in range(begin_page, last_page)]
downloader = Downloader(max_threads=args.thread_count, to_download=links, bar_=bar,
path=path_folder, args=args)
downloader.download()
    # Sort page files by page number (trims "page" and ".pdf")
    ordered_files = sorted((a for a in os.listdir(path_folder) if a.endswith(".pdf")),
                           key=lambda x: int(x[4:-4]))
# Merge PDF files
pdf_list = [os.path.join(path_folder, a) for a in ordered_files if a.endswith(".pdf")]
missing_pages = check_files_missing(begin_page, last_page, path_folder, pdf_list)
# Check if there are missing pages
while len(missing_pages) > 0:
for page in missing_pages:
print(f"Missing/corrupted page {page}. Download it manually at {base_link.format(id_book, page)}, and "
f"save it to {os.path.join(path_folder, f'page{page}.pdf')}.\n")
print("You have missing pages. Press enter to continue, R to recheck, D to redownload, or CTRL+C to exit.")
try:
# Wait for user input
while True:
key = input()
if key.lower() == "r":
# Recheck missing pages
                    ordered_files = sorted((a for a in os.listdir(path_folder) if a.endswith(".pdf")),
                                           key=lambda x: int(x[4:-4]))
pdf_list = [os.path.join(path_folder, a) for a in ordered_files if a.endswith(".pdf")]
missing_pages = check_files_missing(begin_page, last_page, path_folder, pdf_list)
break
elif key.lower() == "d":
# Try to download missing pages
for i in missing_pages:
downloader.download_file(base_link.format(id_book, i))
                    ordered_files = sorted((a for a in os.listdir(path_folder) if a.endswith(".pdf")),
                                           key=lambda x: int(x[4:-4]))
pdf_list = [os.path.join(path_folder, a) for a in ordered_files if a.endswith(".pdf")]
missing_pages = check_files_missing(begin_page, last_page, path_folder, pdf_list)
break
elif key == "":
# force continue, even with missing pages
missing_pages = []
break
except KeyboardInterrupt:
return
merger = PdfMerger()
    for pdf in pdf_list:
        # pass the path so PdfMerger manages the file handle; streams must stay open until write()
        merger.append(pdf)
try:
with open(output, "wb") as fout:
merger.write(fout)
except Exception as e:
print("Error writing merged file " + args.output_path)
print(e)
# Cleanup
if not args.keep:
        for i in pdf_list:
            try:
                os.remove(i)  # pdf_list entries are already full paths
except Exception as e:
print("Error removing file " + i)
print(e)
try:
os.rmdir(path_folder)
except Exception as e:
print("Error removing folder " + path_folder)
print(e)
bar.finish()
merger.close()
print(f"Finished downloading {output}. Took {time.time() - start_time} seconds.")
if __name__ == '__main__':
main()