From 32bbe893e62a9ba7053297eb496d262e14fd4b89 Mon Sep 17 00:00:00 2001 From: Lorenzo Di Fuccia Date: Fri, 22 Nov 2019 00:04:35 +0100 Subject: [PATCH] Working on #150 Fixes #109 Closing #152 Closing #153 Closing #154 Working on #150 Thanks to: @elrob, @McPatate and @cgimenes --- safaribooks.py | 164 ++++++++++++++++++++++++++----------------------- 1 file changed, 86 insertions(+), 78 deletions(-) diff --git a/safaribooks.py b/safaribooks.py index 38873a0..8647477 100644 --- a/safaribooks.py +++ b/safaribooks.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # coding: utf-8 +import re import os import sys import json @@ -9,11 +10,11 @@ import argparse import requests import traceback -from lxml import html, etree from html import escape from random import random +from lxml import html, etree from multiprocessing import Process, Queue, Value -from urllib.parse import urljoin, urlsplit, urlparse +from urllib.parse import urljoin, urlparse, parse_qs, quote_plus PATH = os.path.dirname(os.path.realpath(__file__)) @@ -27,6 +28,7 @@ ORLY_BASE_URL = "https://www." + ORLY_BASE_HOST SAFARI_BASE_URL = "https://" + SAFARI_BASE_HOST API_ORIGIN_URL = "https://" + API_ORIGIN_HOST +PROFILE_URL = SAFARI_BASE_URL + "/profile/" class Display: @@ -75,7 +77,11 @@ def unregister(self): sys.excepthook = sys.__excepthook__ def log(self, message): - self.logger.info(str(message)) # TODO: "utf-8", "replace" + try: + self.logger.info(str(message, "utf-8", "replace")) + + except (UnicodeDecodeError, Exception): + self.logger.info(message) def out(self, put): pattern = "\r{!s}\r{!s}\n" @@ -195,7 +201,7 @@ def api_error(response): else: os.remove(COOKIES_FILE) - message += "Out-of-Session%s.\n" % (" (%s)" % response["detail"]) if "detail" in response else "" +\ + message += "Out-of-Session%s.\n" % (" (%s)" % response["detail"]) if "detail" in response else "" + \ Display.SH_YELLOW + "[+]" + Display.SH_DEFAULT + \ " Use the `--cred` or `--login` options in order to perform the auth login to Safari." 
@@ -216,20 +222,6 @@ class SafariBooks: API_TEMPLATE = SAFARI_BASE_URL + "/api/v1/book/{0}/" - HEADERS = { - "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", - "accept-encoding": "gzip, deflate", - "accept-language": "it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7", - "cache-control": "no-cache", - "cookie": "", - "pragma": "no-cache", - "origin": SAFARI_BASE_URL, - "referer": LOGIN_ENTRY_URL, - "upgrade-insecure-requests": "1", - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/60.0.3112.113 Safari/537.36" - } - BASE_01_HTML = "\n" \ "\n" \ "\n" \ "\n"\ + " xmlns:opf=\"http://www.idpf.org/2007/opf\">\n" \ "{1}\n" \ "{2}\n" \ "{3}\n" \ @@ -299,12 +291,26 @@ class SafariBooks: "{4}\n" \ "" + HEADERS = { + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", + "accept-encoding": "gzip, deflate", + "origin": SAFARI_BASE_URL, + "referer": LOGIN_ENTRY_URL, + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/60.0.3112.113 Safari/537.36" + } + + COOKIE_FLOAT_MAX_AGE_PATTERN = re.compile(r'(max-age=\d*\.\d*)', re.IGNORECASE) + def __init__(self, args): self.args = args self.display = Display("info_%s.log" % escape(args.bookid)) self.display.intro() - self.cookies = {} + self.session = requests.Session() + self.session.headers.update(self.HEADERS) + self.jwt = {} if not args.cred: @@ -312,13 +318,15 @@ def __init__(self, args): self.display.exit("Login: unable to find cookies file.\n" " Please use the `--cred` or `--login` options to perform the login.") - self.cookies = json.load(open(COOKIES_FILE)) + self.session.cookies.update(json.load(open(COOKIES_FILE))) else: self.display.info("Logging into Safari Books Online...", state=True) self.do_login(*args.cred) if not args.no_cookies: - json.dump(self.cookies, open(COOKIES_FILE, "w")) + 
json.dump(self.session.cookies.get_dict(), open(COOKIES_FILE, 'w')) + + self.check_login() self.book_id = args.bookid self.api_url = self.API_TEMPLATE.format(self.book_id) @@ -386,7 +394,7 @@ def __init__(self, args): self.create_epub() if not args.no_cookies: - json.dump(self.cookies, open(COOKIES_FILE, "w")) + json.dump(self.session.cookies.get_dict(), open(COOKIES_FILE, "w")) self.display.done(os.path.join(self.BOOK_PATH, self.book_id + ".epub")) self.display.unregister() @@ -394,37 +402,24 @@ def __init__(self, args): if not self.display.in_error and not args.log: os.remove(self.display.log_file) - def return_cookies(self): - return " ".join(["{0}={1};".format(k, v) for k, v in self.cookies.items()]) - - def return_headers(self, url): - if ORLY_BASE_HOST in urlsplit(url).netloc: - self.HEADERS["cookie"] = self.return_cookies() - - else: - self.HEADERS["cookie"] = "" - - return self.HEADERS - - def update_cookies(self, jar): - for cookie in jar: - if cookie.name != 'sessionid': # TODO - self.cookies.update({ - cookie.name: cookie.value - }) + def handle_cookie_update(self, set_cookie_headers): + for morsel in set_cookie_headers: + # Handle Float 'max-age' Cookie + if self.COOKIE_FLOAT_MAX_AGE_PATTERN.search(morsel): + cookie_key, cookie_value = morsel.split(";")[0].split("=") + self.session.cookies.set(cookie_key, cookie_value) - def requests_provider( - self, url, post=False, data=None, perfom_redirect=True, update_cookies=True, update_referer=True, **kwargs - ): + def requests_provider(self, url, is_post=False, data=None, perform_redirect=True, **kwargs): try: - response = getattr(requests, "post" if post else "get")( + response = getattr(self.session, "post" if is_post else "get")( url, - headers=self.return_headers(url), data=data, allow_redirects=False, **kwargs ) + self.handle_cookie_update(response.raw.headers.getlist("Set-Cookie")) + self.display.last_request = ( url, data, kwargs, response.status_code, "\n".join( ["\t{}: {}".format(*h) for h in 
response.headers.items()] @@ -435,16 +430,8 @@ def requests_provider( self.display.error(str(request_exception)) return 0 - if update_cookies: - self.update_cookies(response.cookies) - - if update_referer: - # TODO Update Referer HTTP Header - # TODO How about Origin? - self.HEADERS["referer"] = response.request.url - - if response.is_redirect and perfom_redirect: - return self.requests_provider(response.next.url, post, None, perfom_redirect, update_cookies, update_referer) + if response.is_redirect and perform_redirect: + return self.requests_provider(response.next.url, is_post, None, perform_redirect) # TODO How about **kwargs? return response @@ -468,19 +455,24 @@ def do_login(self, email, password): if response == 0: self.display.exit("Login: unable to reach Safari Books Online. Try again...") - redirect_uri = response.request.path_url[response.request.path_url.index("redirect_uri"):] # TODO try...catch - redirect_uri = redirect_uri[:redirect_uri.index("&")] - redirect_uri = "https://api.oreilly.com%2Fapi%2Fv1%2Fauth%2Fopenid%2Fauthorize%3F" + redirect_uri + next_parameter = None + try: + next_parameter = parse_qs(urlparse(response.request.url).query)["next"][0] + + except (AttributeError, ValueError, IndexError): + self.display.exit("Login: unable to complete login on Safari Books Online. 
Try again...") + + redirect_uri = API_ORIGIN_URL + quote_plus(next_parameter) response = self.requests_provider( self.LOGIN_URL, - post=True, + is_post=True, json={ "email": email, "password": password, "redirect_uri": redirect_uri }, - perfom_redirect=False + perform_redirect=False ) if response == 0: @@ -492,11 +484,14 @@ def do_login(self, email, password): errors_message = error_page.xpath("//ul[@class='errorlist']//li/text()") recaptcha = error_page.xpath("//div[@class='g-recaptcha']") messages = ([" `%s`" % error for error in errors_message - if "password" in error or "email" in error] if len(errors_message) else []) +\ - ([" `ReCaptcha required (wait or do logout from the website).`"] if len(recaptcha) else[]) - self.display.exit("Login: unable to perform auth login to Safari Books Online.\n" + - self.display.SH_YELLOW + "[*]" + self.display.SH_DEFAULT + " Details:\n" - "%s" % "\n".join(messages if len(messages) else [" Unexpected error!"])) + if "password" in error or "email" in error] if len(errors_message) else []) + \ + ([" `ReCaptcha required (wait or do logout from the website).`"] if len( + recaptcha) else []) + self.display.exit( + "Login: unable to perform auth login to Safari Books Online.\n" + self.display.SH_YELLOW + + "[*]" + self.display.SH_DEFAULT + " Details:\n" + "%s" % "\n".join( + messages if len(messages) else [" Unexpected error!"]) + ) except (html.etree.ParseError, html.etree.ParserError) as parsing_error: self.display.error(parsing_error) self.display.exit( @@ -509,6 +504,17 @@ def do_login(self, email, password): if response == 0: self.display.exit("Login: unable to reach Safari Books Online. Try again...") + def check_login(self): + response = self.requests_provider(PROFILE_URL, perform_redirect=False) + + if response == 0: + self.display.exit("Login: unable to reach Safari Books Online. 
Try again...") + + if response.status_code != 200: + self.display.exit("Authentication issue: unable to access profile page.") + + self.display.info("Successfully authenticated.", state=True) + def get_book_info(self): response = self.requests_provider(self.api_url) if response == 0: @@ -548,7 +554,7 @@ def get_book_chapters(self, page=1): return result + (self.get_book_chapters(page + 1) if response["next"] else []) def get_default_cover(self): - response = self.requests_provider(self.book_info["cover"], update_cookies=False, stream=True) + response = self.requests_provider(self.book_info["cover"], stream=True) if response == 0: self.display.error("Error trying to retrieve the cover: %s" % self.book_info["cover"]) return False @@ -765,7 +771,7 @@ def create_dirs(self): def save_page_html(self, contents): self.filename = self.filename.replace(".html", ".xhtml") - open(os.path.join(self.BOOK_PATH, "OEBPS", self.filename), "wb")\ + open(os.path.join(self.BOOK_PATH, "OEBPS", self.filename), "wb") \ .write(self.BASE_HTML.format(contents[0], contents[1]).encode("utf-8", 'xmlcharrefreplace')) self.display.log("Created: %s" % self.filename) @@ -815,7 +821,7 @@ def _thread_download_css(self, url): self.display.css_ad_info.value = 1 else: - response = self.requests_provider(url, update_cookies=False) + response = self.requests_provider(url) if response == 0: self.display.error("Error trying to retrieve this CSS: %s\n From: %s" % (css_file, url)) @@ -838,9 +844,7 @@ def _thread_download_images(self, url): self.display.images_ad_info.value = 1 else: - response = self.requests_provider(urljoin(SAFARI_BASE_URL, url), - update_cookies=False, - stream=True) + response = self.requests_provider(urljoin(SAFARI_BASE_URL, url), stream=True) if response == 0: self.display.error("Error trying to retrieve this image: %s\n From: %s" % (image_name, url)) @@ -854,7 +858,7 @@ def _thread_download_images(self, url): def _start_multiprocessing(self, operation, full_queue): if len(full_queue) > 
5: for i in range(0, len(full_queue), 5): - self._start_multiprocessing(operation, full_queue[i:i+5]) + self._start_multiprocessing(operation, full_queue[i:i + 5]) else: process_queue = [Process(target=operation, args=(arg,)) for arg in full_queue] @@ -879,7 +883,8 @@ def collect_images(self): if self.display.book_ad_info == 2: self.display.info("Some of the book contents were already downloaded.\n" " If you want to be sure that all the images will be downloaded,\n" - " please delete the output direcotry '" + self.BOOK_PATH + "' and restart the program.") + " please delete the output direcotry '" + self.BOOK_PATH + + "' and restart the program.") self.display.state_status.value = -1 @@ -1056,21 +1061,23 @@ def create_epub(self): args_parsed = arguments.parse_args() if args_parsed.cred or args_parsed.login: - email = "" + user_email = "" pre_cred = "" if args_parsed.cred: pre_cred = args_parsed.cred else: - email = input("Email: ") + user_email = input("Email: ") passwd = getpass.getpass("Password: ") - pre_cred = email + ":" + passwd + pre_cred = user_email + ":" + passwd parsed_cred = SafariBooks.parse_cred(pre_cred) if not parsed_cred: - arguments.error("invalid credential: %s" % (args_parsed.cred if args_parsed.cred else (email + ":*******"))) + arguments.error("invalid credential: %s" % ( + args_parsed.cred if args_parsed.cred else (user_email + ":*******") + )) args_parsed.cred = parsed_cred @@ -1079,4 +1086,5 @@ def create_epub(self): arguments.error("invalid option: `--no-cookies` is valid only if you use the `--cred` option") SafariBooks(args_parsed) + # Hint: do you want to download more than one book at once? Initialize more than one instance of `SafariBooks`... sys.exit(0)