Working on #150

Fixes #109. Closing #152, #153 and #154. Working on #150. Thanks to: @elrob, @McPatate and @cgimenes.
lorenzodifuccia · Nov 21, 2019 · 32bbe89 · 32bbe89
1 parent 31d7dc9
commit 32bbe89
Showing 1 changed file with 86 additions and 78 deletions.
diff --git a/safaribooks.py b/safaribooks.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # coding: utf-8
+import re
 import os
 import sys
 import json
@@ -9,11 +10,11 @@
 import argparse
 import requests
 import traceback
-from lxml import html, etree
 from html import escape
 from random import random
+from lxml import html, etree
 from multiprocessing import Process, Queue, Value
-from urllib.parse import urljoin, urlsplit, urlparse
+from urllib.parse import urljoin, urlparse, parse_qs, quote_plus
 
 
 PATH = os.path.dirname(os.path.realpath(__file__))
@@ -27,6 +28,7 @@
 ORLY_BASE_URL = "https://www." + ORLY_BASE_HOST
 SAFARI_BASE_URL = "https://" + SAFARI_BASE_HOST
 API_ORIGIN_URL = "https://" + API_ORIGIN_HOST
+PROFILE_URL = SAFARI_BASE_URL + "/profile/"
 
 
 class Display:
@@ -75,7 +77,11 @@ def unregister(self):
         sys.excepthook = sys.__excepthook__
 
     def log(self, message):
-        self.logger.info(str(message))  # TODO: "utf-8", "replace"
+        try:
+            self.logger.info(str(message, "utf-8", "replace"))
+
+        except (UnicodeDecodeError, Exception):
+            self.logger.info(message)
 
     def out(self, put):
         pattern = "\r{!s}\r{!s}\n"
@@ -195,7 +201,7 @@ def api_error(response):
 
         else:
             os.remove(COOKIES_FILE)
-            message += "Out-of-Session%s.\n" % (" (%s)" % response["detail"]) if "detail" in response else "" +\
+            message += "Out-of-Session%s.\n" % (" (%s)" % response["detail"]) if "detail" in response else "" + \
                        Display.SH_YELLOW + "[+]" + Display.SH_DEFAULT + \
                        " Use the `--cred` or `--login` options in order to perform the auth login to Safari."
 
@@ -216,20 +222,6 @@ class SafariBooks:
 
     API_TEMPLATE = SAFARI_BASE_URL + "/api/v1/book/{0}/"
 
-    HEADERS = {
-        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
-        "accept-encoding": "gzip, deflate",
-        "accept-language": "it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7",
-        "cache-control": "no-cache",
-        "cookie": "",
-        "pragma": "no-cache",
-        "origin": SAFARI_BASE_URL,
-        "referer": LOGIN_ENTRY_URL,
-        "upgrade-insecure-requests": "1",
-        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-                      "Chrome/60.0.3112.113 Safari/537.36"
-    }
-
     BASE_01_HTML = "<!DOCTYPE html>\n" \
                    "<html lang=\"en\" xml:lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"" \
                    " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"" \
@@ -263,7 +255,7 @@ class SafariBooks:
     CONTENT_OPF = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" \
                   "<package xmlns=\"http://www.idpf.org/2007/opf\" unique-identifier=\"bookid\" version=\"2.0\" >\n" \
                   "<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\" " \
-                  " xmlns:opf=\"http://www.idpf.org/2007/opf\">\n"\
+                  " xmlns:opf=\"http://www.idpf.org/2007/opf\">\n" \
                   "<dc:title>{1}</dc:title>\n" \
                   "{2}\n" \
                   "<dc:description>{3}</dc:description>\n" \
@@ -299,26 +291,42 @@ class SafariBooks:
               "<navMap>{4}</navMap>\n" \
               "</ncx>"
 
+    HEADERS = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+        "accept-encoding": "gzip, deflate",
+        "origin": SAFARI_BASE_URL,
+        "referer": LOGIN_ENTRY_URL,
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                      "Chrome/60.0.3112.113 Safari/537.36"
+    }
+
+    COOKIE_FLOAT_MAX_AGE_PATTERN = re.compile(r'(max-age=\d*\.\d*)', re.IGNORECASE)
+
     def __init__(self, args):
         self.args = args
         self.display = Display("info_%s.log" % escape(args.bookid))
         self.display.intro()
 
-        self.cookies = {}
+        self.session = requests.Session()
+        self.session.headers.update(self.HEADERS)
+
         self.jwt = {}
 
         if not args.cred:
             if not os.path.isfile(COOKIES_FILE):
                 self.display.exit("Login: unable to find cookies file.\n"
                                   "    Please use the `--cred` or `--login` options to perform the login.")
 
-            self.cookies = json.load(open(COOKIES_FILE))
+            self.session.cookies.update(json.load(open(COOKIES_FILE)))
 
         else:
             self.display.info("Logging into Safari Books Online...", state=True)
             self.do_login(*args.cred)
             if not args.no_cookies:
-                json.dump(self.cookies, open(COOKIES_FILE, "w"))
+                json.dump(self.session.cookies.get_dict(), open(COOKIES_FILE, 'w'))
+
+        self.check_login()
 
         self.book_id = args.bookid
         self.api_url = self.API_TEMPLATE.format(self.book_id)
@@ -386,45 +394,32 @@ def __init__(self, args):
         self.create_epub()
 
         if not args.no_cookies:
-            json.dump(self.cookies, open(COOKIES_FILE, "w"))
+            json.dump(self.session.cookies.get_dict(), open(COOKIES_FILE, "w"))
 
         self.display.done(os.path.join(self.BOOK_PATH, self.book_id + ".epub"))
         self.display.unregister()
 
         if not self.display.in_error and not args.log:
             os.remove(self.display.log_file)
 
-    def return_cookies(self):
-        return " ".join(["{0}={1};".format(k, v) for k, v in self.cookies.items()])
-
-    def return_headers(self, url):
-        if ORLY_BASE_HOST in urlsplit(url).netloc:
-            self.HEADERS["cookie"] = self.return_cookies()
-
-        else:
-            self.HEADERS["cookie"] = ""
-
-        return self.HEADERS
-
-    def update_cookies(self, jar):
-        for cookie in jar:
-            if cookie.name != 'sessionid':  # TODO
-                self.cookies.update({
-                    cookie.name: cookie.value
-                })
+    def handle_cookie_update(self, set_cookie_headers):
+        for morsel in set_cookie_headers:
+            # Handle Float 'max-age' Cookie
+            if self.COOKIE_FLOAT_MAX_AGE_PATTERN.search(morsel):
+                cookie_key, cookie_value = morsel.split(";")[0].split("=")
+                self.session.cookies.set(cookie_key, cookie_value)
 
-    def requests_provider(
-            self, url, post=False, data=None, perfom_redirect=True, update_cookies=True, update_referer=True, **kwargs
-    ):
+    def requests_provider(self, url, is_post=False, data=None, perform_redirect=True, **kwargs):
         try:
-            response = getattr(requests, "post" if post else "get")(
+            response = getattr(self.session, "post" if is_post else "get")(
                 url,
-                headers=self.return_headers(url),
                 data=data,
                 allow_redirects=False,
                 **kwargs
             )
 
+            self.handle_cookie_update(response.raw.headers.getlist("Set-Cookie"))
+
             self.display.last_request = (
                 url, data, kwargs, response.status_code, "\n".join(
                     ["\t{}: {}".format(*h) for h in response.headers.items()]
@@ -435,16 +430,8 @@ def requests_provider(
             self.display.error(str(request_exception))
             return 0
 
-        if update_cookies:
-            self.update_cookies(response.cookies)
-
-        if update_referer:
-            # TODO Update Referer HTTP Header
-            # TODO How about Origin?
-            self.HEADERS["referer"] = response.request.url
-
-        if response.is_redirect and perfom_redirect:
-            return self.requests_provider(response.next.url, post, None, perfom_redirect, update_cookies, update_referer)
+        if response.is_redirect and perform_redirect:
+            return self.requests_provider(response.next.url, is_post, None, perform_redirect)
             # TODO How about **kwargs?
 
         return response
@@ -468,19 +455,24 @@ def do_login(self, email, password):
         if response == 0:
             self.display.exit("Login: unable to reach Safari Books Online. Try again...")
 
-        redirect_uri = response.request.path_url[response.request.path_url.index("redirect_uri"):]  # TODO try...catch
-        redirect_uri = redirect_uri[:redirect_uri.index("&")]
-        redirect_uri = "https://api.oreilly.com%2Fapi%2Fv1%2Fauth%2Fopenid%2Fauthorize%3F" + redirect_uri
+        next_parameter = None
+        try:
+            next_parameter = parse_qs(urlparse(response.request.url).query)["next"][0]
+
+        except (AttributeError, ValueError, IndexError):
+            self.display.exit("Login: unable to complete login on Safari Books Online. Try again...")
+
+        redirect_uri = API_ORIGIN_URL + quote_plus(next_parameter)
 
         response = self.requests_provider(
             self.LOGIN_URL,
-            post=True,
+            is_post=True,
             json={
                 "email": email,
                 "password": password,
                 "redirect_uri": redirect_uri
             },
-            perfom_redirect=False
+            perform_redirect=False
         )
 
         if response == 0:
@@ -492,11 +484,14 @@ def do_login(self, email, password):
                 errors_message = error_page.xpath("//ul[@class='errorlist']//li/text()")
                 recaptcha = error_page.xpath("//div[@class='g-recaptcha']")
                 messages = (["    `%s`" % error for error in errors_message
-                            if "password" in error or "email" in error] if len(errors_message) else []) +\
-                           (["    `ReCaptcha required (wait or do logout from the website).`"] if len(recaptcha) else[])
-                self.display.exit("Login: unable to perform auth login to Safari Books Online.\n" +
-                                  self.display.SH_YELLOW + "[*]" + self.display.SH_DEFAULT + " Details:\n"
-                                  "%s" % "\n".join(messages if len(messages) else ["    Unexpected error!"]))
+                             if "password" in error or "email" in error] if len(errors_message) else []) + \
+                           (["    `ReCaptcha required (wait or do logout from the website).`"] if len(
+                               recaptcha) else [])
+                self.display.exit(
+                    "Login: unable to perform auth login to Safari Books Online.\n" + self.display.SH_YELLOW +
+                    "[*]" + self.display.SH_DEFAULT + " Details:\n" + "%s" % "\n".join(
+                        messages if len(messages) else ["    Unexpected error!"])
+                )
             except (html.etree.ParseError, html.etree.ParserError) as parsing_error:
                 self.display.error(parsing_error)
                 self.display.exit(
@@ -509,6 +504,17 @@ def do_login(self, email, password):
         if response == 0:
             self.display.exit("Login: unable to reach Safari Books Online. Try again...")
 
+    def check_login(self):
+        response = self.requests_provider(PROFILE_URL, perform_redirect=False)
+
+        if response == 0:
+            self.display.exit("Login: unable to reach Safari Books Online. Try again...")
+
+        if response.status_code != 200:
+            self.display.exit("Authentication issue: unable to access profile page.")
+
+        self.display.info("Successfully authenticated.", state=True)
+
     def get_book_info(self):
         response = self.requests_provider(self.api_url)
         if response == 0:
@@ -548,7 +554,7 @@ def get_book_chapters(self, page=1):
         return result + (self.get_book_chapters(page + 1) if response["next"] else [])
 
     def get_default_cover(self):
-        response = self.requests_provider(self.book_info["cover"], update_cookies=False, stream=True)
+        response = self.requests_provider(self.book_info["cover"], stream=True)
         if response == 0:
             self.display.error("Error trying to retrieve the cover: %s" % self.book_info["cover"])
             return False
@@ -765,7 +771,7 @@ def create_dirs(self):
 
     def save_page_html(self, contents):
         self.filename = self.filename.replace(".html", ".xhtml")
-        open(os.path.join(self.BOOK_PATH, "OEBPS", self.filename), "wb")\
+        open(os.path.join(self.BOOK_PATH, "OEBPS", self.filename), "wb") \
             .write(self.BASE_HTML.format(contents[0], contents[1]).encode("utf-8", 'xmlcharrefreplace'))
         self.display.log("Created: %s" % self.filename)
 
@@ -815,7 +821,7 @@ def _thread_download_css(self, url):
                 self.display.css_ad_info.value = 1
 
         else:
-            response = self.requests_provider(url, update_cookies=False)
+            response = self.requests_provider(url)
             if response == 0:
                 self.display.error("Error trying to retrieve this CSS: %s\n    From: %s" % (css_file, url))
 
@@ -838,9 +844,7 @@ def _thread_download_images(self, url):
                 self.display.images_ad_info.value = 1
 
         else:
-            response = self.requests_provider(urljoin(SAFARI_BASE_URL, url),
-                                              update_cookies=False,
-                                              stream=True)
+            response = self.requests_provider(urljoin(SAFARI_BASE_URL, url), stream=True)
             if response == 0:
                 self.display.error("Error trying to retrieve this image: %s\n    From: %s" % (image_name, url))
 
@@ -854,7 +858,7 @@ def _thread_download_images(self, url):
     def _start_multiprocessing(self, operation, full_queue):
         if len(full_queue) > 5:
             for i in range(0, len(full_queue), 5):
-                self._start_multiprocessing(operation, full_queue[i:i+5])
+                self._start_multiprocessing(operation, full_queue[i:i + 5])
 
         else:
             process_queue = [Process(target=operation, args=(arg,)) for arg in full_queue]
@@ -879,7 +883,8 @@ def collect_images(self):
         if self.display.book_ad_info == 2:
             self.display.info("Some of the book contents were already downloaded.\n"
                               "    If you want to be sure that all the images will be downloaded,\n"
-                              "    please delete the output direcotry '" + self.BOOK_PATH + "' and restart the program.")
+                              "    please delete the output direcotry '" + self.BOOK_PATH +
+                              "' and restart the program.")
 
         self.display.state_status.value = -1
 
@@ -1056,21 +1061,23 @@ def create_epub(self):
     args_parsed = arguments.parse_args()
 
     if args_parsed.cred or args_parsed.login:
-        email = ""
+        user_email = ""
         pre_cred = ""
 
         if args_parsed.cred:
             pre_cred = args_parsed.cred
 
         else:
-            email = input("Email: ")
+            user_email = input("Email: ")
             passwd = getpass.getpass("Password: ")
-            pre_cred = email + ":" + passwd
+            pre_cred = user_email + ":" + passwd
 
         parsed_cred = SafariBooks.parse_cred(pre_cred)
 
         if not parsed_cred:
-            arguments.error("invalid credential: %s" % (args_parsed.cred if args_parsed.cred else (email + ":*******")))
+            arguments.error("invalid credential: %s" % (
+                args_parsed.cred if args_parsed.cred else (user_email + ":*******")
+            ))
 
         args_parsed.cred = parsed_cred
 
@@ -1079,4 +1086,5 @@ def create_epub(self):
             arguments.error("invalid option: `--no-cookies` is valid only if you use the `--cred` option")
 
     SafariBooks(args_parsed)
+    # Hint: to download more than one book at once, initialize more than one instance of `SafariBooks`...
     sys.exit(0)