Skip to content

Commit

Permalink
Working on #150
Browse files Browse the repository at this point in the history
Fixes #109 
Closing #152 
Closing #153
Closing #154
Working on #150 

Thanks to: @elrob, @McPatate and @cgimenes
  • Loading branch information
lorenzodifuccia authored Nov 21, 2019
1 parent 31d7dc9 commit 32bbe89
Showing 1 changed file with 86 additions and 78 deletions.
164 changes: 86 additions & 78 deletions safaribooks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
# coding: utf-8
import re
import os
import sys
import json
Expand All @@ -9,11 +10,11 @@
import argparse
import requests
import traceback
from lxml import html, etree
from html import escape
from random import random
from lxml import html, etree
from multiprocessing import Process, Queue, Value
from urllib.parse import urljoin, urlsplit, urlparse
from urllib.parse import urljoin, urlparse, parse_qs, quote_plus


PATH = os.path.dirname(os.path.realpath(__file__))
Expand All @@ -27,6 +28,7 @@
ORLY_BASE_URL = "https://www." + ORLY_BASE_HOST
SAFARI_BASE_URL = "https://" + SAFARI_BASE_HOST
API_ORIGIN_URL = "https://" + API_ORIGIN_HOST
PROFILE_URL = SAFARI_BASE_URL + "/profile/"


class Display:
Expand Down Expand Up @@ -75,7 +77,11 @@ def unregister(self):
sys.excepthook = sys.__excepthook__

def log(self, message):
    """Send *message* to ``self.logger`` at INFO level.

    Bytes-like messages are decoded as UTF-8 with undecodable bytes
    replaced, so a badly encoded payload cannot abort the logging call.
    Any other object is handed to the logger unchanged.

    :param message: text, bytes, or any object the logger can render.
    """
    if isinstance(message, (bytes, bytearray)):
        # `str(obj, encoding, errors)` is only valid for bytes-like input;
        # on a `str` it raises TypeError, so decode only when it applies
        # instead of relying on a catch-all `except` as control flow.
        message = str(message, "utf-8", "replace")

    self.logger.info(message)

def out(self, put):
pattern = "\r{!s}\r{!s}\n"
Expand Down Expand Up @@ -195,7 +201,7 @@ def api_error(response):

else:
os.remove(COOKIES_FILE)
message += "Out-of-Session%s.\n" % (" (%s)" % response["detail"]) if "detail" in response else "" +\
message += "Out-of-Session%s.\n" % (" (%s)" % response["detail"]) if "detail" in response else "" + \
Display.SH_YELLOW + "[+]" + Display.SH_DEFAULT + \
" Use the `--cred` or `--login` options in order to perform the auth login to Safari."

Expand All @@ -216,20 +222,6 @@ class SafariBooks:

API_TEMPLATE = SAFARI_BASE_URL + "/api/v1/book/{0}/"

HEADERS = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"accept-encoding": "gzip, deflate",
"accept-language": "it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7",
"cache-control": "no-cache",
"cookie": "",
"pragma": "no-cache",
"origin": SAFARI_BASE_URL,
"referer": LOGIN_ENTRY_URL,
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/60.0.3112.113 Safari/537.36"
}

BASE_01_HTML = "<!DOCTYPE html>\n" \
"<html lang=\"en\" xml:lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"" \
" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"" \
Expand Down Expand Up @@ -263,7 +255,7 @@ class SafariBooks:
CONTENT_OPF = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" \
"<package xmlns=\"http://www.idpf.org/2007/opf\" unique-identifier=\"bookid\" version=\"2.0\" >\n" \
"<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\" " \
" xmlns:opf=\"http://www.idpf.org/2007/opf\">\n"\
" xmlns:opf=\"http://www.idpf.org/2007/opf\">\n" \
"<dc:title>{1}</dc:title>\n" \
"{2}\n" \
"<dc:description>{3}</dc:description>\n" \
Expand Down Expand Up @@ -299,26 +291,42 @@ class SafariBooks:
"<navMap>{4}</navMap>\n" \
"</ncx>"

# Default request headers; `__init__` installs them on the shared session
# through `self.session.headers.update(self.HEADERS)`.
HEADERS = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"accept-encoding": "gzip, deflate",
"origin": SAFARI_BASE_URL,
"referer": LOGIN_ENTRY_URL,
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/60.0.3112.113 Safari/537.36"
}

# Case-insensitive match for a `max-age` cookie attribute whose value contains
# a decimal point; `handle_cookie_update` uses it to select the Set-Cookie
# headers it stores on the session itself.
COOKIE_FLOAT_MAX_AGE_PATTERN = re.compile(r'(max-age=\d*\.\d*)', re.IGNORECASE)

def __init__(self, args):
self.args = args
self.display = Display("info_%s.log" % escape(args.bookid))
self.display.intro()

self.cookies = {}
self.session = requests.Session()
self.session.headers.update(self.HEADERS)

self.jwt = {}

if not args.cred:
if not os.path.isfile(COOKIES_FILE):
self.display.exit("Login: unable to find cookies file.\n"
" Please use the `--cred` or `--login` options to perform the login.")

self.cookies = json.load(open(COOKIES_FILE))
self.session.cookies.update(json.load(open(COOKIES_FILE)))

else:
self.display.info("Logging into Safari Books Online...", state=True)
self.do_login(*args.cred)
if not args.no_cookies:
json.dump(self.cookies, open(COOKIES_FILE, "w"))
json.dump(self.session.cookies.get_dict(), open(COOKIES_FILE, 'w'))

self.check_login()

self.book_id = args.bookid
self.api_url = self.API_TEMPLATE.format(self.book_id)
Expand Down Expand Up @@ -386,45 +394,32 @@ def __init__(self, args):
self.create_epub()

if not args.no_cookies:
json.dump(self.cookies, open(COOKIES_FILE, "w"))
json.dump(self.session.cookies.get_dict(), open(COOKIES_FILE, "w"))

self.display.done(os.path.join(self.BOOK_PATH, self.book_id + ".epub"))
self.display.unregister()

if not self.display.in_error and not args.log:
os.remove(self.display.log_file)

def return_cookies(self):
return " ".join(["{0}={1};".format(k, v) for k, v in self.cookies.items()])

def return_headers(self, url):
if ORLY_BASE_HOST in urlsplit(url).netloc:
self.HEADERS["cookie"] = self.return_cookies()

else:
self.HEADERS["cookie"] = ""

return self.HEADERS

def update_cookies(self, jar):
for cookie in jar:
if cookie.name != 'sessionid': # TODO
self.cookies.update({
cookie.name: cookie.value
})
def handle_cookie_update(self, set_cookie_headers):
    """Store cookies that carry a float ``max-age`` on the session.

    Only headers matching ``COOKIE_FLOAT_MAX_AGE_PATTERN`` are handled;
    every other header is left alone.
    NOTE(review): presumably the regular cookie handling does not accept
    a non-integer ``max-age`` - confirm against the HTTP client in use.

    :param set_cookie_headers: iterable of raw ``Set-Cookie`` header values.
    """
    for morsel in set_cookie_headers:
        if not self.COOKIE_FLOAT_MAX_AGE_PATTERN.search(morsel):
            continue

        # The first `;`-separated field is the `name=value` pair. Split on
        # the FIRST `=` only: values such as base64 tokens may contain `=`
        # themselves, which made a plain `split("=")` raise ValueError.
        name, separator, value = morsel.split(";", 1)[0].partition("=")
        if not separator:
            # Malformed header without `name=value`; nothing to store.
            continue

        self.session.cookies.set(name, value)

def requests_provider(
self, url, post=False, data=None, perfom_redirect=True, update_cookies=True, update_referer=True, **kwargs
):
def requests_provider(self, url, is_post=False, data=None, perform_redirect=True, **kwargs):
try:
response = getattr(requests, "post" if post else "get")(
response = getattr(self.session, "post" if is_post else "get")(
url,
headers=self.return_headers(url),
data=data,
allow_redirects=False,
**kwargs
)

self.handle_cookie_update(response.raw.headers.getlist("Set-Cookie"))

self.display.last_request = (
url, data, kwargs, response.status_code, "\n".join(
["\t{}: {}".format(*h) for h in response.headers.items()]
Expand All @@ -435,16 +430,8 @@ def requests_provider(
self.display.error(str(request_exception))
return 0

if update_cookies:
self.update_cookies(response.cookies)

if update_referer:
# TODO Update Referer HTTP Header
# TODO How about Origin?
self.HEADERS["referer"] = response.request.url

if response.is_redirect and perfom_redirect:
return self.requests_provider(response.next.url, post, None, perfom_redirect, update_cookies, update_referer)
if response.is_redirect and perform_redirect:
return self.requests_provider(response.next.url, is_post, None, perform_redirect)
# TODO How about **kwargs?

return response
Expand All @@ -468,19 +455,24 @@ def do_login(self, email, password):
if response == 0:
self.display.exit("Login: unable to reach Safari Books Online. Try again...")

redirect_uri = response.request.path_url[response.request.path_url.index("redirect_uri"):] # TODO try...catch
redirect_uri = redirect_uri[:redirect_uri.index("&")]
redirect_uri = "https://api.oreilly.com%2Fapi%2Fv1%2Fauth%2Fopenid%2Fauthorize%3F" + redirect_uri
next_parameter = None
try:
next_parameter = parse_qs(urlparse(response.request.url).query)["next"][0]

except (AttributeError, ValueError, IndexError):
self.display.exit("Login: unable to complete login on Safari Books Online. Try again...")

redirect_uri = API_ORIGIN_URL + quote_plus(next_parameter)

response = self.requests_provider(
self.LOGIN_URL,
post=True,
is_post=True,
json={
"email": email,
"password": password,
"redirect_uri": redirect_uri
},
perfom_redirect=False
perform_redirect=False
)

if response == 0:
Expand All @@ -492,11 +484,14 @@ def do_login(self, email, password):
errors_message = error_page.xpath("//ul[@class='errorlist']//li/text()")
recaptcha = error_page.xpath("//div[@class='g-recaptcha']")
messages = ([" `%s`" % error for error in errors_message
if "password" in error or "email" in error] if len(errors_message) else []) +\
([" `ReCaptcha required (wait or do logout from the website).`"] if len(recaptcha) else[])
self.display.exit("Login: unable to perform auth login to Safari Books Online.\n" +
self.display.SH_YELLOW + "[*]" + self.display.SH_DEFAULT + " Details:\n"
"%s" % "\n".join(messages if len(messages) else [" Unexpected error!"]))
if "password" in error or "email" in error] if len(errors_message) else []) + \
([" `ReCaptcha required (wait or do logout from the website).`"] if len(
recaptcha) else [])
self.display.exit(
"Login: unable to perform auth login to Safari Books Online.\n" + self.display.SH_YELLOW +
"[*]" + self.display.SH_DEFAULT + " Details:\n" + "%s" % "\n".join(
messages if len(messages) else [" Unexpected error!"])
)
except (html.etree.ParseError, html.etree.ParserError) as parsing_error:
self.display.error(parsing_error)
self.display.exit(
Expand All @@ -509,6 +504,17 @@ def do_login(self, email, password):
if response == 0:
self.display.exit("Login: unable to reach Safari Books Online. Try again...")

def check_login(self):
    """Verify that the current session can reach the profile page.

    The profile URL is requested without following redirects; any status
    other than 200 is reported as an authentication problem.
    """
    profile_page = self.requests_provider(PROFILE_URL, perform_redirect=False)

    # `requests_provider` hands back 0 when the request itself failed.
    if profile_page == 0:
        self.display.exit("Login: unable to reach Safari Books Online. Try again...")

    authenticated = profile_page.status_code == 200
    if not authenticated:
        self.display.exit("Authentication issue: unable to access profile page.")

    self.display.info("Successfully authenticated.", state=True)

def get_book_info(self):
response = self.requests_provider(self.api_url)
if response == 0:
Expand Down Expand Up @@ -548,7 +554,7 @@ def get_book_chapters(self, page=1):
return result + (self.get_book_chapters(page + 1) if response["next"] else [])

def get_default_cover(self):
response = self.requests_provider(self.book_info["cover"], update_cookies=False, stream=True)
response = self.requests_provider(self.book_info["cover"], stream=True)
if response == 0:
self.display.error("Error trying to retrieve the cover: %s" % self.book_info["cover"])
return False
Expand Down Expand Up @@ -765,7 +771,7 @@ def create_dirs(self):

def save_page_html(self, contents):
    """Write one rendered page into the book's ``OEBPS`` directory.

    ``self.filename`` is switched from ``.html`` to ``.xhtml`` first, then
    ``BASE_HTML`` is filled with ``contents[0]`` and ``contents[1]`` and
    written as UTF-8 bytes.

    :param contents: indexable whose first two items fill ``BASE_HTML``.
    """
    self.filename = self.filename.replace(".html", ".xhtml")
    page = self.BASE_HTML.format(contents[0], contents[1])
    target = os.path.join(self.BOOK_PATH, "OEBPS", self.filename)

    # The context manager flushes and closes the handle even when the write
    # fails; the previous bare `open(...).write(...)` never closed it.
    with open(target, "wb") as page_file:
        # Characters UTF-8 cannot encode become XML numeric references.
        page_file.write(page.encode("utf-8", "xmlcharrefreplace"))

    self.display.log("Created: %s" % self.filename)

Expand Down Expand Up @@ -815,7 +821,7 @@ def _thread_download_css(self, url):
self.display.css_ad_info.value = 1

else:
response = self.requests_provider(url, update_cookies=False)
response = self.requests_provider(url)
if response == 0:
self.display.error("Error trying to retrieve this CSS: %s\n From: %s" % (css_file, url))

Expand All @@ -838,9 +844,7 @@ def _thread_download_images(self, url):
self.display.images_ad_info.value = 1

else:
response = self.requests_provider(urljoin(SAFARI_BASE_URL, url),
update_cookies=False,
stream=True)
response = self.requests_provider(urljoin(SAFARI_BASE_URL, url), stream=True)
if response == 0:
self.display.error("Error trying to retrieve this image: %s\n From: %s" % (image_name, url))

Expand All @@ -854,7 +858,7 @@ def _thread_download_images(self, url):
def _start_multiprocessing(self, operation, full_queue):
if len(full_queue) > 5:
for i in range(0, len(full_queue), 5):
self._start_multiprocessing(operation, full_queue[i:i+5])
self._start_multiprocessing(operation, full_queue[i:i + 5])

else:
process_queue = [Process(target=operation, args=(arg,)) for arg in full_queue]
Expand All @@ -879,7 +883,8 @@ def collect_images(self):
if self.display.book_ad_info == 2:
self.display.info("Some of the book contents were already downloaded.\n"
" If you want to be sure that all the images will be downloaded,\n"
" please delete the output direcotry '" + self.BOOK_PATH + "' and restart the program.")
" please delete the output direcotry '" + self.BOOK_PATH +
"' and restart the program.")

self.display.state_status.value = -1

Expand Down Expand Up @@ -1056,21 +1061,23 @@ def create_epub(self):
args_parsed = arguments.parse_args()

if args_parsed.cred or args_parsed.login:
email = ""
user_email = ""
pre_cred = ""

if args_parsed.cred:
pre_cred = args_parsed.cred

else:
email = input("Email: ")
user_email = input("Email: ")
passwd = getpass.getpass("Password: ")
pre_cred = email + ":" + passwd
pre_cred = user_email + ":" + passwd

parsed_cred = SafariBooks.parse_cred(pre_cred)

if not parsed_cred:
arguments.error("invalid credential: %s" % (args_parsed.cred if args_parsed.cred else (email + ":*******")))
arguments.error("invalid credential: %s" % (
args_parsed.cred if args_parsed.cred else (user_email + ":*******")
))

args_parsed.cred = parsed_cred

Expand All @@ -1079,4 +1086,5 @@ def create_epub(self):
arguments.error("invalid option: `--no-cookies` is valid only if you use the `--cred` option")

SafariBooks(args_parsed)
# Hint: to download more than one book at once, initialize more than one instance of `SafariBooks`...
sys.exit(0)

0 comments on commit 32bbe89

Please sign in to comment.