# haaretz_images.recipe
# vim:fileencoding=utf-8
"""
A Calibre recipe for http://www.haaretz.co.il
"""

__license__ = 'GPL v3'
__copyright__ = '2015, Guy Rutenberg (http://www.guyrutenberg.com/contact-me)'

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
import re
import urllib

class HaaretzRecipe(BasicNewsRecipe):
    """Calibre news recipe for Haaretz (http://www.haaretz.co.il).

    Fetches the feeds listed in ``feeds``, signs in through the SSO endpoint
    using the recipe's username/password, cleans up the article markup and,
    when ``qr_code`` is true, replaces embedded videos with QR codes and
    appends a QR code of the article URL.
    """

    title = u'הארץ'
    __author__ = "Guy Rutenberg"
    language = 'he'
    max_articles_per_feed = 100
    auto_cleanup = False
    use_embedded_content = False
    ignore_duplicate_articles = {'url'}

    # Uncomment/comment feeds based on your liking.
    # NOTE(review): the last two commented-out entries share the same URL;
    # one of them is probably wrong -- confirm before enabling.
    feeds = [
        (u'סוף השבוע', 'http://www.haaretz.co.il/cmlink/%D7%A1%D7%95%D7%A3-%D7%A9%D7%91%D7%95%D7%A2-1.1475630'),
        (u'מאמרים ודעות', 'http://www.haaretz.co.il/cmlink/1.1472481'),
        (u'מגזין', 'http://www.haaretz.co.il/cmlink/1.1617677'),
        # (u'גלריה', 'http://www.haaretz.co.il/cmlink/1.1472287'),
        (u'מדע וסביבה', 'http://www.haaretz.co.il/cmlink/1.1472258'),
        # (u'ספרים', 'http://www.haaretz.co.il/cmlink/1.1478711'),
        # (u'קפטן אינטרנט', 'http://www.haaretz.co.il/cmlink/1.1478711'),
    ]

    # If True, embedded videos are replaced by a QR code of the video URL
    # and a QR code with the article's URL is added at the end of each
    # article.
    qr_code = True

    extra_css = """
        body {direction:rtl;}
        .imageInfo .imageText {font-size: 0.8em;}
        .imageInfo .credit {font-size: 0.7em;}"""
    timefmt = '%d/%m/%Y'

    no_stylesheets = True
    encoding = 'utf-8'
    keep_only_tags = [
        dict(name='article', attrs={'class': 'article  has-l-fixed-column'}),
        dict(name='article', attrs={'class': None}),
        # Keeps the QR-code paragraphs injected by preprocess_raw_html.
        dict(name='p', attrs={'class':'qr-code'}),
    ]

    remove_tags = [
        dict(name='nav'),
        dict(name='aside'),
        dict(name='div', attrs={'class': 'art__tools'}),
        dict(name='div', attrs={'class': 'bg h-group'}), # Magazine pages
        dict(name='div', attrs={'class': 'rel-art'}),
        dict(name='footer'),
    ]

    # Markup stripped from the raw page, in this order, before it is parsed:
    # conditional comments, ordinary comments, scripts and inline styles.
    _strip_patterns = (
        re.compile(r'<!--\[if.*?if\]-->', re.DOTALL),
        re.compile(r'<!--.*?-->', re.DOTALL),
        re.compile(r'<script.*?</script>', re.DOTALL),
        re.compile(r'<style.*?</style>', re.DOTALL),
    )

    # Class values containing "video"/"innerVideo" followed by whitespace or
    # the end of the value.  ("$" inside a character set is a literal dollar
    # sign, so the alternation is spelled out explicitly.)
    _video_class_re = re.compile(r'(video|innerVideo)(?:\s|$)')

    def preprocess_html(self, soup):
        """Simplify the parsed article and return the same soup.

        Replaces the ``l-article__entry`` div with the ``article__entry``
        section (dropping comments and other surrounding content), and
        replaces every ``<picture>`` element with its ``<img>``, pointing the
        image at the ``srcset`` of the ``<source>`` whose ``media`` attribute
        is empty.  Elements that are missing are skipped instead of raising.
        """
        # Remove comments and stuff
        section = soup.find('section', attrs={'class': 'article__entry'})
        article_div = soup.find('div', attrs={'class': 'l-article__entry'})
        if section is not None and article_div is not None:
            article_div.replaceWith(section)

        for picture in soup.findAll('picture'):
            img = picture.img
            if img is None:
                # Nothing to promote; leave the element untouched.
                continue
            source = picture.find('source', attrs={'media': ''})
            if source is not None and source.get('srcset'):
                img['src'] = source['srcset']
            picture.replaceWith(img)
        return soup

    # Add a QR-code for sharing
    @staticmethod
    def get_qr_code_img(data, size='450x450'):
        """Return an ``<img>`` tag whose source is a QR code encoding *data*.

        *size* is passed to the QR service as the requested image size; the
        tag itself is styled to render at 3cm x 3cm.
        """
        # The URL of the http://goqr.me/ service
        qr_api_url = 'https://api.qrserver.com/v1/create-qr-code/?'
        qr_img = Tag(BeautifulSoup(), 'img')
        parameters = urllib.urlencode({'data': data, 'size': size})
        qr_img['src'] = qr_api_url + parameters
        qr_img['style'] = 'width: 3cm; height: 3cm;'
        return qr_img


    # The \s is needed because Haaretz embeds each video 3 times, the
    # first 2 times are broken.
    # (Not referenced by the other members of this class.)
    video_regex = re.compile(r'\s<iframe.*src="(?P<url>[^"]*)".*</iframe>')

    def preprocess_raw_html(self, raw_html, url):
        """Clean the raw page markup before it is parsed.

        The HTML dom tree is broken, we need to fix it: BOM markers,
        comments, scripts and inline styles are removed.  When ``qr_code``
        is true the markup is additionally parsed so that video containers
        are replaced by a QR code of the video URL and a QR code of *url* is
        appended to the body; the prettified markup is returned in that case.
        """
        # remove BOM markers
        raw_html = raw_html.replace('\xef\xbb\xbf','')
        for pattern in self._strip_patterns:
            raw_html = pattern.sub("", raw_html)

        if not self.qr_code:
            return raw_html

        soup = BeautifulSoup(raw_html)
        # replace videos with QR code
        for div in soup.findAll('div', attrs={'class': self._video_class_re}):
            iframe = div.iframe
            src = iframe.get('src') if iframe is not None else None
            if not src:
                # No usable iframe in this container; nothing to encode.
                continue
            # Only protocol-relative URLs ("//host/...") need a scheme.
            if src.startswith('//'):
                video_url = 'http:' + src
            else:
                video_url = src
            qr_img = self.get_qr_code_img(video_url)
            div.replaceWith('<p class="qr-code" style="text-align:center;">Video<br/>%s</p>' % qr_img)

        # Add QR code with article's URL
        if soup.body is not None:
            qr_img = self.get_qr_code_img(url)
            soup.body.append('<p class="qr-code" style="text-align:center;">Share article<br/>%s</p>' % qr_img)
        return soup.prettify()


    needs_subscription = True
    def get_browser(self):
        """Return a browser, signed in when credentials are available.

        Opens the site's front page, then posts the username/password to the
        SSO sign-in endpoint and visits every https URL found in its response.

        Raises RuntimeError when the sign-in response does not start with the
        expected success prefix.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.haaretz.co.il')
        if self.username is None or self.password is None:
            return br

        data = urllib.urlencode({
            'userName': self.username,
            'password': self.password,
            'newsso': 'true',
            'fromlogin': 'true',
            'layer': 'login',
            'site': '80',
        })
        # The login page returns a list of URLs that need to be visited
        # in order to set the authentication cookies for each domain.
        res = br.open('https://sso.themarker.com/sso/sso/signIn', data)
        response = res.read()
        response_prefix = "null('success',"
        if not response.startswith(response_prefix):
            raise RuntimeError("Can't sign in. Verify that the username and password are correct.")
        for url in re.findall(r'https://[^"]*', response):
            br.open(url)
        return br