-
Notifications
You must be signed in to change notification settings - Fork 0
/
haaretz_images.recipe
140 lines (120 loc) · 5.32 KB
/
haaretz_images.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# vim:fileencoding=utf-8
"""
A Calibre recipe for http://www.haaretz.co.il
"""
__license__ = 'GPL v3'
__copyright__ = '2015, Guy Rutenberg (http://www.guyrutenberg.com/contact-me)'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
import re
import urllib
class HaaretzRecipe(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the Haaretz (haaretz.co.il) feeds.

    The recipe logs in through the SSO endpoint used in ``get_browser``,
    strips comments/scripts/styles from the raw pages, optionally replaces
    embedded videos with QR codes and appends a QR code pointing at the
    article itself.

    NOTE(review): the code uses ``urllib.urlencode``, which is the Python 2
    location of that function -- confirm the targeted calibre/Python version
    before porting.
    """

    title = u'הארץ'
    __author__ = "Guy Rutenberg"
    language = 'he'
    max_articles_per_feed = 100
    auto_cleanup = False
    use_embedded_content = False
    ignore_duplicate_articles = {'url'}

    # Uncomment/comment feeds based on your liking.
    feeds = [
        (u'סוף השבוע', 'http://www.haaretz.co.il/cmlink/%D7%A1%D7%95%D7%A3-%D7%A9%D7%91%D7%95%D7%A2-1.1475630'),
        (u'מאמרים ודעות', 'http://www.haaretz.co.il/cmlink/1.1472481'),
        (u'מגזין', 'http://www.haaretz.co.il/cmlink/1.1617677'),
        # (u'גלריה', 'http://www.haaretz.co.il/cmlink/1.1472287'),
        (u'מדע וסביבה', 'http://www.haaretz.co.il/cmlink/1.1472258'),
        # (u'ספרים', 'http://www.haaretz.co.il/cmlink/1.1478711'),
        # (u'קפטן אינטרנט', 'http://www.haaretz.co.il/cmlink/1.1478711'),
    ]

    # If True, it will add a QR code with the article's URL at the
    # end of each article.
    qr_code = True

    extra_css = """
body {direction:rtl;}
.imageInfo .imageText {font-size: 0.8em;}
.imageInfo .credit {font-size: 0.7em;}"""
    timefmt = '%d/%m/%Y'
    no_stylesheets = True
    encoding = 'utf-8'

    keep_only_tags = [
        dict(name='article', attrs={'class': 'article has-l-fixed-column'}),
        dict(name='article', attrs={'class': None}),
        # Keeps the QR-code paragraphs injected by preprocess_raw_html().
        dict(name='p', attrs={'class':'qr-code'}),
    ]
    remove_tags = [
        dict(name='nav'),
        dict(name='aside'),
        dict(name='div', attrs={'class': 'art__tools'}),
        dict(name='div', attrs={'class': 'bg h-group'}),  # Magazine pages
        dict(name='div', attrs={'class': 'rel-art'}),
        dict(name='footer'),
    ]

    def preprocess_html(self, soup):
        """Simplify the parsed article before conversion.

        Replaces the ``div.l-article__entry`` wrapper with the
        ``section.article__entry`` element and flattens every ``<picture>``
        into its plain ``<img>``, taking the image URL from the
        ``<source media="">`` entry when there is one.

        :param soup: parsed article document.
        :return: the same ``soup`` object, modified in place.
        """
        # Remove comments and stuff
        section = soup.find('section', attrs={'class': 'article__entry'})
        article_div = soup.find('div', attrs={'class': 'l-article__entry'})
        # find() yields None when nothing matches; skip the swap instead of
        # raising AttributeError and losing the whole article.
        if section is not None and article_div is not None:
            article_div.replaceWith(section)

        for picture in soup.findAll('picture'):
            img = picture.img
            if img is None:
                # Nothing to flatten into; leave the element untouched.
                continue
            source = picture.find('source', attrs={'media': ''})
            # Only override src when the expected <source> actually exists;
            # otherwise keep whatever src the <img> already carries.
            if source is not None and source.get('srcset'):
                img['src'] = source['srcset']
            picture.replaceWith(img)
        return soup

    # Add a QR-code for sharing
    @staticmethod
    def get_qr_code_img(data, size='450x450'):
        """Build an ``<img>`` tag that renders ``data`` as a QR code.

        :param data: text (here: a URL) to encode in the QR code.
        :param size: value of the ``size`` query parameter sent to the
            QR service.
        :return: a ``Tag`` whose ``src`` is the QR service URL with the
            url-encoded parameters appended.
        """
        # The URL of the http://goqr.me/ service
        qr_api_url = 'https://api.qrserver.com/v1/create-qr-code/?'
        qr_img = Tag(BeautifulSoup(), 'img')
        parameters = urllib.urlencode({'data': data, 'size': size})
        qr_img['src'] = qr_api_url + parameters
        qr_img['style'] = 'width: 3cm; height: 3cm;'
        return qr_img

    # The \s is needed because Haaretz embeds each video 3 times, the
    # first 2 times are broken.
    # (This pattern is not referenced by the methods of this class.)
    video_regex = re.compile(r'\s<iframe.*src="(?P<url>[^"]*)".*</iframe>')

    def preprocess_raw_html(self, raw_html, url):
        """The HTML dom tree is broken, we need to fix it.

        Strips BOM markers, conditional comments, comments, ``<script>`` and
        ``<style>`` blocks from the raw markup.  When ``qr_code`` is enabled
        the markup is additionally parsed so that matching video ``<div>``
        elements are replaced by a QR code of the video URL, and a QR code
        of the article URL is appended to ``<body>``.

        :param raw_html: the downloaded page markup.
        :param url: the URL of the article being processed.
        :return: the cleaned markup (re-serialised with ``prettify()`` when
            ``qr_code`` is enabled).
        """
        # remove BOM markers
        raw_html = raw_html.replace('\xef\xbb\xbf','')
        regexs = [
            re.compile(r'<!--\[if.*?if\]-->', re.DOTALL),
            re.compile(r'<!--.*?-->', re.DOTALL),
            re.compile(r'<script.*?</script>', re.DOTALL),
            re.compile(r'<style.*?</style>', re.DOTALL),
        ]
        for r in regexs:
            raw_html = r.sub("", raw_html)

        if self.qr_code:
            soup = BeautifulSoup(raw_html)
            # replace videos with QR code
            # NOTE(review): inside a character class `$` is a literal dollar
            # sign, so this pattern requires whitespace (or '$') right after
            # the class name and never matches at end-of-string.  Left as is;
            # confirm whether `(\s|$)` was intended.
            video_class = re.compile(r'(video|innerVideo)[\s$]')
            for div in soup.findAll('div', attrs={'class': video_class}):
                iframe = div.iframe
                if iframe is None or not iframe.get('src'):
                    # No embedded player to point at; leave the div alone
                    # rather than aborting the article.
                    continue
                video_url = iframe['src']
                # Only add a scheme when the src does not already have one,
                # so absolute URLs are not turned into "http:https://...".
                if not video_url.startswith(('http:', 'https:')):
                    video_url = 'http:' + video_url
                qr_img = self.get_qr_code_img(video_url)
                div.replaceWith('<p class="qr-code" style="text-align:center;">Video<br/>%s</p>' % qr_img)

            # Add QR code with article's URL
            if soup.body is not None:
                qr_img = self.get_qr_code_img(url)
                soup.body.append('<p class="qr-code" style="text-align:center;">Share article<br/>%s</p>' % qr_img)
            raw_html = soup.prettify()
        return raw_html

    needs_subscription = True

    def get_browser(self):
        """Return a browser, signed in when credentials are configured.

        Opens the site front page first.  If both ``username`` and
        ``password`` are set, posts them to the SSO sign-in endpoint and then
        visits every ``https://`` URL found in the response.

        :return: the browser object obtained from ``BasicNewsRecipe``.
        :raises RuntimeError: if the sign-in response does not start with the
            expected success prefix.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.haaretz.co.il')
        if self.username is not None and self.password is not None:
            data = urllib.urlencode({
                'userName': self.username,
                'password': self.password,
                'newsso': 'true',
                'fromlogin': 'true',
                'layer': 'login',
                'site': '80',
            })
            # The login page returns a list of URLs that need to be visited
            # in order to set the authentication cookies for each domain.
            res = br.open('https://sso.themarker.com/sso/sso/signIn', data)
            response = res.read()
            response_prefix = "null('success',"
            if not response.startswith(response_prefix):
                raise RuntimeError("Can't sign in. Verify that the username and password are correct.")
            for url in re.findall(r'https://[^"]*', response):
                br.open(url)
        return br