-
Notifications
You must be signed in to change notification settings - Fork 0
/
objects.py
87 lines (74 loc) · 2.85 KB
/
objects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import re
import requests
from bs4 import BeautifulSoup
from urlparse import urlparse
from utils import check_if_path_exists
from utils import get_background_url_from_tag
from utils import get_logger
# Module-level logger, obtained from the project's `utils.get_logger` helper.
logger = get_logger()
class Album:
    """An album page replayed from a web.archive.org snapshot.

    Constructing an ``Album`` immediately derives its id from ``url`` and
    performs one HTTP request to scrape the title, creation date and image
    list from the archived page.

    Attributes:
        url: Snapshot-relative path of the album page (it is appended to
            ``http://web.archive.org``).
        id: Trailing alphanumeric path segment of ``url``.
        title: Text of the ``<h1>`` inside ``div.header``.
        date: Text of the last ``<li>`` in ``ul.meta-info`` with the
            ``'Album created: '`` label removed.
        images: List of ``Image`` objects, one per ``a.thumb`` anchor.
    """

    # Seconds to wait on the archive before giving up instead of hanging
    # forever (requests has no default timeout).
    _REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Build the album from ``url`` and scrape its metadata right away.

        Raises:
            ValueError: if no id can be extracted from ``url`` or the page
                lacks the expected title/date markup.
        """
        self.url = url
        self.id = ''
        self.title = ''
        self.date = ''
        self.images = []

        self.set_id_from_url()
        self.set_images_and_metadata()

    def _archive_url(self, path):
        """Return the absolute web.archive.org URL for a snapshot path."""
        return 'http://web.archive.org{0}'.format(path)

    def set_id_from_url(self):
        """Set ``self.id`` to the last alphanumeric path segment of the URL.

        Raises:
            ValueError: if ``self.url`` does not end in ``/<alphanumerics>``.
        """
        match = re.search(r'/([A-Za-z0-9]+)$', self.url)
        if match is None:
            raise ValueError(
                'cannot extract album id from url: {0!r}'.format(self.url))
        self.id = match.group(1)

    def set_images_and_metadata(self):
        """Fetch the archived album page and fill title, date and images.

        ``self.images`` is rebuilt from scratch, so calling this method more
        than once does not accumulate duplicates.

        Raises:
            ValueError: if the page has no ``div.header > h1`` or no
                ``<li>`` inside ``ul.meta-info`` (e.g. an error page).
        """
        url = self._archive_url(self.url)
        r = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        soup = BeautifulSoup(r.text, 'html.parser')

        # set title
        header = soup.find('div', {'class': 'header'})
        if header is None or header.h1 is None:
            raise ValueError(
                'no album title found at {0!r} (HTTP {1})'.format(
                    url, r.status_code))
        self.title = header.h1.get_text()

        # set date
        meta_info = soup.find('ul', {'class': 'meta-info'})
        items = meta_info.find_all('li') if meta_info is not None else []
        if not items:
            raise ValueError(
                'no album date found at {0!r} (HTTP {1})'.format(
                    url, r.status_code))
        self.date = items[-1].get_text().replace('Album created: ', '')

        # set images
        # get_background_url_from_tag is a project helper from utils; it
        # presumably reads the thumbnail URL from the tag's background
        # style -- TODO confirm against utils.
        self.images = [
            Image(get_background_url_from_tag(tag))
            for tag in soup.find_all('a', {'class': 'thumb'})
        ]

        # Lazy %-style arguments: on Python 2, str.format() with a non-ASCII
        # unicode title raises UnicodeEncodeError.
        logger.info('added %s photos to album "%s"',
                    len(self.images), self.title)

    def save_images(self):
        """Write an info file and download every image of the album.

        Files go to ``output/<album id>/``: ``album-info.txt`` holds the
        title, photo count, date and archive URL (one per line, UTF-8
        encoded), followed by one file per image named ``image.filename``.
        Images whose request does not return HTTP 200 are skipped with a
        warning.
        """
        folder = 'output/{0}'.format(self.id)
        infofile = '{0}/album-info.txt'.format(folder)
        # Project helper from utils; presumably makes sure the target
        # directory exists -- TODO confirm against utils.
        check_if_path_exists(infofile)

        # Build the text as unicode and encode explicitly so non-ASCII
        # titles work on Python 2 and the binary-mode write works on
        # Python 3.
        info = u'{title}\n{count}\n{date}\n{url}'.format(
            title=self.title,
            count=u'{0} photos'.format(len(self.images)),
            date=self.date,
            url=self._archive_url(self.url),
        )
        with open(infofile, 'wb') as f:
            f.write(info.encode('utf-8'))

        for image in self.images:
            r = requests.get(self._archive_url(image.url),
                             timeout=self._REQUEST_TIMEOUT)
            if r.status_code != 200:
                logger.warning('skipping image %s (HTTP %s)',
                               image.url, r.status_code)
                continue

            # The body is already fully downloaded (no streaming), so write
            # it in one call rather than in many tiny chunks.
            with open('{0}/{1}'.format(folder, image.filename), 'wb') as f:
                f.write(r.content)
class Image:
    """A single photo, derived from the archived URL of its thumbnail.

    Attributes:
        thumb_url: Snapshot-relative thumbnail URL as given to the
            constructor.
        url: Snapshot-relative URL of the ``_ph.jpg`` variant, computed by
            ``set_url_from_thumb``.
        filename: Last path component of ``url``; used as the local file
            name.
    """

    def __init__(self, thumb_url):
        """Store ``thumb_url`` and derive ``url`` and ``filename`` from it.

        Raises:
            ValueError: if ``thumb_url`` is not a recognised thumbnail URL.
        """
        self.thumb_url = thumb_url
        self.set_url_from_thumb()
        self.filename = os.path.split(urlparse(self.url).path)[-1]

    def set_url_from_thumb(self):
        """Translate the thumbnail URL into the ``_ph.jpg`` image URL.

        The thumbnail URL must look like
        ``/web/<snapshot>/http://thumb<N>.webshots.net/t/<digits><path>_th.jpg``;
        ``<digits>`` selects the ``image<digits>.webshots.com`` host and
        ``<path>`` is reused unchanged.

        Raises:
            ValueError: if ``self.thumb_url`` does not match that shape.
        """
        # Raw strings keep the regex escapes intact; the dots are escaped so
        # they only match literal dots in the host name and extension.
        pattern = (r'/web/(?:[a-z0-9_]+)/http://thumb\d+?\.webshots\.net/t/'
                   r'(\d+)([/A-Za-z0-9]+)_th\.jpg')
        match = re.search(pattern, self.thumb_url)
        if match is None:
            raise ValueError(
                'unrecognised thumbnail url: {0!r}'.format(self.thumb_url))

        # NOTE(review): the template uses "http:/" with a single slash and
        # the captured path normally starts with "/", yielding ".com//...".
        # Kept byte-for-byte as the original; confirm the archive accepts it.
        url = '/web/20121108151015im_/' \
              'http:/image{0}.webshots.com/{1}_ph.jpg'
        self.url = url.format(*match.groups())