-
Notifications
You must be signed in to change notification settings - Fork 3
/
warcunpack_ia.py
executable file
·217 lines (159 loc) · 7.36 KB
/
warcunpack_ia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python
"""warcunpack_ia - unpack the content of WARC response records into a directory tree and write an index log"""
import os
import sys
import os.path
import uuid
import mimetypes
import shlex
from optparse import OptionParser
from contextlib import closing
from urlparse import urlparse
from hanzo.warctools import ArchiveRecord, WarcRecord
from hanzo.httptools import RequestMessage, ResponseMessage
# Make sure text/javascript maps to a usable extension.  The extension must
# carry its leading dot: mimetypes stores and returns extensions verbatim, so
# registering 'js' would make guess_extension() yield 'js' and produce file
# names such as "scriptjs".
mimetypes.add_type('text/javascript', '.js')

# Command-line interface.  Positional arguments are WARC file names; when none
# are given the archive is read from standard input (see main()).
parser = OptionParser(usage="%prog [options] [warc ...]")
parser.add_option("-D", "--default-name", dest="default_name",
                  help="file name to use for URLs that do not end in one")
parser.add_option("-o", "--output", dest="output",
                  help="directory to unpack into (created if missing; "
                       "defaults to the current directory)")
parser.add_option("-l", "--log", dest="log_file",
                  help="index log file to write")
parser.add_option("-W", "--wayback_prefix", dest="wayback",
                  help="prefix used when building wayback URIs for the index log")
parser.set_defaults(output=None, log_file=None, default_name='crawlerdefault', wayback="http://wayback.archive-it.org/")
def log_headers(log_file):
    """Write the tab-separated column header line of the index log to *log_file*."""
    header = '>>warc_file\twarc_id\twarc_type\twarc_content_length\twarc_uri_date\twarc_subject_uri\turi_content_type\toutfile\twayback_uri'
    log_file.write(header + '\n')
def log_entry(log_file, input_file, record, content_type, output_file, wayback_uri):
    """Append one tab-separated row to the index log.

    The row holds, in order: the input file name, the record's id, type,
    content length, date and url, the content type, the output file path and
    the wayback URI.  Every value is converted with str().
    """
    columns = [
        input_file,
        record.id,
        record.type,
        record.content_length,
        record.date,
        record.url,
        content_type,
        output_file,
        wayback_uri,
    ]
    row = "\t".join(map(str, columns))
    log_file.write(row + "\n")
def main(argv):
    """Unpack the WARC files named on the command line (or stdin) to disk.

    argv is the full argument vector including the program name.  Response
    bodies are written below the output directory (``-o``, default: current
    directory) and an index log is produced:

    * with ``-l FILE`` a single log is written to FILE for all inputs;
    * otherwise each input gets ``<output>/<warc basename>.index.txt``, and
      input read from stdin is logged to stdout.

    Errors raised while handling one named input are reported on stderr and
    processing continues with the next input.  Always returns 0.
    """
    (options, args) = parser.parse_args(args=argv[1:])

    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir = options.output
    else:
        output_dir = os.getcwd()

    # An explicit log file is opened exactly once and shared by every input.
    # Re-opening it in 'wb' mode per input would truncate the entries already
    # written for the previous inputs.
    shared_log = None
    if options.log_file:
        shared_log = open(options.log_file, 'wb')
        log_headers(shared_log)

    collisions = 0
    try:
        if not args:
            if shared_log is not None:
                log_file = shared_log
            else:
                log_file = sys.stdout
                log_headers(log_file)
            with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
                collisions += unpack_records('<stdin>', fh, output_dir, options.default_name, log_file, options.wayback)
        else:
            for filename in args:
                if shared_log is not None:
                    log_file = shared_log
                else:
                    index_path = os.path.join(output_dir, os.path.basename(filename) + '.index.txt')
                    log_file = open(index_path, 'wb')
                    log_headers(log_file)
                try:
                    with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                        collisions += unpack_records(filename, fh, output_dir, options.default_name, log_file, options.wayback)
                except StandardError as e:
                    sys.stderr.write("exception in handling %s %s\n" % (filename, e))
                finally:
                    # Per-input index files are owned by this iteration; the
                    # shared log is closed once, after all inputs.
                    if log_file is not shared_log:
                        log_file.close()
    finally:
        if shared_log is not None:
            shared_log.close()

    if collisions:
        sys.stderr.write("%s filenames that collided\n" % collisions)

    return 0
def unpack_records(name, fh, output_dir, default_name, output_log, wayback_prefix):
    """Write the body of every successful HTTP response in an archive to disk.

    name            -- label of the archive (file name or '<stdin>'); used in
                       log rows, error messages and as a collection id fallback
    fh              -- open archive; must provide read_records(limit=None)
                       yielding (offset, record, errors) tuples
    output_dir      -- root directory for unpacked files
    default_name    -- file name used for URLs that do not end in one
    output_log      -- writable object receiving one index row per file written
    wayback_prefix  -- prefix for the wayback URI recorded in the index

    A collection id is taken from a ``collectionId=...`` entry in the
    warcinfo ``description`` field; failing that, from the second
    dash-separated part of the WARC-Filename header, or of *name* when that
    header is absent.  When no collection id is known the wayback URI column
    is left empty.

    Exceptions raised while handling a single record are printed to stderr
    (with traceback) and the record is skipped.  Returns the number of output
    file names that collided with an existing file.
    """
    collectionId = ''
    collisions = 0
    for (offset, record, errors) in fh.read_records(limit=None):
        if not record:
            if errors:
                details = " ".join(str(e) for e in errors)
                sys.stderr.write("warc errors at %s:%d %s\n" % (name, offset if offset else 0, details))
            continue

        try:
            content_type = record.content[0]

            if record.type == WarcRecord.WARCINFO:
                info = parse_warcinfo(record)
                for entry in shlex.split(info.get('description', "")):
                    # Require the '=' so a bare "collectionId" token cannot
                    # raise IndexError and abort handling of this record.
                    if entry.startswith('collectionId') and '=' in entry:
                        collectionId = entry.split('=', 1)[1].split(',')[0]
                if not collectionId:
                    warc_filename = record.get_header("WARC-Filename")
                    # The header, when present, takes precedence over *name*.
                    # Either source is only used if it really contains a
                    # dash; otherwise the id stays unknown.
                    source = warc_filename if warc_filename else name
                    if '-' in source:
                        collectionId = source.split('-')[1]

            if record.type == WarcRecord.RESPONSE and content_type.startswith('application/http'):
                code, mime_type, message = parse_http_response(record)

                if 200 <= code < 300:
                    filename, collision = output_file(output_dir, record.url, mime_type, default_name)
                    if collision:
                        collisions += 1

                    wayback_uri = ''
                    if collectionId:
                        # Drop the 'T', 'Z', ':' and '-' characters from the
                        # record date.  Filtering works for both byte and
                        # unicode strings, unlike str.translate(None, ...).
                        wayback_date = "".join(ch for ch in record.date if ch not in 'TZ:-')
                        wayback_uri = wayback_prefix + collectionId + '/' + wayback_date + '/' + record.url

                    with open(filename, 'wb') as out:
                        out.write(message.get_body())
                    log_entry(output_log, name, record, mime_type, filename, wayback_uri)

        except StandardError as e:
            import traceback
            traceback.print_exc()
            sys.stderr.write("exception in handling record %s\n" % (e,))

    return collisions
def parse_warcinfo(record):
    """Return the ``key: value`` lines of a warcinfo record body as a dict.

    The body is ``record.content[1]``.  Each non-blank line is split at its
    first colon; the value is stored as-is (text after the colon, with only
    the line's outer whitespace removed).  Lines without a colon are reported
    on stderr and skipped.  Any other failure while reading the record is
    reported on stderr and whatever was collected so far is returned.
    """
    fields = {}
    try:
        body = record.content[1]
        for raw_line in body.split('\n'):
            entry = raw_line.strip()
            if not entry:
                continue
            key, colon, value = entry.partition(':')
            if not colon:
                sys.stderr.write(" ".join(['malformed warcinfo line', entry]) + "\n")
                continue
            fields[key] = value
    except StandardError as e:
        sys.stderr.write(" ".join(['exception reading warcinfo record', str(e)]) + "\n")
    return fields
def parse_http_response(record):
    """Parse the HTTP response stored in a WARC record.

    The record body (``record.content[1]``) is fed to a ResponseMessage.  A
    warning is written to stderr when feed() returns leftover data or when
    the message reports itself incomplete; parsing results are returned
    either way.

    Returns a tuple ``(code, mime_type, message)`` where *code* is
    ``message.header.code``, *mime_type* is the first Content-Type header
    value with any parameters (text after ';') and surrounding whitespace
    removed, or None when there is no Content-Type header, and *message* is
    the ResponseMessage itself.
    """
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()

    if remainder:
        sys.stderr.write('warning: trailing data in http response for %s\n' % (record.url,))
    if not message.complete():
        sys.stderr.write('warning: truncated http response for %s\n' % (record.url,))

    header = message.header

    mime_type = None
    for key, value in header.headers:
        if key.lower() == 'content-type':
            # "text/html ; charset=x" must yield "text/html": a trailing
            # space would defeat later MIME-type lookups.
            mime_type = value.split(';')[0].strip()
            break

    return header.code, mime_type, message
def output_file(output_dir, url, mime_type, default_name):
    """Choose (and prepare) the file path under *output_dir* for *url*.

    The URL is turned into a relative path: '://' becomes '/', and every
    character other than letters, digits and ``_-/.`` becomes '_'.  All path
    components but the last become directories; the last becomes the file
    name, or *default_name* when the URL ends in '/'.

    Extension rules:
    * with a *mime_type*: the URL's own extension is kept only when it
      already maps to that MIME type; otherwise the default extension for
      the MIME type is used, if one is known;
    * without a *mime_type*: an existing extension is kept, else '.html'.

    The directory path is truncated to 200 characters and created if
    missing; the file name is limited to 45 characters including the
    extension.  If the resulting path already exists, '_R' plus 8 random
    hex characters is inserted before the extension until it is unique.

    Returns ``(path, collision)``: the real absolute path to write to and
    whether the first-choice name was already taken.
    """
    clean_url = "".join(
        (c if c.isalpha() or c.isdigit() or c in '_-/.' else '_')
        for c in url.replace('://', '/', 1))

    parts = clean_url.split('/')
    directories, filename = parts[:-1], parts[-1]

    # Resolve '.' and '..' here rather than leaving them to normpath(), so a
    # URL containing more '..' components than real ones can never climb
    # above output_dir.
    path = []
    for d in directories:
        if not d or d == '.':
            continue
        if d == '..':
            if path:
                path.pop()
            continue
        path.append(d)

    if filename:
        name, ext = os.path.splitext(filename)
    else:
        name, ext = default_name, ''

    if mime_type:
        normalized_type = mime_type.strip().lower()
        # guess_type() returns a (type, encoding) tuple; only the type is
        # comparable with the MIME type string.
        guessed_type = mimetypes.guess_type(url)[0]
        # preserve variant file extensions, rather than clobber with default for mime type
        if not ext or guessed_type != normalized_type:
            mime_ext = mimetypes.guess_extension(normalized_type)
            if mime_ext:
                ext = mime_ext
    elif not ext:
        ext = '.html'  # no mime type, no extension

    directory = os.path.normpath(os.path.join(output_dir, *path))
    directory = directory[:200]

    if not os.path.exists(directory):
        os.makedirs(directory)

    stem = name[:45 - len(ext)]
    fullname = os.path.join(directory, stem + ext)

    collision = False
    while os.path.exists(fullname):
        collision = True
        u = str(uuid.uuid4())[:8]
        fullname = os.path.join(directory, stem + '_R' + u + ext)

    return os.path.realpath(os.path.normpath(fullname)), collision
# Script entry point: run main() on the process arguments and use its return
# value as the exit status.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv))