archiveIt.py
import requests, json, secrets, authenticate, runtime
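# Note: secrets, authenticate, and runtime appear to be local helper modules
# shipped alongside this script; only authenticate is called directly below.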
# provide instructions
print('This script is used to generate new digital objects within an ArchivesSpace collection for websites crawled in an Archive-It collection. Please note: This is a "proof of concept" script, NOT completed work. Do not use in production scenarios.')
input('Press Enter to continue...')
# This is where we connect to ArchivesSpace. See authenticate.py
baseURL, headers = authenticate.login()
# archiveit_coll = input('Enter the Archive-It collection number: ')
archiveit_coll = '3181'
# search AS for archival_object's with level "Web archive"
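# (filters on primary_type = archival_object, level = "Web archive", and types = "pui")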
query = '/search?page=1&filter={"query":{"jsonmodel_type":"boolean_query","op":"AND","subqueries":[{"jsonmodel_type":"field_query","field":"primary_type","value":"archival_object","literal":true},{"jsonmodel_type":"field_query","field":"level","value":"Web archive","literal":true},{"jsonmodel_type":"field_query","field":"types","value":"pui","literal":true}]}}'
ASoutput = requests.get(baseURL + query, headers=headers).json()
print('Found ' + str(len(ASoutput['results'])) + ' archival objects with the instance type "Web archive."')
# grab needed fields out of ao
for ao in ASoutput['results']:
    url = ao['title']
    uri = ao['uri']
    # search AI and get json of crawls for url listed in AS ao's title field
    request = 'http://wayback.archive-it.org/' + archiveit_coll + '/timemap/json/' + url
    AIoutput = requests.get(request).json()
    print('Found ' + str(len(AIoutput)-1) + ' Archive-It crawls of ' + url + '.')
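    # The timemap JSON is a list of lists: the first row holds the field names
    # (e.g. 'original', 'timestamp', 'filename', 'digest'), and each remaining
    # row describes one crawl of the seed URL.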
    # take AI json lists and convert to python dicts
    keys = AIoutput[0]
    crawlList = []
    for i in range(1, len(AIoutput)):
        AIlist = AIoutput[i]
        crawl = {}
        for j in range(0, len(AIlist)):
            crawl[keys[j]] = AIlist[j]
        crawlList.append(crawl)
    # construct digital object json from Archive-It output and post to AS
    print('The following digital objects have been created in ArchivesSpace:')
    newInstances = []
    for crawl in crawlList:
        doid = 'https://wayback.archive-it.org/' + archiveit_coll + '/' + crawl['timestamp'] + '/' + crawl['original']
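        # doid is the Wayback replay URL (collection/timestamp/original URL); it
        # doubles as the ArchivesSpace digital_object_id and as the key used to
        # check for an existing digital object below.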
        query = '/search?page=1&filter={"query":{"jsonmodel_type":"boolean_query","op":"AND","subqueries":[{"jsonmodel_type":"field_query","field":"primary_type","value":"digital_object","literal":true},{"jsonmodel_type":"field_query","field":"digital_object_id","value":"' + doid + '","literal":true}]}}'
        existingdoID = requests.get(baseURL + query, headers=headers).json()
        doPost = {}
        if len(existingdoID['results']) != 0:
            print('Digital object already exists.')
        else:
            doPost['digital_object_id'] = doid
            doPost['title'] = 'Web crawl of ' + crawl['original']
            doPost['dates'] = [{'expression': crawl['timestamp'], 'date_type': 'single', 'label': 'creation'}]
            doPost['file_versions'] = [{'file_uri': crawl['filename'], 'checksum': crawl['digest'], 'checksum_method': 'sha-1'}]
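            # file_uri holds the WARC filename reported by the timemap rather than
            # a resolvable URI, and the digest is assumed to be a SHA-1 checksum to
            # match the checksum_method recorded above.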
        doJson = json.dumps(doPost)
        if doPost != {}:
            post = requests.post(baseURL + '/repositories/2/digital_objects', headers=headers, data=doJson).json()
            print(post)
            doItem = {}
            doItem['digital_object'] = {'ref': post['uri']}
            doItem['instance_type'] = 'digital_object'
            newInstances.append(doItem)
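    # Each new digital object above is posted to the hardcoded repository 2; the
    # collected newInstances stubs are now attached to the parent archival object
    # so the crawls appear as digital object instances on that record.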
    aoGet = requests.get(baseURL + uri, headers=headers).json()
    existingInstances = aoGet['instances']
    existingInstances = existingInstances + newInstances
    aoGet['instances'] = existingInstances
    aoUpdate = requests.post(baseURL + uri, headers=headers, data=json.dumps(aoGet)).json()
    print('The following archival objects have been updated in ArchivesSpace:')
    print(aoUpdate)
# TO DO LATER
# Parse dates for ArchivesSpace record, push to AOs above
# Add phystech stating "Archived website" to ASpace resource record
# Add "Web sites" subject tracing to ASpace resource record
# Support running against multiple Archive-It collections (currently limited to the single collection hardcoded in this script)
# Improve logic for determining whether something is a duplicate