# FileCollection.py
"""
The super class FileCollection is used to handle json file loading/saving and database ingestion through Change Management System
"""
import json
import os
import ssl
import urllib.error
import urllib.request
from os import path
from datetime import datetime
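
# Keys expected in the `props` dict, inferred from the attribute access below
# (not an authoritative list): end_point_url, path, org_name, resource_ids,
# overwrite, lat, lng, retain_keys, and harvester (an object that, judging from
# the calls in this file, exposes create_feature and warp_it_up).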
class FileCollection:
    '''
    Controls the REST requests and the parameters passed with them.
    '''
    def __init__(self, props):
        # copy every passed property (end point, paths, field names, etc.) onto the instance
        for p in props:
            setattr(self, p, props[p])
        # pagination state for the REST requests
        self.start = 1
        self.page = 1
        self.total = None
        self.folder = self.org_name
        self.loaded_resource_ids = []
        if not path.exists(self.path + self.folder):
            os.mkdir(self.path + self.folder)
        # everything before "/api" is the site root, used to build absolute URLs later
        self.open_prefix = self.end_point_url[:self.end_point_url.index("/api")]
        self.load_results()
    def load_results(self):
        # declare the folder and file names
        folder_path = self.path + self.folder + "/"
        file = self.org_name + ".json"
        _file = folder_path + file
        # check if the data exists
        url = self.end_point_url
        self.load_file_call_func(_file, url, 'check_loaded')
    def check_loaded(self, data, parent_obj=False):
        """
        :param data: the JSON payload just loaded from disk or the endpoint
        :param parent_obj: unused here; kept for the shared load_file callback signature
        :return: None
        """
        # scan the json looking for how many records have been downloaded
        # and set up the next request if there are more pages to be downloaded
        self.drill_loaded_data(data)
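
    # NOTE: __init__ seeds pagination state (self.start, self.page, self.total),
    # but check_loaded never issues a follow-up request, so fetching additional
    # pages is presumably handled elsewhere or still to be implemented.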
    def drill_loaded_data(self, data):
        """
        :param data: the collection-level JSON; records are expected under data['items']
        :return: None
        """
        # start by making sure a 'layers' folder exists
        layers_path = self.path + self.folder + "/layers"
        if not path.exists(layers_path):
            os.mkdir(layers_path)
        for r in data['items']:
            item_id = r['itemId']
            if self.resource_ids:
                # only load the resource ids that were explicitly requested
                if item_id in self.resource_ids:
                    self.load_data(item_id, r, layers_path)
            else:
                self.load_data(item_id, r, layers_path)
    def load_file_call_func(self, _file, _url, _func, parent_obj=False):
        """
        :param _file: the name (w/ path) of the file to save
        :param _url: the absolute URL to the json
        :param _func: the name of the method to call upon completion
        :param parent_obj: extra info to retain when loading
        :return: None
        """
        if not path.exists(_file) or self.overwrite:
            # download the resource, cache it locally, then dispatch to _func
            print("loading file", _url)
            if _url.startswith("//"):
                _url = "https:" + _url
            try:
                # NOTE: this deliberately skips certificate verification; some data
                # portals serve JSON over hosts with invalid certificates
                context = ssl._create_unverified_context()
                response = urllib.request.urlopen(_url, context=context)
                # read the body once; a second read() would return empty bytes
                body = response.read().decode('utf-8')
                with open(_file, 'w', encoding='utf-8') as outfile:
                    try:
                        # pretty-print when the payload is valid JSON
                        outfile.write(json.dumps(json.loads(body), indent=4, sort_keys=True))
                    except ValueError:
                        # otherwise (e.g. JSONP) save the raw text; parse_json recovers it later
                        outfile.write(body)
                self.load_file(_file, _func, parent_obj)
            except (ssl.CertificateError, urllib.error.URLError):
                # unreachable hosts raise URLError; bad certificates raise CertificateError
                print("Data portal URL does not exist: " + _url)
        else:
            # the file is already cached locally; just load it
            self.load_file(_file, _func, parent_obj)
    def load_data(self, item_id, r, layers_path):
        """
        :param item_id: the record's itemId
        :param r: the record's JSON object from the collection listing
        :param layers_path: the folder where per-record metadata is cached
        :return: None
        """
        _file = layers_path + "/" + item_id + ".json"
        # use the 'thumbnailUri' excluding the end to consistently load metadata for both 'compoundobject' and 'singleitem'
        _url = self.open_prefix + r['thumbnailUri'][:r['thumbnailUri'].index("/thumbnail")]
        print(_url)
        self.load_file_call_func(_file, _url, 'check_sub_loaded', r)
    def check_sub_loaded(self, data, parent_obj):
        """
        We're going a level deeper here and looking at the layers associated with a record.
        We'll create only parent records and associate the children (if they exist beneath).
        :param data: the sub information to be used in creating more informative compound records
        :param parent_obj: the parent record's JSON from the collection listing
        :return: None
        """
        layers_path = self.path + self.folder + "/layers"
        if "page" in data['objectInfo'] or "node" in data['objectInfo']:
            print("There are children here ---------------")
            # todo - associate the children - all details exist in the 'data'
            # generate urls for all children
            root_path = self.open_prefix + data['thumbnailUri'][:data['thumbnailUri'].index("/id/") + 4]
            if "node" in data['objectInfo']:
                child_list = data['objectInfo']["node"]["page"]
            else:
                # 'page' is a list when there is more than one page, a dict when there is only one
                if isinstance(data['objectInfo']["page"], list):
                    child_list = data['objectInfo']["page"]
                else:
                    child_list = [data['objectInfo']["page"]]
            for p in child_list:
                # create a child resource with only new information - the ingest should take the parent info and combine it with the child
                parent_id = parent_obj["itemId"]
                item_id = p["pageptr"]
                _file = layers_path + "/" + parent_id + "_sub_" + item_id + ".json"
                _url = root_path + item_id
                # todo - decide if we want to save the 3 metadata files
                print("About to load child url", _url)
                self.load_file_call_func(_file, _url, 'check_sub_sub_loaded', parent_obj)
        # signal the harvester that this record has been processed
        self.harvester.warp_it_up()
    def check_sub_sub_loaded(self, data, parent_obj):
        """
        :param data: the child page's metadata JSON
        :param parent_obj: the parent record's JSON from the collection listing
        :return: None
        """
        # now that we've loaded the child information, look for the lat and lng
        # start by creating a temp object to store the data
        obj = {
            "thumb_url": self.open_prefix + data["thumbnailUri"],
            "info_page": self.open_prefix + data["thumbnailUri"][data["thumbnailUri"].index("/collection"):data["thumbnailUri"].index("/thumbnail")],
            "lat": None,
            "lng": None
        }
        # start by looking for the keys in the parent to map onto the child
        for f in parent_obj['metadataFields']:
            for r in self.retain_keys:
                if f['field'] == r:
                    obj[r] = f["value"]
        # now pull from the child, letting child values overwrite the parent's
        for f in data["fields"]:
            if f["key"] == self.lat:
                obj["lat"] = f["value"]
            if f["key"] == self.lng:
                obj["lng"] = f["value"]
            for r in self.retain_keys:
                if f["key"] == r:
                    obj[r] = f["value"]
        # only records with both coordinates become map features
        if obj["lat"] and obj["lng"]:
            self.harvester.create_feature(obj)
    def load_file(self, _file, _func, parent_obj=False):
        """
        :param _file: the name (w/ path) of the file to load - relayed from "load_file_call_func"
        :param _func: the name of the method to call upon completion - relayed from "load_file_call_func"
        :param parent_obj: extra info to retain when loading - relayed from "load_file_call_func"
        :return: None
        """
        try:
            _json = self.open_json(_file)
        except ValueError:
            # not plain JSON; fall back to extracting JSON from a JSONP-style wrapper
            _json = self.parse_json(_file)
        getattr(self, _func)(_json, parent_obj)
    def open_json(self, _file):
        """
        :param _file: the name (w/ path) of the local file to open
        :return: the file's contents parsed as JSON
        """
        with open(_file) as infile:
            return json.load(infile)
    def parse_json(self, _file):
        """
        Extracts the JSON from a malformed file, e.g. a JSONP response wrapped in callback(...)
        :param _file: the name (w/ path) of the local file to open
        :return: the extracted content parsed as JSON
        """
        with open(_file) as infile:
            s = infile.read()
        # keep everything between the first "(" and the last ")"
        _json = s[s.find("(") + 1:s.rfind(")")]
        return json.loads(_json)
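

# A minimal usage sketch. Assumptions: the props values below are hypothetical,
# and DummyHarvester stands in for whatever harvester object normally drives
# ingestion; only create_feature and warp_it_up are stubbed because those are
# the two methods this file actually calls on it.
if __name__ == "__main__":
    class DummyHarvester:
        def create_feature(self, obj):
            print("feature:", obj.get("lat"), obj.get("lng"))

        def warp_it_up(self):
            print("record processed")

    props = {
        "end_point_url": "https://example.org/api/collections",  # hypothetical endpoint
        "path": "./data/",               # must already exist; org folder is created inside it
        "org_name": "example_org",
        "resource_ids": [],              # empty list = load every record
        "overwrite": False,              # reuse cached JSON files when present
        "lat": "Latitude",               # hypothetical child-metadata field names
        "lng": "Longitude",
        "retain_keys": ["Title"],
        "harvester": DummyHarvester(),
    }
    FileCollection(props)  # __init__ kicks off load_results() immediately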