-
Notifications
You must be signed in to change notification settings - Fork 3
/
deltajson_customexample.py
executable file
·126 lines (103 loc) · 4.75 KB
/
deltajson_customexample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/python
import json
import sys
import time
import hashlib
# ijson with native yajl backend (needs native yajl lib installation, see README)
import ijson.backends.yajl2_cffi as ijson
# pure python variant (ca. 66% slower overall):
# import ijson
# This is an example showing that it is fairly easy to build custom JSON stream parsers.
# It takes a feed whose "messages" entries each contain an array of JSON objects named
# "markets"; the fingerprinting is done for the "products" entries contained in an array
# per market.
# Since it is custom parsing, it does not use JSONPath as an abstraction.
#
# Call me as: deltajson_customexample.py {yourCustomFeedFile}.json [{oldFingerprintsFile}.json]
# Script body: computes a delta between the current feed file and the
# fingerprints recorded by a previous run.
#
# Inputs (command line):
#   sys.argv[1]  feed file to process (mandatory)
#   sys.argv[2]  fingerprints file written by a previous run (optional)
#
# Outputs (all named after the feed file):
#   <feed>.changes.json        new or changed products, grouped per market
#   <feed>.fingerprints.json   id -> MD5 digest of every unique product seen
#   <feed>.duplicateIds.json   ids seen more than once (only if any)
#   <feed>.removedIds.json     old fingerprints not found in the feed (only if any)
#
# Error conditions print a message and terminate with exit status 1.
startTime = time.time()

if len(sys.argv) <= 1:
    print('new json file name param is mandatory')
    sys.exit(1)

fullfile_name = sys.argv[1]
deltafile_name = fullfile_name + '.changes.json'
fingerprintsfile_new_name = fullfile_name + '.fingerprints.json'
fingerprintsfile_old_name = ""
if len(sys.argv) > 2:
    fingerprintsfile_old_name = sys.argv[2]
    # The new fingerprints file is truncated on open, so it must never be the
    # same file as the one we are about to read the old fingerprints from.
    if fingerprintsfile_new_name == fingerprintsfile_old_name:
        print(
            'ERROR: last fingerprints file name must differ from new name ' + fingerprintsfile_new_name)
        sys.exit(1)

# Load the old fingerprints BEFORE any output file is opened for writing, so
# that a missing or broken old file does not leave truncated outputs behind.
if fingerprintsfile_old_name:
    try:
        with open(fingerprintsfile_old_name, 'r') as fingerprintsfile_old:
            fingerprints_old = json.load(fingerprintsfile_old)
    except IOError:
        print('ERROR: could not open file ' + fingerprintsfile_old_name)
        sys.exit(1)
    except ValueError:
        # json.load signals malformed content with ValueError (or a subclass).
        print('ERROR: could not parse JSON in file ' + fingerprintsfile_old_name)
        sys.exit(1)
else:
    print('INFO: no old fingerprints file name passed, starting from scratch')
    fingerprints_old = dict()

with open(fullfile_name, 'rb') as fullfile_new, \
        open(deltafile_name, 'wb') as deltafile, \
        open(fingerprintsfile_new_name, 'w') as fingerprintsfile_new:
    fingerprints_new = dict()
    idSet = set()  # ids already seen in this feed, used to detect duplicates
    duplicateIds = list()

    # CUSTOM IMPLEMENTATION FROM HERE
    jsonObjects = ijson.items(fullfile_new, 'messages.item.markets.item')
    # The delta file is opened in binary mode, so everything written to it is
    # bytes (works on Python 2 and 3 alike, with no newline translation).
    deltafile.write(b'{"markets":[\n')
    objCount = 0
    deltacount = 0
    marketcount = 0
    # Half-streaming approach: each market is materialised completely by ijson
    # and its products are then iterated in memory. Only one market is held
    # at a time; streaming the individual products as well would be
    # considerably more complex.
    for market in jsonObjects:
        prodcount = 0
        if marketcount > 0:
            deltafile.write(b'\n,')
        marketcount += 1
        marketId = str(market['wwIdent'])
        print("found market " + str(marketcount) + " : " + marketId)
        # json.dumps quotes and escapes the id, so the output stays valid JSON
        # even if the id contains quotes or backslashes.
        # NOTE(review): the output key is "wwwIdent" while the input key read
        # above is "wwIdent" - possibly a typo; left unchanged because
        # consumers of the delta file may rely on it.
        deltafile.write(
            ('{"wwwIdent": ' + json.dumps(marketId) + ', "products": [\n').encode('utf-8'))
        for obj in market['products']:
            objCount += 1
            objNr = str(obj['nan'])
            objId = marketId + '-' + objNr
            if objId in idSet:
                # Ignore and remember duplicate ids; only the first occurrence counts.
                duplicateIds.append(objId)
                continue
            idSet.add(objId)
            # NOTE(review): the digest is taken over json.dumps output, which
            # follows the dict's key order; a feed that merely reorders keys
            # may therefore be reported as changed.
            objJsonString = json.dumps(obj)
            # hashlib needs bytes; encode once and reuse the bytes for the write.
            objJsonBytes = objJsonString.encode('utf-8')
            objDigest = hashlib.md5(objJsonBytes).hexdigest()
            fingerprints_new[objId] = objDigest
            # If the obj is new or has changed, write it to the delta.
            # pop() removes matched ids from the old fingerprints, so whatever
            # remains afterwards was implicitly deleted from the feed.
            if (objId not in fingerprints_old) or (fingerprints_old.pop(objId) != objDigest):
                if prodcount > 0:
                    deltafile.write(b'\n,')
                deltacount += 1
                prodcount += 1
                deltafile.write(objJsonBytes)
        # Close this market's "products" array and the market object.
        deltafile.write(b'\n]}')
    # Close the "markets" array and the top-level object.
    deltafile.write(b'\n]}')
    # END OF CUSTOMIZED PART

    print('DONE: processed ' + '{:,}'.format(objCount) + ' JSON objects, ' + '{:,}'.format(
        len(idSet)) + ' unique IDs, found ' + '{:,}'.format(deltacount) + ' changed and ' + '{:,}'.format(
        len(fingerprints_old)) + ' removed entries.')

    # log duplicate ids
    if duplicateIds:
        print('WARN: ' + '{:,}'.format(
            len(duplicateIds)) + ' duplicate IDs found. Used only first occurrences, writing to file')
        with open(fullfile_name + '.duplicateIds.json', 'w') as duplicateIds_file:
            json.dump(duplicateIds, duplicateIds_file, indent=2)

    # write deleted fingerprints if some remained:
    if fingerprints_old:
        print('INFO: some entries have disappeared since the last file. Writing IDs to file')
        with open(fullfile_name + '.removedIds.json', 'w') as removedIds_file:
            json.dump(fingerprints_old, removedIds_file, indent=2)

    # persist new fingerprints and deltafile:
    deltafile.flush()
    print('wrote delta file')
    json.dump(fingerprints_new, fingerprintsfile_new)
    print('wrote new fingerprints file')

print('duration: ' + str(time.time() - startTime) + ' seconds')