-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdlg_json2csv_gui.py
293 lines (243 loc) · 13.9 KB
/
dlg_json2csv_gui.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
"""
Parses JSON data from the DLG API into a CSV.
A GUI is used to run the script so users don't need to interact with the command line.
Threading is used so the GUI does not show an 'unresponsive' error because the script is running.
"""
# Future development: should the user be notified of errors that don't quit the script, besides having the log made?
import csv
import os
import pandas as pd
import PySimpleGUI as sg
import re
import requests
import sys
# For threading.
import threading
import gc
SCRIPT_THREAD = '-SCRIPT_THREAD-'
def _log_error(output_location, message):
    """Append message to error_log.txt in the output folder."""
    with open(f'{output_location}/error_log.txt', 'a') as log:
        log.write(message)


def _build_api_url(url):
    """Return the API form of a DLG URL, adding '.json' if it is not already present."""
    # Literal substring test. A regex of '.json' would treat the dot as "any character"
    # and wrongly accept URLs that merely contain something like '_json'.
    if '.json' in url:
        return url
    if '?' in url:
        # Search URL: '.json' goes immediately before the query string (first '?' only).
        return url.replace('?', '.json?', 1)
    # Single item URL: '.json' goes at the end.
    return url + '.json'


def _page_url(api_url, page):
    """Return api_url adjusted to request the given page number of a search."""
    page_param = f'page={page}'
    # \b stops the pattern from matching inside a longer parameter name (e.g. per_page=20).
    if re.search(r'\bpage=\d+', api_url):
        return re.sub(r'\bpage=\d+', page_param, api_url)
    # No page parameter yet: insert it as the first parameter of the query string.
    return api_url.replace('?', f'?{page_param}&', 1)


def _fetch_json(api_url):
    """Request api_url and return the decoded JSON body."""
    response = requests.get(api_url)
    return response.json()


def _resolve_item_url(item, output_location):
    """Return the final (redirected) URL to store in item['edm_is_shown_by'].

    When the current value is None, a thumbnail URL is built from the three
    underscore-separated parts of item['id'] and used instead. Returns None if
    that id cannot be parsed, and the un-redirected URL if the request fails.
    """
    shown_by = item['edm_is_shown_by']
    if shown_by is None:
        try:
            repo_id, collection_id, item_id = item['id'].split('_', 2)
        except (KeyError, AttributeError, ValueError):
            # item.get() so that a missing id cannot raise again while logging.
            _log_error(output_location,
                       f'\n\nCould not parse the item id for the thumbnail url: {item.get("id")}')
            return None
        shown_by = f'https://dlg.galileo.usg.edu/{repo_id}/{collection_id}/do-th:{item_id}'
    try:
        return requests.get(shown_by).url
    except requests.RequestException:
        _log_error(output_location, f'\n\nCould not get redirected item: {shown_by}')
        return shown_by


def dlg_json2list(url_list, output_location):
    """Get the JSON from the DLG API for every URL in url_list and return it as a list.

    Problems are appended to error_log.txt in output_location and the affected
    URL or page is skipped, so one bad URL does not stop the rest.

    Args:
        url_list: iterable of DLG URLs (single items or searches, with or without '.json').
        output_location: path to the folder where error_log.txt is written.

    Returns:
        A list with one dictionary per item. List values are joined into a single
        comma-separated string and 'edm_is_shown_by' is replaced by its redirected URL.

    Raises:
        SystemExit: if no data could be retrieved for any URL (after logging and a pop up).
    """
    json_list = []
    for url in url_list:
        # A '?' in the provided URL means it is a search query rather than a single item.
        is_search_result = '?' in url
        api_url = _build_api_url(url)

        # Grabbing the response JSON.
        try:
            json_dict = _fetch_json(api_url)
        except (requests.RequestException, ValueError):
            # ValueError covers a response body that is not valid JSON.
            _log_error(output_location,
                       f'\n\nCould not get data from the DLG API for the following URL:{url}')
            continue

        # Saving the response JSON to json_list.
        try:
            if not is_search_result:
                json_list.append(json_dict['response']['document'])
                continue
            # Search query: save the first page here, the remaining pages are fetched below.
            total_pages = json_dict['response']['pages']['total_pages']
            json_list.extend(json_dict['response']['docs'])
        except (KeyError, TypeError):
            _log_error(output_location,
                       '\n\nThe DLG API response did not have the expected structure '
                       f'for the following URL: {url}')
            continue

        # Range produces the page numbers 2 through the last page (empty if there is one page).
        for page in range(2, total_pages + 1):
            page_url = _page_url(api_url, page)
            try:
                page_dict = _fetch_json(page_url)
                json_list.extend(page_dict['response']['docs'])
            except (requests.RequestException, ValueError, KeyError, TypeError):
                _log_error(output_location,
                           '\n\nCould not get data from the DLG API for the following page:'
                           f'Page: {page}, API URL: {page_url}')
                continue

    # Error check. json_list should have 1 or more items inside. Otherwise exit.
    # NOTE(review): when this function runs in a worker thread, sys.exit() only ends that thread.
    if len(json_list) < 1:
        _log_error(output_location, '\n\nCould not get any data from the DLG API for this request')
        sg.Popup("Unable to get any data for the provided input. See error_log.txt in the output folder for more "
                 "information.")
        sys.exit()

    # Converts list values into a single string so the CSV does not contain the extra
    # quotation marks and brackets, and replaces the item URL with its redirected URL
    # (or with a thumbnail URL when the item URL is missing).
    for item in json_list:
        # Only existing keys are reassigned, so iterating the dictionary here is safe.
        for key in item:
            if isinstance(item[key], list):
                # join() copes with empty lists, and str() with members that are not strings.
                item[key] = ', '.join(str(part) for part in item[key])
            if key == 'edm_is_shown_by':
                item[key] = _resolve_item_url(item, output_location)
    return json_list
def make_csv(url_file, csv_name, dlg_mapping, output_location, gui_window):
    """Create a CSV of data from the DLG API for all specified items.

    Intended to be run in its own thread by the GUI. The GUI is always told when
    the thread is finished, even if the CSV could not be made, so that the
    submit button can be enabled again.

    Args:
        url_file: path to a text file with one DLG URL per line.
        csv_name: full path of the CSV to create.
        dlg_mapping: path to a CSV whose first column is a DLG field name and
            whose second column is the name to use for that column in the output.
        output_location: folder for error_log.txt; also named in the completion message.
        gui_window: the PySimpleGUI window to notify when the work is done.
    """
    try:
        # Grabbing all of the URLs in the file to then be parsed. Blank lines are skipped
        # so they are not sent to the API as empty requests.
        with open(url_file, 'r') as dlg_urls:
            urls = [line.strip() for line in dlg_urls if line.strip()]

        # Grabbing the complete list of JSONs from the provided URLs and making a dataframe.
        jsons = dlg_json2list(urls, output_location)
        df = pd.DataFrame(jsons)

        # Grabbing the DLG Dublin Core Mapping: {DLG field name: new column name}.
        new_column_name = {}
        with open(dlg_mapping, 'r', newline='') as map_csv:
            for row in csv.reader(map_csv):
                # Rows without both columns (such as blank lines) cannot be used as a mapping.
                if len(row) >= 2:
                    new_column_name[row[0]] = row[1]

        # Dropping columns from the dataframe if they are not in the DLG Mapping.
        drop_columns = [col for col in df.columns if col not in new_column_name]
        df.drop(drop_columns, axis=1, inplace=True)

        # Renaming the columns to map to Dublin Core and writing to CSV.
        df.rename(columns=new_column_name, inplace=True)
        df = df.sort_index(axis=1)
        df.to_csv(csv_name, index=False)

        # Communicate that the script has completed to user in the GUI dialogue box.
        print(f"\nThe requested CSV has been made and is in the {output_location} folder. "
              f"You may submit information to create another CSV or close this program.")
        # NOTE(review): this refresh is issued from the worker thread - confirm that is acceptable
        # for PySimpleGUI, which documents write_event_value as the thread-safe call.
        gui_window.Refresh()
    finally:
        # For threading: indicates the thread for running the script is done. In a finally block
        # so the event is also sent when an error (or sys.exit() in dlg_json2list) ends the thread.
        gui_window.write_event_value(SCRIPT_THREAD, (threading.current_thread().name,))
# For threading: turns off automatic garbage collection. The event loop below calls
# gc.collect() itself on every pass.
gc.disable()

# Builds the GUI used to collect the script input and to display messages about
# input errors and script progress.
sg.theme("DarkTeal6")

# Left column: a label for each required input, a thin spacer row, then the buttons.
layout_one = [
    [sg.Text(label, font=("roboto", 13))]
    for label in ('Path to file with DLG URLs', 'Folder to save output', 'Name for the output CSV')
]
layout_one.append([sg.Text(font=("roboto", 1))])
layout_one.append([sg.Submit(key="submit", disabled=False), sg.Cancel()])

# Right column: the input box (and browse button, where there is one) matching each label,
# followed by two empty text rows to line up with the left column.
layout_two = [
    [sg.Input(key="input_file"), sg.FileBrowse()],
    [sg.Input(key="output_folder"), sg.FolderBrowse()],
    [sg.Input(key="output_name")],
    [sg.Text(font=("roboto", 1))],
    [sg.Text(font=("roboto", 13))],
]

# Optional frame: the mapping CSV, pre-filled with the default file name.
layout_three = [
    [
        sg.Text("Mapping", font=("roboto", 13)),
        sg.Input(default_text="DLG_Mapping.csv", key="mapping_csv"),
        sg.FileBrowse(),
    ],
]

# Full window: the two columns side by side, the optional frame, and the output box for messages.
layout = [
    [sg.Column(layout_one), sg.Column(layout_two)],
    [sg.Frame("Optional", layout_three, font=("roboto", 15))],
    [sg.Output(size=(90, 10))],
]
window = sg.Window("DLG API Parser: Make a CSV from DLG Metadata", layout)
# Main GUI loop: runs until the user quits. Each pass waits for one event, and for a
# submission validates the input and, if it is all correct, starts the script in a thread.
# Future development: add a "reset" button to get the GUI back to original values?
while True:
    # For threading: runs a garbage collection pass at the start of every loop iteration.
    gc.collect()

    # Waits for the next event and the current input values.
    event, values = window.read()

    # If the user clicked cancel or the X on the GUI, quits the script.
    if event in ("Cancel", None):
        sys.exit()

    # For threading: let the user submit new information now that the script thread is over.
    if event == SCRIPT_THREAD:
        window["submit"].update(disabled=False)
        continue

    # Any other event that is not a submission needs no action.
    if event != "submit":
        continue

    # Communicate that the script is starting to the user in the GUI dialogue box.
    print("\nPlease wait while the CSV you requested is generated...")
    window.Refresh()

    # Error testing on all of the user inputs. Required fields cannot be empty and paths must be valid.
    # Every test is evaluated so the user is told about all of the problems at once.
    # Future development: change formatting on boxes with errors to highlight them?
    checks = (
        (values["input_file"] == "", "Input CSV can't be blank."),
        (not os.path.exists(values["input_file"]), "Input CSV path is not correct."),
        (values["output_folder"] == "", "Output folder cannot be blank."),
        (not os.path.exists(values["output_folder"]), "Output folder path is not correct."),
        (values["output_name"] == "", "Output name can't be blank."),
        (values["mapping_csv"] == "", "Mapping CSV can't be blank. Use DLG_Mapping.csv for the default."),
        (not os.path.exists(values["mapping_csv"]), "Mapping CSV path is not correct."),
    )
    errors = [message for failed, message in checks if failed]

    # If some of the user inputs were not correct, creates a pop up box alerting the user to the problem
    # and prints the errors in the GUI dialogue box. The user may then edit the input and resubmit.
    if errors:
        sg.Popup("CSV could not be created with the provided information. See the program window for details.")
        print("\nThe CSV could not be created. Please correct the following information and submit again.")
        print("\n".join(errors))
        window.Refresh()
        continue

    # Full path to the output CSV, adding a ".csv" file extension if the name does not have one.
    output_name = values["output_name"]
    if not output_name.endswith(".csv"):
        output_name += ".csv"
    output_csv = os.path.join(values["output_folder"], output_name)

    # If the CSV already exists, the user decides if it should be overwritten. If not,
    # no CSV is made and the user can correct the input and resubmit.
    if os.path.exists(output_csv) and sg.PopupYesNo("Do you want to replace the existing CSV?") != "Yes":
        continue

    # For threading: run make_csv() in a thread.
    processing_thread = threading.Thread(
        target=make_csv,
        args=(values["input_file"], output_csv, values["mapping_csv"], values["output_folder"], window),
    )
    processing_thread.start()

    # Disable the submit button while make_csv() is running so users can't overwhelm computing resources
    # by requesting new CSVs before the first is done being created.
    window["submit"].update(disabled=True)