Merge pull request #385 from ror-community/dev
Merge dev to staging: CSV bulk update
Showing 28 changed files with 1,676 additions and 98 deletions.
@@ -0,0 +1,96 @@
import copy
from datetime import datetime
from rorapi.common.record_utils import *
import update_address as ua
from rorapi.v2.record_constants import *
from rorapi.v2.serializers import (
    OrganizationSerializer as OrganizationSerializerV2
)
from rorapi.management.commands.generaterorid import check_ror_id

V2_SCHEMA = get_file_from_url("https://raw.githubusercontent.com/ror-community/ror-schema/schema-v2/ror_schema_v2_0.json")


def update_record(json_input, existing_record):
    # Overwrite only the fields present in the incoming JSON
    record = copy.deepcopy(existing_record)
    for k, v in json_input.items():
        record[k] = copy.deepcopy(v)
    return update_last_mod(record)


def update_last_mod(record):
    record['admin']['last_modified'] = copy.deepcopy(V2_LAST_MOD)
    record['admin']['last_modified']['date'] = datetime.now().strftime("%Y-%m-%d")
    return record


def check_optional_fields(record):
    # True if any schema-optional field is missing from the record
    for k in V2_OPTIONAL_FIELD_DEFAULTS:
        if k not in record:
            return True
    return False


def add_missing_optional_fields(record):
    for k, v in V2_OPTIONAL_FIELD_DEFAULTS.items():
        if k not in record:
            record[k] = v
    return record


def add_created_last_mod(record):
    today = datetime.now().strftime("%Y-%m-%d")
    record['admin'] = copy.deepcopy(V2_ADMIN)
    record['admin']['created']['date'] = today
    record['admin']['last_modified']['date'] = today
    return record


def update_locations(locations):
    # Re-fetch Geonames details for each location so cached fields stay current
    error = None
    updated_locations = []
    for location in locations:
        if 'geonames_id' in location:
            try:
                print(location['geonames_id'])
                updated_location = ua.new_geonames_v2(str(location['geonames_id']))
                updated_locations.append(updated_location['location'])
            except Exception:
                error = "Error retrieving Geonames data for ID {}. Please check that this is a valid Geonames ID".format(location['geonames_id'])
    return error, updated_locations


def sort_list_fields(v2_record):
    for field in v2_record:
        if field in V2_SORT_KEYS:
            if V2_SORT_KEYS[field] is not None:
                sort_key = V2_SORT_KEYS[field]
                sorted_vals = sorted(v2_record[field], key=lambda x: x[sort_key])
            else:
                sorted_vals = sorted(v2_record[field])
            v2_record[field] = sorted_vals
    return v2_record


def new_record_from_json(json_input, version):
    error = None
    valid_data = None
    new_record = copy.deepcopy(json_input)
    if check_optional_fields(new_record):
        new_record = add_missing_optional_fields(new_record)
    error, updated_locations = update_locations(new_record['locations'])
    if not error:
        new_record['locations'] = updated_locations
        new_record = add_created_last_mod(new_record)
        new_ror_id = check_ror_id(version)
        print("new ror id: " + new_ror_id)
        new_record['id'] = new_ror_id
        error, valid_data = validate_record(sort_list_fields(new_record), V2_SCHEMA)
    return error, valid_data


def update_record_from_json(new_json, existing_org):
    error = None
    valid_data = None
    serializer = OrganizationSerializerV2(existing_org)
    existing_record = serializer.data
    updated_record = update_record(new_json, existing_record)
    error, updated_locations = update_locations(updated_record['locations'])
    if not error:
        updated_record['locations'] = updated_locations
        error, valid_data = validate_record(sort_list_fields(updated_record), V2_SCHEMA)
    return error, valid_data
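
For context, a minimal usage sketch of the entry point above. The input values are invented, and it assumes `validate_record` and `get_file_from_url` are provided by `rorapi.common.record_utils` via the wildcard import and that `validate_record` returns the record itself on success:

# Hypothetical input; field values are illustrative only.
json_input = {
    "names": [{"types": ["ror_display"], "value": "Example University", "lang": "en"}],
    "locations": [{"geonames_id": 5128581, "geonames_details": {}}],
    "status": "active",
    "types": ["education"]
}
error, valid_data = new_record_from_json(json_input, "v2")
if error:
    print(error)  # Geonames lookup or schema validation failure
else:
    print(valid_data["id"])  # freshly minted ROR ID from check_ror_id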
@@ -0,0 +1,118 @@
import csv
import json
import io
import os
import shutil
import urllib.parse
from datetime import datetime
from rest_framework.renderers import JSONRenderer
from rorapi.settings import DATA
from rorapi.v2.serializers import (
    OrganizationSerializer as OrganizationSerializerV2
)
from rorapi.common.csv_update import update_record_from_csv
from rorapi.common.csv_create import new_record_from_csv


def save_record_file(ror_id, updated, json_obj, dir_name):
    # Write the serialized record to DATA['DIR']/<dir_name>/{updates|new}/<id>.json
    dir_path = os.path.join(DATA['DIR'], dir_name)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    subdir = 'updates' if updated else 'new'
    if not os.path.exists(os.path.join(dir_path, subdir)):
        os.mkdir(os.path.join(dir_path, subdir))
    full_path = os.path.join(dir_path, subdir, ror_id.split('https://ror.org/')[1] + '.json')
    with open(full_path, "w") as outfile:
        json.dump(json_obj, outfile, ensure_ascii=False, indent=2)


def save_report_file(report, report_fields, csv_file, dir_name, validate_only):
    dir_path = os.path.join(DATA['DIR'], dir_name)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    filepath = os.path.join(dir_path, 'report.csv')
    with open(filepath, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=report_fields)
        writer.writeheader()
        writer.writerows(report)
    if not validate_only:
        # save a copy of the input file alongside the report
        filepath = os.path.join(dir_path, 'input.csv')
        csv_file.seek(0)
        with open(filepath, 'wb+') as f:
            for chunk in csv_file.chunks():
                f.write(chunk)


def process_csv(csv_file, version, validate_only):
    print("Processing CSV")
    dir_name = datetime.now().strftime("%Y-%m-%d_%H_%M_%S") + "-ror-records"
    success_msg = None
    error = None
    report = []
    report_fields = ['row', 'ror_id', 'action', 'errors']
    skipped_count = 0
    updated_count = 0
    new_count = 0
    read_file = csv_file.read().decode('utf-8')
    print(read_file)
    reader = csv.DictReader(io.StringIO(read_file))
    row_num = 2  # row 1 is the header
    for row in reader:
        ror_id = None
        updated = False
        print("Row data")
        print(row)
        if row['id']:
            # a ROR ID in the 'id' column means this row updates an existing record
            ror_id = row['id']
            updated = True
            row_errors, v2_record = update_record_from_csv(row, version)
        else:
            row_errors, v2_record = new_record_from_csv(row, version)
        if not row_errors:
            if updated:
                action = 'updated'
                updated_count += 1
            else:
                action = 'created'
                new_count += 1
            ror_id = v2_record['id']
            serializer = OrganizationSerializerV2(v2_record)
            json_obj = json.loads(JSONRenderer().render(serializer.data))
            print(json_obj)
            if not validate_only:
                # create the record file
                save_record_file(ror_id, updated, json_obj, dir_name)
        else:
            action = 'skipped'
            skipped_count += 1
        if validate_only and action == 'created':
            ror_id = None  # don't report a provisional ID on validation-only runs
        report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": "; ".join(row_errors) if row_errors else ''})
        row_num += 1
    if new_count > 0 or updated_count > 0 or skipped_count > 0:
        try:
            if validate_only:
                try:
                    save_report_file(report, report_fields, csv_file, dir_name, validate_only)
                    success_msg = os.path.join(DATA['DIR'], dir_name, 'report.csv')
                except Exception as e:
                    error = f"Error creating validation report: {e}"
            else:
                # create the report file
                save_report_file(report, report_fields, csv_file, dir_name, validate_only)
                # zip the generated records and report
                zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name)
                # upload to S3
                try:
                    DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name + '.zip')
                    zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip"
                    success_msg = {"file": zipfile,
                                   "rows processed": new_count + updated_count + skipped_count,
                                   "created": new_count,
                                   "updated": updated_count,
                                   "skipped": skipped_count}
                except Exception as e:
                    error = f"Error uploading zipfile to S3: {e}"
        except Exception as e:
            error = f"Unexpected error generating records: {e}"

    return error, success_msg
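
A hedged sketch of how `process_csv` might be wired into a view; the `.chunks()` call in `save_report_file` implies `csv_file` is a Django `UploadedFile`. The view name and query parameter here are hypothetical, not part of this diff:

from rest_framework.decorators import api_view
from rest_framework.response import Response

@api_view(['POST'])
def bulk_update(request):  # hypothetical endpoint
    csv_file = request.FILES['file']  # Django UploadedFile: supports .read() and .chunks()
    validate_only = request.GET.get('validate') == 'true'
    error, msg = process_csv(csv_file, 'v2', validate_only)
    if error:
        return Response({"errors": [error]}, status=400)
    return Response(msg)  # S3 zip URL plus created/updated/skipped counts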
@@ -0,0 +1,110 @@
import copy
from rorapi.common.record_utils import *
from rorapi.common.csv_utils import *
from rorapi.v2.record_constants import *
from rorapi.common.serializers import ErrorsSerializer
from rorapi.common.create_update import new_record_from_json


def new_record_from_csv(csv_data, version):
    v2_data = copy.deepcopy(V2_TEMPLATE)
    errors = []
    # domains
    if csv_data['domains']:
        v2_data['domains'] = [d.strip() for d in csv_data['domains'].strip(';').split(';')]

    # established
    if csv_data['established']:
        v2_data['established'] = int(csv_data['established'].strip())

    # external ids
    for k, v in V2_EXTERNAL_ID_TYPES.items():
        if csv_data['external_ids.type.' + v + '.all']:
            all_ids = [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].strip(';').split(';')]
            ext_id_obj = {
                "type": v,
                "all": all_ids,
                # default the preferred ID to the first listed value
                "preferred": csv_data['external_ids.type.' + v + '.preferred'].strip() if csv_data['external_ids.type.' + v + '.preferred'] else all_ids[0]
            }
            v2_data['external_ids'].append(ext_id_obj)

    # links
    for k, v in V2_LINK_TYPES.items():
        if csv_data['links.type.' + v]:
            for l in csv_data['links.type.' + v].strip(';').split(';'):
                link_obj = {
                    "type": v,
                    "value": l.strip()
                }
                v2_data['links'].append(link_obj)

    # locations
    if csv_data['locations.geonames_id']:
        geonames_ids = [i.strip() for i in csv_data['locations.geonames_id'].strip(';').split(';')]
        for geonames_id in geonames_ids:
            location_obj = {
                "geonames_id": geonames_id,
                "geonames_details": {}
            }
            v2_data['locations'].append(location_obj)

    # names
    temp_names = []
    for k, v in V2_NAME_TYPES.items():
        if csv_data['names.types.' + v]:
            for n in csv_data['names.types.' + v].strip(';').split(';'):
                lang_code = None  # default when no language is supplied
                if LANG_DELIMITER in n:
                    name_val, lang = n.split(LANG_DELIMITER)
                    if lang:
                        lang_errors, lang_code = get_lang_code(lang.strip())
                        if lang_errors:
                            errors.append("Could not convert language value to ISO code: {}".format(lang))
                else:
                    name_val = n

                name_obj = {
                    "types": [v],
                    "value": name_val.strip(),
                    "lang": lang_code
                }
                temp_names.append(name_obj)
    print("temp names 1:")
    print(temp_names)
    # merge duplicate name values into a single entry carrying all of their types
    name_values = [n['value'] for n in temp_names]
    dup_names = []
    for n in name_values:
        if name_values.count(n) > 1:
            if n not in dup_names:
                dup_names.append(n)
    if dup_names:
        dup_names_objs = []
        for d in dup_names:
            types = []
            for t in temp_names:
                if t['value'] == d:
                    types.extend(t['types'])
            name_obj = {
                "types": types,
                "value": d,
                "lang": None
            }
            dup_names_objs.append(name_obj)
        temp_names = [t for t in temp_names if t['value'] not in dup_names]
        temp_names.extend(dup_names_objs)
    print("temp names 2:")
    print(temp_names)
    v2_data['names'] = temp_names

    # status
    if csv_data['status']:
        v2_data['status'] = csv_data['status'].strip().lower()

    # types
    if csv_data['types']:
        v2_data['types'] = [t.strip().lower() for t in csv_data['types'].strip(';').split(';')]

    validation_error, new_record = new_record_from_json(v2_data, version)
    if validation_error:
        errors.append(validation_error)
    return errors, new_record
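
For illustration, a hedged example of the flat row this function consumes. The values and type names are invented; a real template has one column per entry in `V2_EXTERNAL_ID_TYPES`, `V2_LINK_TYPES`, and `V2_NAME_TYPES` (every key must be present, even if empty), and `LANG_DELIMITER` is assumed to be the `*` character used in the split:

# Hypothetical csv.DictReader row; only a few of the typed columns are shown.
row = {
    "id": "",                                   # empty -> treated as a new record
    "domains": "example.org;example.edu",       # semicolons delimit multi-values
    "established": "1969",
    "external_ids.type.isni.all": "0000 0001 2096 9829",
    "external_ids.type.isni.preferred": "",     # falls back to first value in .all
    "links.type.website": "https://www.example.org",
    "locations.geonames_id": "5128581",
    "names.types.ror_display": "Example University*en",  # value*lang
    "status": "active",
    "types": "education"
}
errors, record = new_record_from_csv(row, "v2")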