Merge pull request #385 from ror-community/dev
Merge dev to staging: CSV bulk update
lizkrznarich authored Apr 1, 2024
2 parents 224e36e + c777623 commit 8d14fc8
Showing 28 changed files with 1,676 additions and 98 deletions.
14 changes: 13 additions & 1 deletion .github/workflows/dev.yml
@@ -35,6 +35,18 @@ jobs:
         uses: actions/checkout@v2
         with:
           path: ror-api
+      - name: Checkout ror-data-test
+        uses: actions/checkout@v2
+        with:
+          repository: ror-community/ror-data-test
+          token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
+          path: ror-data-test
+      - name: Get last data dump name
+        working-directory: ./ror-data-test
+        run: |
+          FILE="$(ls -Art *.zip | tail -n 1)"
+          echo ${FILE%.*}
+          echo "LATEST_DUMP_FILE=${FILE%.*}" >> $GITHUB_ENV
       - name: Cache dependency
         uses: actions/cache@v2
         with:
@@ -57,7 +69,7 @@ jobs:
       - name: Setup
         working-directory: ./ror-api
         run: |
-          python manage.py setup v1.35-2023-10-26-ror-data -t
+          python manage.py setup v1.42-2024-02-21-ror-data -t
           # Dump file temp hard coded for v2 beta
           # Pulled from ror-data-test per settings.py config
       - name: Test
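The new "Get last data dump name" step leans on two shell idioms: `ls -Art *.zip | tail -n 1` sorts the dump archives by modification time (oldest first) and takes the last entry, i.e. the newest one, and `${FILE%.*}` strips the `.zip` extension before the name is exported to `$GITHUB_ENV`. A minimal Python sketch of the same logic, with an illustrative directory path:

```python
from pathlib import Path

def latest_dump_name(data_dir: str) -> str:
    """Return the newest *.zip in data_dir minus its extension,
    mirroring `ls -Art *.zip | tail -n 1` plus `${FILE%.*}`."""
    zips = sorted(Path(data_dir).glob("*.zip"), key=lambda p: p.stat().st_mtime)
    if not zips:
        raise FileNotFoundError(f"no data dumps found in {data_dir}")
    return zips[-1].stem  # e.g. "v1.42-2024-02-21-ror-data"

print(latest_dump_name("./ror-data-test"))
```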
2 changes: 1 addition & 1 deletion Dockerfile
@@ -16,7 +16,7 @@ RUN mv /etc/apt/sources.list.d /etc/apt/sources.list.d.bak && \
     mv /etc/apt/sources.list.d.bak /etc/apt/sources.list.d && \
     apt-get upgrade -y -o Dpkg::Options::="--force-confold" && \
     apt-get clean && \
-    apt-get install ntp wget unzip tzdata python3-pip -y && \
+    apt-get install ntp wget unzip tzdata python3-pip libmagic1 -y && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

 # Enable Passenger and Nginx and remove the default site
168 changes: 166 additions & 2 deletions README.md

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions requirements.txt
@@ -20,5 +20,8 @@ boto3
 pandas==1.4.1
 numpy==1.22
 titlecase==2.3
-update_address @ git+https://github.com/ror-community/update_address.git
-launchdarkly-server-sdk
+update_address @ git+https://github.com/ror-community/update_address.git@v2-locations
+launchdarkly-server-sdk==7.6.1
+jsonschema==3.2.0
+python-magic
+iso639-lang
96 changes: 96 additions & 0 deletions rorapi/common/create_update.py
@@ -0,0 +1,96 @@
import copy
from datetime import datetime
from rorapi.common.record_utils import *
import update_address as ua
from rorapi.v2.record_constants import *
from rorapi.v2.serializers import (
    OrganizationSerializer as OrganizationSerializerV2
)
from rorapi.management.commands.generaterorid import check_ror_id

V2_SCHEMA = get_file_from_url("https://raw.githubusercontent.com/ror-community/ror-schema/schema-v2/ror_schema_v2_0.json")


def update_record(json_input, existing_record):
    record = copy.deepcopy(existing_record)
    for k, v in json_input.items():
        record[k] = copy.deepcopy(v)
    return update_last_mod(record)

def update_last_mod(record):
    record['admin']['last_modified'] = copy.deepcopy(V2_LAST_MOD)
    record['admin']['last_modified']['date'] = datetime.now().strftime("%Y-%m-%d")
    return record

def check_optional_fields(record):
    for k in V2_OPTIONAL_FIELD_DEFAULTS:
        if k not in record:
            return True
    return False

def add_missing_optional_fields(record):
    for k, v in V2_OPTIONAL_FIELD_DEFAULTS.items():
        if k not in record:
            record[k] = v
    return record

def add_created_last_mod(record):
    today = datetime.now().strftime("%Y-%m-%d")
    record['admin'] = copy.deepcopy(V2_ADMIN)
    record['admin']['created']['date'] = today
    record['admin']['last_modified']['date'] = today
    return record

def update_locations(locations):
    error = None
    updated_locations = []
    for location in locations:
        if 'geonames_id' in location:
            try:
                print(location['geonames_id'])
                updated_location = ua.new_geonames_v2(str(location['geonames_id']))
                updated_locations.append(updated_location['location'])
            except:
                error = "Error retrieving Geonames data for ID {}. Please check that this is a valid Geonames ID".format(location['geonames_id'])
    return error, updated_locations

def sort_list_fields(v2_record):
    for field in v2_record:
        if field in V2_SORT_KEYS:
            if V2_SORT_KEYS[field] is not None:
                sort_key = V2_SORT_KEYS[field]
                sorted_vals = sorted(v2_record[field], key=lambda x: x[sort_key])
            else:
                sorted_vals = sorted(v2_record[field])
            v2_record[field] = sorted_vals
    return v2_record


def new_record_from_json(json_input, version):
    error = None
    valid_data = None
    new_record = copy.deepcopy(json_input)
    if check_optional_fields(new_record):
        new_record = add_missing_optional_fields(new_record)
    error, updated_locations = update_locations(new_record['locations'])
    if not error:
        new_record['locations'] = updated_locations
        new_record = add_created_last_mod(new_record)
        new_ror_id = check_ror_id(version)
        print("new ror id: " + new_ror_id)
        new_record['id'] = new_ror_id
        error, valid_data = validate_record(sort_list_fields(new_record), V2_SCHEMA)
    return error, valid_data


def update_record_from_json(new_json, existing_org):
    error = None
    valid_data = None
    serializer = OrganizationSerializerV2(existing_org)
    existing_record = serializer.data
    updated_record = update_record(new_json, existing_record)
    error, updated_locations = update_locations(updated_record['locations'])
    if not error:
        updated_record['locations'] = updated_locations
        error, valid_data = validate_record(sort_list_fields(updated_record), V2_SCHEMA)
    return error, valid_data
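Both entry points return an `(error, valid_data)` tuple rather than raising, so callers branch on the first element. A hedged usage sketch; the draft payload and the `version` value passed here are illustrative, not taken from the commit:

```python
from rorapi.common.create_update import new_record_from_json

# Illustrative minimal v2-style payload; missing optional fields are
# filled in from V2_OPTIONAL_FIELD_DEFAULTS before validation.
draft = {
    "names": [{"types": ["ror_display"], "value": "Example University", "lang": "en"}],
    "locations": [{"geonames_id": 2643743, "geonames_details": {}}],
    "status": "active",
    "types": ["education"],
}

error, record = new_record_from_json(draft, version="v2")  # version value assumed
if error:
    print("validation failed:", error)
else:
    print("minted id:", record["id"])
```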
118 changes: 118 additions & 0 deletions rorapi/common/csv_bulk.py
@@ -0,0 +1,118 @@
import csv
import json
import io
import os
import shutil
import urllib
from datetime import datetime
from rest_framework.renderers import JSONRenderer
from rorapi.settings import DATA
from rorapi.v2.serializers import (
    OrganizationSerializer as OrganizationSerializerV2
)
from rorapi.common.csv_update import update_record_from_csv
from rorapi.common.csv_create import new_record_from_csv


def save_record_file(ror_id, updated, json_obj, dir_name):
    dir_path = os.path.join(DATA['DIR'], dir_name)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    subdir = 'updates' if updated else 'new'
    if not os.path.exists(os.path.join(dir_path, subdir)):
        os.mkdir(os.path.join(dir_path, subdir))
    full_path = os.path.join(dir_path, subdir, ror_id.split('https://ror.org/')[1] + '.json')
    with open(full_path, "w") as outfile:
        json.dump(json_obj, outfile, ensure_ascii=False, indent=2)

def save_report_file(report, report_fields, csv_file, dir_name, validate_only):
    dir_path = os.path.join(DATA['DIR'], dir_name)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    filepath = os.path.join(dir_path, 'report.csv')
    with open(filepath, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=report_fields)
        writer.writeheader()
        writer.writerows(report)
    if not validate_only:
        # save copy of input file
        filepath = os.path.join(dir_path, 'input.csv')
        csv_file.seek(0)
        with open(filepath, 'wb+') as f:
            for chunk in csv_file.chunks():
                f.write(chunk)

def process_csv(csv_file, version, validate_only):
    print("Processing CSV")
    dir_name = datetime.now().strftime("%Y-%m-%d_%H_%M_%S") + "-ror-records"
    success_msg = None
    error = None
    report = []
    report_fields = ['row', 'ror_id', 'action', 'errors']
    skipped_count = 0
    updated_count = 0
    new_count = 0
    read_file = csv_file.read().decode('utf-8')
    print(read_file)
    reader = csv.DictReader(io.StringIO(read_file))
    row_num = 2
    for row in reader:
        ror_id = None
        updated = False
        print("Row data")
        print(row)
        if row['id']:
            ror_id = row['id']
            updated = True
            row_errors, v2_record = update_record_from_csv(row, version)
        else:
            row_errors, v2_record = new_record_from_csv(row, version)
        if not row_errors:
            if updated:
                action = 'updated'
                updated_count += 1
            else:
                action = 'created'
                new_count += 1
                ror_id = v2_record['id']
            serializer = OrganizationSerializerV2(v2_record)
            json_obj = json.loads(JSONRenderer().render(serializer.data))
            print(json_obj)
            if not validate_only:
                # create file
                file = save_record_file(ror_id, updated, json_obj, dir_name)
        else:
            action = 'skipped'
            skipped_count += 1
        if validate_only and action == 'created':
            ror_id = None
        report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": "; ".join(row_errors) if row_errors else ''})
        row_num += 1
    if new_count > 0 or updated_count > 0 or skipped_count > 0:
        try:
            if validate_only:
                try:
                    save_report_file(report, report_fields, csv_file, dir_name, validate_only)
                    success_msg = os.path.join(DATA['DIR'], dir_name, 'report.csv')
                except Exception as e:
                    error = f"Error creating validation report: {e}"
            else:
                # create report file
                save_report_file(report, report_fields, csv_file, dir_name, validate_only)
                # create zip file
                zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name)
                # upload to S3
                try:
                    DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name + '.zip')
                    zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip"
                    success_msg = {"file": zipfile,
                                   "rows processed": new_count + updated_count + skipped_count,
                                   "created": new_count,
                                   "updated": updated_count,
                                   "skipped": skipped_count}
                except Exception as e:
                    error = f"Error uploading zipfile to S3: {e}"
        except Exception as e:
            error = f"Unexpected error generating records: {e}"

    return error, success_msg
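`process_csv` expects a Django-style uploaded file rather than a plain file object, since it calls `read()`, `seek()`, and `chunks()`. A hedged sketch using Django's `SimpleUploadedFile` wrapper; the file name and `version` value are illustrative:

```python
from django.core.files.uploadedfile import SimpleUploadedFile
from rorapi.common.csv_bulk import process_csv

with open("bulk_updates.csv", "rb") as f:
    upload = SimpleUploadedFile("bulk_updates.csv", f.read(), content_type="text/csv")

# validate_only=True writes report.csv without saving records or uploading to S3
error, result = process_csv(upload, version="v2", validate_only=True)  # version value assumed
if error:
    print(error)
else:
    print("report written to:", result)
```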
110 changes: 110 additions & 0 deletions rorapi/common/csv_create.py
@@ -0,0 +1,110 @@
import copy
from rorapi.common.record_utils import *
from rorapi.common.csv_utils import *
from rorapi.v2.record_constants import *
from rorapi.common.serializers import ErrorsSerializer
from rorapi.common.create_update import new_record_from_json


def new_record_from_csv(csv_data, version):
    v2_data = copy.deepcopy(V2_TEMPLATE)
    errors = []
    # domains
    if csv_data['domains']:
        v2_data['domains'] = [d.strip() for d in csv_data['domains'].strip(';').split(';')]

    # established
    if csv_data['established']:
        v2_data['established'] = int(csv_data['established'].strip())

    # external ids
    for k, v in V2_EXTERNAL_ID_TYPES.items():
        if csv_data['external_ids.type.' + v + '.all']:
            all_ids = [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].strip(';').split(';')]
            ext_id_obj = {
                "type": v,
                "all": all_ids,
                "preferred": csv_data['external_ids.type.' + v + '.preferred'].strip() if csv_data['external_ids.type.' + v + '.preferred'] else all_ids[0]
            }
            v2_data['external_ids'].append(ext_id_obj)

    # links
    for k, v in V2_LINK_TYPES.items():
        if csv_data['links.type.' + v]:
            for l in csv_data['links.type.' + v].strip(';').split(';'):
                link_obj = {
                    "type": v,
                    "value": l.strip()
                }
                v2_data['links'].append(link_obj)

    # locations
    if csv_data['locations.geonames_id']:
        geonames_ids = [i.strip() for i in csv_data['locations.geonames_id'].strip(';').split(';')]
        for geonames_id in geonames_ids:
            location_obj = {
                "geonames_id": geonames_id,
                "geonames_details": {}
            }
            v2_data['locations'].append(location_obj)

    # names
    temp_names = []
    for k, v in V2_NAME_TYPES.items():
        if csv_data['names.types.' + v]:
            for n in csv_data['names.types.' + v].strip(';').split(';'):
                if LANG_DELIMITER in n:
                    name_val, lang = n.split("*")
                    if lang:
                        lang_errors, lang_code = get_lang_code(lang.strip())
                        if lang_errors:
                            errors.append("Could not convert language value to ISO code: {}".format(lang))
                else:
                    name_val = n
                    lang_code = None

                name_obj = {
                    "types": [v],
                    "value": name_val.strip(),
                    "lang": lang_code
                }
                temp_names.append(name_obj)
    print("temp names 1:")
    print(temp_names)
    name_values = [n['value'] for n in temp_names]
    dup_names = []
    for n in name_values:
        if name_values.count(n) > 1:
            if n not in dup_names:
                dup_names.append(n)
    if dup_names:
        dup_names_objs = []
        for d in dup_names:
            types = []
            for t in temp_names:
                if t['value'] == d:
                    types.extend(t['types'])
            name_obj = {
                "types": types,
                "value": d,
                "lang": None
            }
            dup_names_objs.append(name_obj)
        temp_names = [t for t in temp_names if t['value'] not in dup_names]
        temp_names.extend(dup_names_objs)
    print("temp names 2:")
    print(temp_names)
    v2_data['names'] = temp_names

    # status
    if csv_data['status']:
        v2_data['status'] = csv_data['status'].strip().lower()

    # types
    if csv_data['types']:
        v2_data['types'] = [t.strip().lower() for t in csv_data['types'].strip(';').split(';')]

    validation_error, new_record = new_record_from_json(v2_data, version)
    if validation_error:
        errors.append(validation_error)
    return errors, new_record
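The column conventions visible above: multi-valued cells are semicolon-delimited, and a name can carry a language tag after `*` (judging by the `n.split("*")` call, `*` is the `LANG_DELIMITER` imported from `csv_utils`). A hedged sketch of a row as `csv.DictReader` would yield it; the column values are illustrative, and a real file must contain every column the function indexes (one per external-id, link, and name type), which `DictReader` fills with `''` when empty:

```python
# Illustrative subset of the bulk-CSV columns consumed above.
row = {
    "id": "",                                  # empty -> treated as a new record
    "domains": "example.org;example.edu",      # semicolon-delimited multi-values
    "established": "1968",
    "locations.geonames_id": "2643743",
    "names.types.ror_display": "Example University*en",   # '*' separates value and language
    "names.types.label": "Universite d'Exemple*fr",
    "status": "active",
    "types": "education",
}
```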
