"""CRKN processing script (crkn/process.py, "Add CRKN processing script").

Walks a source tree looking for ``metadata.xml`` files, reads every
``issueinfo`` element they contain and copies the item's images, its
``metadata.xml`` and its ``cmr.xml`` into::

    <output>/<Cleaned_Title>/<published>/<series>/<sequence>/

The layout this script expects for each item is::

    <item>/data/cmr.xml
    <item>/data/sip/data/metadata.xml

A ``metadata.xml.processed`` marker file is written next to each handled
metadata file so that later runs skip it.

Usage: ``python process.py <source_path> <output_path>``
"""

import os
import re
import shutil
import sys
import xml.etree.ElementTree as ET

from pathlib import Path

# Namespace prefix ElementTree puts in front of every issueinfo tag.
_ISSUEINFO_NS = '{http://canadiana.ca/schema/2012/xsd/issueinfo}'

# File extensions (lower case) that are copied as page images.
_IMAGE_EXTENSIONS = ('.png', '.tif', '.tiff', '.jpeg', '.jpg', '.jp2')

# Trailing path components of a metadata file inside an item directory.
_METADATA_LAYOUT = ('data', 'sip', 'data', 'metadata.xml')

# issueinfo fields needed to build the destination directory.
_REQUIRED_ISSUE_FIELDS = ('title', 'published', 'series', 'sequence')


def check_metadata_structure_assertions(path):
    """Validate the source tree and return ``path`` unchanged.

    Every ``cmr.xml`` found below ``path`` must have a sibling
    ``sip/data/metadata.xml``. Prints a message and exits with status 1
    when ``path`` is empty or when a metadata file is missing.
    """
    if not path:
        print('Input path not specified.')
        sys.exit(1)
    # Search the path that was passed in, not sys.argv, so the function
    # validates what the caller asked it to validate.
    for cmr_file_path in find_in_path('cmr.xml', path):
        cmr_dir = os.path.dirname(cmr_file_path)
        metadata_file_path = os.path.join(cmr_dir, 'sip/data/metadata.xml')
        if not os.path.isfile(metadata_file_path):
            print('No metadata.xml file found for CMR file: ' + cmr_file_path)
            sys.exit(1)
    return path


def check_output_path(path):
    """Return ``path`` if it is an existing directory, else exit(1)."""
    if not path:
        print('Output path not specified.')
        sys.exit(1)
    if not os.path.isdir(path):
        print('Output path does not exist: ' + path)
        sys.exit(1)
    return path


def clean_title_for_path(title):
    """Turn an issue title into a directory-name-safe string.

    Keeps only the text before the first ``:``, drops bracketed text,
    replaces every run of non-ASCII-alphanumeric characters with a single
    separator, then joins the words with ``_`` in title case.
    """
    title = title.split(':')[0].strip()
    title = re.sub(r"\[.*\]", "", title).strip()
    title = re.sub(r'[^0-9a-zA-Z]+', ' ', title).strip()
    title = re.sub(r'\s\s+', ' ', title).strip()
    return title.replace(' ', '_').title()


def find_images_in_path(path):
    """Return the paths of all image files found anywhere below ``path``."""
    return [
        os.path.join(root, file_found)
        for root, _dirs, files in os.walk(path)
        for file_found in files
        if file_found.lower().endswith(_IMAGE_EXTENSIONS)
    ]


def find_in_path(name, path):
    """Return the path of every file called ``name`` below ``path``."""
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(path)
        if name in files
    ]


def generate_issue_path(metadata_file_path, issue_metadata, output_path):
    """Build the destination directory for one issue.

    ``metadata_file_path`` is accepted for interface compatibility but is
    not used. ``issue_metadata`` must provide the ``title``, ``published``,
    ``series`` and ``sequence`` keys; a missing key raises ``KeyError``.
    """
    cleaned_title = clean_title_for_path(issue_metadata['title'])
    series = issue_metadata['series']
    sequence = issue_metadata['sequence']
    year = issue_metadata['published']
    return os.path.join(output_path, cleaned_title, year, series, sequence)


def get_metadata_file_marker_path(metadata_file_path):
    """Return the path of the "processed" marker for a metadata file."""
    # Append the suffix rather than str.replace('.xml', ...): replace would
    # also rewrite any '.xml' appearing in a directory name.
    return metadata_file_path + '.processed'


def mark_metadata_file_as_processed(metadata_file_path):
    """Create (or touch) the marker file next to ``metadata_file_path``."""
    Path(get_metadata_file_marker_path(metadata_file_path)).touch()


def metadata_file_needs_processing(metadata_file_path):
    """Return True unless a marker file exists for ``metadata_file_path``."""
    return not os.path.isfile(get_metadata_file_marker_path(metadata_file_path))


def _item_root_for(metadata_file_path):
    """Return the item directory holding this metadata file, or None.

    None means the file does not sit at ``<item>/data/sip/data/metadata.xml``.
    """
    metadata = Path(metadata_file_path)
    if metadata.parts[-4:] != _METADATA_LAYOUT:
        return None
    return str(metadata.parents[3])


def _read_issue_metadata(issueinfo, metadata_file_path):
    """Map each child of an issueinfo element to its text, tag un-namespaced.

    Exits with status 1 when a field required for the destination path is
    missing or empty, instead of failing later with a bare traceback.
    """
    issue_metadata = {
        child.tag.replace(_ISSUEINFO_NS, ''): child.text for child in issueinfo
    }
    missing = [
        field for field in _REQUIRED_ISSUE_FIELDS
        if not issue_metadata.get(field)
    ]
    if missing:
        print(
            'Missing issueinfo field(s) ' + ', '.join(missing)
            + ' in: ' + metadata_file_path
        )
        sys.exit(1)
    return issue_metadata


def _process_metadata_file(metadata_file_path, output_path):
    """Copy the files of every issue described by one metadata file."""
    root = ET.parse(metadata_file_path).getroot()
    issues = root.findall('.//' + _ISSUEINFO_NS + 'issueinfo')
    copy_source = _item_root_for(metadata_file_path)
    for issueinfo in issues:
        issue_metadata = _read_issue_metadata(issueinfo, metadata_file_path)
        issue_path = generate_issue_path(
            metadata_file_path, issue_metadata, output_path
        )
        Path(issue_path).mkdir(parents=True, exist_ok=True)
        if copy_source is not None:
            for image in find_images_in_path(copy_source):
                shutil.copy2(image, issue_path)
        shutil.copy2(metadata_file_path, issue_path)
        if copy_source is not None:
            cmr_file_path = os.path.join(copy_source, 'data/cmr.xml')
            if os.path.isfile(cmr_file_path):
                shutil.copy2(cmr_file_path, issue_path)
    if issues:
        mark_metadata_file_as_processed(metadata_file_path)
    else:
        # Nothing was copied, so leave the file unmarked for a later run.
        print('No issueinfo found in: ' + metadata_file_path)


def main(argv=None):
    """Run the script; ``argv`` defaults to the command-line arguments."""
    args = sys.argv[1:] if argv is None else list(argv)
    # Substitute "" for absent arguments so the check_* functions report
    # them with their own messages instead of an IndexError.
    source_arg = args[0] if len(args) > 0 else ""
    output_arg = args[1] if len(args) > 1 else ""
    source_path = check_metadata_structure_assertions(source_arg)
    output_path = check_output_path(output_arg)

    for metadata_file_path in find_in_path('metadata.xml', source_path):
        if not metadata_file_needs_processing(metadata_file_path):
            print("Skipping: " + metadata_file_path + "...")
            continue
        print("Processing: " + metadata_file_path + "...")
        _process_metadata_file(metadata_file_path, output_path)


if __name__ == "__main__":
    main()