From 2eb68f4e64fd5097b745d8b405ce49701b5fc2c8 Mon Sep 17 00:00:00 2001 From: Karolina Przerwa Date: Thu, 24 Oct 2024 18:41:03 +0200 Subject: [PATCH] rules: implement more fields --- cds_migrator_kit/migration_config.py | 7 +- cds_migrator_kit/rdm/migration/load/load.py | 12 +-- cds_migrator_kit/rdm/migration/streams.yaml | 2 +- .../transform/models/summer_student_report.py | 45 ++++----- .../rdm/migration/transform/transform.py | 99 +++++++++++-------- .../xml_processing/quality/contributors.py | 16 ++- .../transform/xml_processing/rules/base.py | 91 +++++++---------- .../transform/xml_processing/rules/people.py | 1 - .../rules/summer_student_report.py | 54 +++++++++- scripts/copy_collection_files.py | 27 ++--- 10 files changed, 200 insertions(+), 154 deletions(-) diff --git a/cds_migrator_kit/migration_config.py b/cds_migrator_kit/migration_config.py index 9123dc9..5b7eaf9 100644 --- a/cds_migrator_kit/migration_config.py +++ b/cds_migrator_kit/migration_config.py @@ -388,7 +388,7 @@ def _(x): # needed to avoid start time failure with lazy strings CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/migration/streams.yaml" -from invenio_rdm_records.config import RDM_RECORDS_IDENTIFIERS_SCHEMES, always_valid +from invenio_rdm_records.config import RDM_RECORDS_IDENTIFIERS_SCHEMES, always_valid, RDM_RECORDS_PERSONORG_SCHEMES from cds_rdm import schemes RDM_RECORDS_IDENTIFIERS_SCHEMES = {**RDM_RECORDS_IDENTIFIERS_SCHEMES, @@ -402,6 +402,11 @@ def _(x): # needed to avoid start time failure with lazy strings "validator": schemes.is_inspire, "datacite": "INSPIRE"}}} +RDM_RECORDS_PERSONORG_SCHEMES = {**RDM_RECORDS_PERSONORG_SCHEMES, + **{"inspire": {"label": _("Inspire"), + "validator": schemes.is_inspire, + "datacite": "INSPIRE"}}} + CDS_MIGRATOR_KIT_RECORD_STATS_STREAM_CONFIG = dict( ####### Search ############## diff --git a/cds_migrator_kit/rdm/migration/load/load.py b/cds_migrator_kit/rdm/migration/load/load.py index 96b4b96..6bd51a4 100644 --- a/cds_migrator_kit/rdm/migration/load/load.py +++ b/cds_migrator_kit/rdm/migration/load/load.py @@ -155,21 +155,19 @@ def _load_versions(self, entry, logger): """Load other versions of the record.""" versions = entry["versions"] legacy_recid = entry["record"]["recid"] - record = entry["record"] - parent = entry["parent"] def publish_and_mint_recid(draft, version): - record = current_rdm_records_service.publish(system_identity, draft["id"]) + record_item = current_rdm_records_service.publish(system_identity, draft["id"]) # mint legacy ids for redirections if version == 1: - record._record.model.created = arrow.get( + record_item._record.model.created = arrow.get( entry["record"]["created"] ).datetime - record._record.commit() + record_item._record.commit() # it seems more intuitive if we mint the lrecid for parent # but then we get a double redirection - legacy_recid_minter(legacy_recid, record._record.parent.model.id) - return record + legacy_recid_minter(legacy_recid, record_item._record.parent.model.id) + return record_item identity = system_identity # TODO: load users instead ? 
diff --git a/cds_migrator_kit/rdm/migration/streams.yaml b/cds_migrator_kit/rdm/migration/streams.yaml index 44c180e..fdb0777 100644 --- a/cds_migrator_kit/rdm/migration/streams.yaml +++ b/cds_migrator_kit/rdm/migration/streams.yaml @@ -16,4 +16,4 @@ records: transform: files_dump_dir: cds_migrator_kit/rdm/migration/data/summer_student_reports/files/ missing_users: cds_migrator_kit/rdm/migration/data/users - community_id: 63448ca7-c814-4716-b099-a39766df6dbb + community_id: 94ae20a9-57f1-4a4e-bcac-4ef802b88e5d diff --git a/cds_migrator_kit/rdm/migration/transform/models/summer_student_report.py b/cds_migrator_kit/rdm/migration/transform/models/summer_student_report.py index 013cc50..d715303 100644 --- a/cds_migrator_kit/rdm/migration/transform/models/summer_student_report.py +++ b/cds_migrator_kit/rdm/migration/transform/models/summer_student_report.py @@ -33,20 +33,17 @@ class CMSSummerStudent(CdsOverdo): __ignore_keys__ = { # decided to ignore - # "0247_2", # DOI, summer student notes do not have it - # "0247_a", # DOI "0248_a", # oai identifier, not needed to migrate, TBD "0248_p", # oai identifier, not needed to migrate, TBD - "0248_q", # does appear in data, what is this field recid 2778897 + "0248_q", # full text tag 2778897 "100__m", # author's email <-- decided not to keep in RDM, "260__c", # Redundant (more detailed value is in 269__c imprint.pub_date) "270__m", # document contact email "595__a", # always value CERN EDS, not displayed, TODO: do we keep? "595__z", # SOME RECORD HAVE UNCL as value, do we keep it? what does UNCL mean + "700__m", # author's email <-- decided not to keep in RDM, "710__5", # department / organisation author "710__a", # organisation author - "700__m", # Contributors (email) - "700__m", # author's email <-- decided not to keep in RDM, "8564_8", # Files system field "8564_s", # Files system field "8564_u", # Files @@ -56,22 +53,18 @@ class CMSSummerStudent(CdsOverdo): "937__s", # modification person "960__a", # collection id? usually value 12, to confirm if we ignore "980__a", # collection tag - # "980__c", # MIGRATED/DELETED - it shouldn't even make it here - - # TO Implement (to remove from here) - "690C_a", # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field) - # "562__c", # note - # "700__0", # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID - # "693__b", # beams recid: 2640381 - # TO DECIDE # IMPLEMENTED # "001" # "003" + # "035__9", # Inspire schema + # "035__a", # Inspire id value + # "037__a", # (Report number) alternative identifiers -> scheme "CDS REFERENCE" # "041__a", # languages + # "088__a", # RN (manual introduced?) second report number (so the identifiers schemas are not unique!) + # "100__9", # #BEARD# tag # "100__a", # "100__u", # Author affiliation - # "100__9", # #BEARD# tag # "246__a", # "246__i", # abbreviation # "246__i", # abbreviation tag, applies to value of 246__A @@ -83,30 +76,32 @@ class CMSSummerStudent(CdsOverdo): # "65017a", # subject value # "6531_9", # keyword provenance # "6531_a", # keyword + # "690C_a", # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field) # "6931_9", # keyword # "6931_a", # keyword + # "693__a", # accelerator, do we create a custom field? + # "693__b", # beams recid: 2640381 # "693__e", # custom_fields.cern:experiments + # "693__f", # facility, do we create a custom field? 
+ # "693__p", # project, do we create a custom field? + # "693__s", # study, do we create a custom field? + # "700__0", # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID + # "700__9", # #BEARD# tag # "700__a", # Contributors (full name) # "700__u", # Contributors (affiliation) # "710__g", # Collaboration, OK to migrate as corporate contributor (not creator)? - # "700__9", # #BEARD# tag # "859__f", # creator's email, to be used to determine the owner + # "906__p", # names, is it supervisor? # "916__n", # "916__s", # "916__w", # "963__a", - # "693__a", # accelerator, do we create a custom field? - # "693__f", # facility, do we create a custom field? - # "693__p", # project, do we create a custom field? - # "693__s", # study, do we create a custom field? - # "906__p", # names, is it supervisor? # "970__a", # alternative identifier, scheme ALEPH - # "037__a", # (Report number) alternative identifiers -> scheme "CDS REFERENCE" - # "088__a", # RN (manual introduced?) second report number (so the identifiers schemas are not unique!) - # "035__9", # Inspire schema - # "035__a", # Inspire id value + "269__a", # imprint place # TODO + } + _default_fields = { + "resource_type": {"id": "publication-technicalnote"} } - _default_fields = None model = CMSSummerStudent( diff --git a/cds_migrator_kit/rdm/migration/transform/transform.py b/cds_migrator_kit/rdm/migration/transform/transform.py index fd90598..50cc2dd 100644 --- a/cds_migrator_kit/rdm/migration/transform/transform.py +++ b/cds_migrator_kit/rdm/migration/transform/transform.py @@ -84,9 +84,6 @@ def _recid(self, record_dump): """Returns the recid of the record.""" return str(record_dump.data["recid"]) - def _pids(self, json_entry): - return {} - def _bucket_id(self, json_entry): return @@ -196,62 +193,85 @@ def get_person_old_db(email): return user.id def _metadata(self, json_entry): - def creators(json, key="creators"): - _creators = deepcopy(json.get(key, [])) + + def affiliations(creator): vocab_type = "affiliations" service = current_service_registry.get(vocab_type) extra_filter = dsl.Q("term", type__id=vocab_type) - _creators = list(filter(lambda x: x is not None, _creators)) - for creator in _creators: - affiliations = creator.get("affiliations", []) - transformed_aff = [] - for affiliation_name in affiliations: + affiliations = creator.get("affiliations", []) + transformed_aff = [] - title = dsl.Q("match", **{f"title": affiliation_name}) - acronym = dsl.Q( - "match_phrase", **{f"acronym.keyword": affiliation_name} - ) - title_filter = dsl.query.Bool("should", should=[title, acronym]) - vocabulary_result = service.search( - system_identity, extra_filter=title_filter | extra_filter - ).to_dict() - if vocabulary_result["hits"]["total"]: - transformed_aff.append( - { - "name": affiliation_name, - "id": vocabulary_result["hits"]["hits"][0]["id"], - } - ) - else: - raise UnexpectedValue( - subfield="u", - value=affiliation_name, - field="author", - message=f"Affiliation {affiliation_name} not found.", - stage="vocabulary match", - ) + for affiliation_name in affiliations: + + title = dsl.Q("match", **{f"title": affiliation_name}) + acronym = dsl.Q( + "match_phrase", **{f"acronym.keyword": affiliation_name} + ) + title_filter = dsl.query.Bool("should", should=[title, acronym]) + + vocabulary_result = service.search( + system_identity, extra_filter=title_filter | extra_filter + ).to_dict() + if vocabulary_result["hits"]["total"]: + transformed_aff.append( + { + "name": affiliation_name, + "id": 
vocabulary_result["hits"]["hits"][0]["id"], + } + ) + else: + raise UnexpectedValue( + subfield="u", + value=affiliation_name, + field="author", + message=f"Affiliation {affiliation_name} not found.", + stage="vocabulary match", + ) creator["affiliations"] = transformed_aff + + def creator_identifiers(creator): + processed_identifiers = [] + inner_dict = creator.get("person_or_org", {}) + identifiers = inner_dict.get("identifiers", []) + for identifier in identifiers: + # TODO process CDS and CERN Ids when names vocabulary ready + if identifier["scheme"] == "inspire": + processed_identifiers.append(identifier) + if processed_identifiers: + + inner_dict["identifiers"] = processed_identifiers + else: + inner_dict.pop("identifiers", None) + + def creators(json, key="creators"): + _creators = deepcopy(json.get(key, [])) + _creators = list(filter(lambda x: x is not None, _creators)) + for creator in _creators: + affiliations(creator) + creator_identifiers(creator) return _creators - def _resource_type(data): - t = "publication-technicalnote" - st = None - return {"id": f"{t}-{st}"} if st else {"id": t} + def _resource_type(entry): + return entry["resource_type"] - return { + metadata = { "creators": creators(json_entry), "title": json_entry["title"], "resource_type": _resource_type(json_entry), "description": json_entry.get("description"), "publication_date": json_entry.get("publication_date"), "contributors": creators(json_entry, key="contributors"), - "notes": json_entry.get("internal_notes"), "subjects": json_entry.get("subjects"), "publisher": json_entry.get("publisher"), "additional_descriptions": json_entry.get("additional_descriptions"), "identifiers": json_entry.get("identifiers"), + "languages": json_entry.get("languages"), + "_internal_notes": json_entry.get("internal_notes"), + # "imprint": json_entry.get("imprint"), # TODO } + # filter empty keys + return {k: v for k, v in metadata.items() if v} def _custom_fields(self, json_entry): @@ -384,7 +404,6 @@ def transform(self, entry): json_output = { "created": self._created(json_data), "updated": self._updated(record_dump), - "pids": self._pids(json_data), "files": self._files(record_dump), "metadata": self._metadata(json_data), } diff --git a/cds_migrator_kit/rdm/migration/transform/xml_processing/quality/contributors.py b/cds_migrator_kit/rdm/migration/transform/xml_processing/quality/contributors.py index b4de792..a403f89 100644 --- a/cds_migrator_kit/rdm/migration/transform/xml_processing/quality/contributors.py +++ b/cds_migrator_kit/rdm/migration/transform/xml_processing/quality/contributors.py @@ -131,7 +131,6 @@ def get_contributor_role(subfield, role, raise_unexpected=False): def get_contributor_affiliations(info): - aff_results = [] u = info.get("u", "") if not u: return @@ -155,15 +154,12 @@ def extract_json_contributor_ids(info): author_ids = force_list(info.get("0", "")) for author_id in author_ids: match = regex.match(author_id) - # if match: - # ids.append( - # {"identifier": match.group(3), "scheme": SOURCES[match.group(1)]} - # ) - # pass - try: - ids.append({"identifier": info["inspireid"], "scheme": "inspire"}) - except KeyError: - pass + if match: + identifier = match.group(3) + identifier = identifier.replace("INSPIRE-", "") + ids.append( + {"identifier": identifier, "scheme": SOURCES[match.group(1)]} + ) author_orcid = info.get("k") if author_orcid: diff --git a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/base.py b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/base.py index 
861e139..17f04b5 100644 --- a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/base.py @@ -80,13 +80,15 @@ def created(self, key, value): @model.over("title", "^245__") def title(self, key, value): """Translates title.""" - return value.get("a", "TODO") + title = StringValue(value.get("a")) + title.required() + return title.parse() @model.over("description", "^520__") def description(self, key, value): """Translates description.""" - description_text = value.get("a") + description_text = StringValue(value.get("a")).parse() return description_text @@ -97,9 +99,9 @@ def description(self, key, value): def additional_descriptions(self, key, value): """Translates additional description.""" description_text = value.get("a") - + _additional_description = {} if key == "500__": - additional_description = { + _additional_description = { "description": description_text, "type": { "id": "other", # what's with the lang @@ -111,54 +113,15 @@ def additional_descriptions(self, key, value): _abbreviations.append(description_text) if is_abbreviation: - additional_description = { + _additional_description = { "description": "Abbreviations: " + "; ".join(_abbreviations), "type": { "id": "other", # what's with the lang }, } - - return additional_description - - -def publisher(self, key, value): - """Translates publisher.""" - publisher = value.get("b") - if publisher: - self["publisher"] = publisher - else: - raise IgnoreKey("publisher") - - -def publication_date(self, key, value): - """Translates publication_date.""" - publication_date_str = value.get("c") - try: - date_obj = parse(publication_date_str) - self["publication_date"] = date_obj.strftime("%Y-%m-%d") - return - except ParserError: - raise UnexpectedValue( - field="publication_date", - message=f"Can't parse provided publication date. 
Value: {publication_date_str}",
-        )
-
-
-@model.over("imprint", "^269__")
-def imprint(self, key, value):
-    """Translates imprint - WARNING - also publisher and publication_date."""
-
-    imprint = {
-        "place": value.get("a"),
-    }
-
-    if not self.get("publication_date"):
-        publication_date(self, key, value)
-
-    if not self.get("publisher"):
-        publisher(self, key, value)
-
-    return imprint
+    if _additional_description:
+        return _additional_description
+    raise IgnoreKey("additional_descriptions")
 
 
 @model.over("creators", "^100__")
@@ -166,7 +129,9 @@ def imprint(self, key, value):
 @require(["a"])
 def creators(self, key, value):
     """Translates the creators field."""
-    role = get_contributor_role("e", value.get("e", "author"))
+    role = value.get("e")
+    if role:
+        role = get_contributor_role("e", role)
     beard = value.get("9")
     if beard is not None and beard != "#BEARD#":
         # checking if anything else stored in this field
@@ -182,8 +147,9 @@ def creators(self, key, value):
         }
     }
     if role:
-        contributor.update({"role": {"id": role}})  # VOCABULARY ID
-    else:
+        contributor.update({"role": {"id": role}})
+    elif key == "700__":
+        # creators (100) do not need a role; contributors (700) default to "other"
         contributor.update({"role": {"id": "other"}})
 
     if affiliations:
@@ -193,7 +159,6 @@ def creators(self, key, value):
 
 
 @model.over("contributors", "^700__")
-# @for_each_value
 @require(["a"])
 def contributors(self, key, value):
     """Translates contributors."""
@@ -210,7 +175,7 @@ def languages(self, key, value):
     if lang:
         lang = lang.lower()
     try:
-        return pycountry.languages.lookup(lang).alpha_3.upper()
+        return {"id": pycountry.languages.lookup(lang).alpha_3.lower()}
     except (KeyError, AttributeError, LookupError):
         raise UnexpectedValue(field=key, subfield="a")
 
@@ -246,7 +211,7 @@ def subjects(self, key, value):
 @model.over("custom_fields", "(^693__)")
 def custom_fields(self, key, value):
     """Translates custom fields."""
-    _custom_fields = self.get("custom_fields", {})
+    _custom_fields = self.setdefault("custom_fields", {})
     experiments, accelerators, projects, facilities, studies = [], [], [], [], []
 
     if key == "693__":
@@ -260,6 +224,15 @@ def custom_fields(self, key, value):
             facilities += [StringValue(v).parse() for v in force_list(value.get("f"))]
         if "s" in value and value.get("s"):
             studies += [StringValue(v).parse() for v in force_list(value.get("s"))]
+        if "b" in value and value.get("b"):
+            # migrates the beams field (693__b) to subjects/keywords
+            _subjects = self.setdefault("subjects", [])
+            subject_value = StringValue(value.get("b")).parse()
+            subject = {
+                "subject": subject_value,
+            }
+            _subjects.append(subject)
+            raise IgnoreKey("custom_fields")
 
     _custom_fields["cern:experiments"] = experiments
     _custom_fields["cern:accelerators"] = accelerators
@@ -271,11 +244,13 @@ def custom_fields(self, key, value):
 
 @model.over("submitter", "(^859__)")
 def record_submitter(self, key, value):
+    """Translates the record submitter."""
     return value.get("f")
 
 
 @model.over("record_restriction", "(^963__)")
 def record_restriction(self, key, value):
+    """Translates the record restriction field."""
     restr = value.get("a")
     parsed = StringValue(restr).parse()
     if parsed == "PUBLIC":
@@ -300,6 +275,11 @@ def report_number(self, key, value):
 @model.over("identifiers", "^970__")
 @for_each_value
 def aleph_number(self, key, value):
+    """Translates identifiers: ALEPH.
+ + Attention: 035 might contain aleph number + https://github.com/CERNDocumentServer/cds-migrator-kit/issues/21 + """ aleph = StringValue(value.get("a")).parse() if aleph: return {"scheme": "aleph", "identifier": aleph} @@ -308,6 +288,11 @@ def aleph_number(self, key, value): @model.over("identifiers", "^035__") @for_each_value def inspire_number(self, key, value): + """Translates identifiers. + + Attention: might contain aleph number + https://github.com/CERNDocumentServer/cds-migrator-kit/issues/21 + """ id_value = StringValue(value.get("a")).parse() scheme = StringValue(value.get("9")).parse() diff --git a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/people.py b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/people.py index eae33ab..eb98e96 100644 --- a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/people.py +++ b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/people.py @@ -21,7 +21,6 @@ from cds_dojson.marc21.fields.utils import clean_val, out_strip from dojson.utils import force_list -from ..quality.decorators import for_each_value # ATTENTION when COPYING! important which model you use as decorator from ...models.people import model diff --git a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/summer_student_report.py b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/summer_student_report.py index e90d96f..54fb2e7 100644 --- a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/summer_student_report.py +++ b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/summer_student_report.py @@ -17,9 +17,9 @@ # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Common RDM fields.""" - -from cds_dojson.marc21.fields.utils import clean_val, out_strip +from dateutil.parser import ParserError, parse from dojson.errors import IgnoreKey +from dojson.utils import force_list from ..errors import UnexpectedValue, MissingRequiredField from ..quality.decorators import for_each_value @@ -31,6 +31,7 @@ @model.over("contributors", "^270__") @for_each_value def contact_person(self, key, value): + """Translates contact person.""" contributor = { "person_or_org": { "type": "personal", @@ -45,7 +46,8 @@ def contact_person(self, key, value): @model.over("contributors", "^906__") @for_each_value def supervisor(self, key, value): - supervisor = StringValue(value.get("p")) + """Translates supervisor.""" + supervisor = StringValue(value.get("p")).parse() if not supervisor: raise MissingRequiredField(field=key, subfield="p", priority="warning") @@ -64,6 +66,7 @@ def supervisor(self, key, value): @model.over("contributors", "^710__") @for_each_value def corporate_author(self, key, value): + """Translates corporate author.""" if "g" in value: contributor = { "person_or_org": { @@ -77,10 +80,51 @@ def corporate_author(self, key, value): if "5" in value: department = StringValue(value.get("5")).parse() self.get("custom_fields", {}).get("cern:departments", []).append(department) - raise IgnoreKey + raise IgnoreKey("contributors") @model.over("internal_notes", "^562__") @for_each_value def note(self, key, value): - return StringValue(value.get("c")).parse() + """Translates notes""" + return {"note": StringValue(value.get("c")).parse()} + + +@model.over("custom_fields", "^690C_") +def department(self, key, value): + """Translates department.""" + values = force_list(value.get("a")) + for v in values: + if "PUBL" in v: + department 
= v.replace("PUBL", "").strip() + departments = self.get("custom_fields", {}).get("cern:departments", []) + if department not in departments: + departments.append(department) + raise IgnoreKey("custom_fields") + + +@model.over("publication_date", "^269__") +def imprint_info(self, key, value): + """Translates imprint - WARNING - also publisher and publication_date. + + In case of summer student notes this field contains only date + but it needs to be reimplemented for the base set of rules - + it will contain also imprint place + """ + publication_date_str = value.get("c") + _publisher = value.get("b") + + if _publisher and not self.get("publisher"): + self["publisher"] = _publisher + + try: + date_obj = parse(publication_date_str) + return date_obj.strftime("%Y-%m-%d") + except ParserError: + raise UnexpectedValue( + field=key, + value=value, + message=f"Can't parse provided publication date. Value: {publication_date_str}", + ) + + diff --git a/scripts/copy_collection_files.py b/scripts/copy_collection_files.py index f681200..23aa76c 100644 --- a/scripts/copy_collection_files.py +++ b/scripts/copy_collection_files.py @@ -2,17 +2,19 @@ import json import os import shutil - +import io def copy_collection_file(dump_files, destination_prefix, working_dir): - file_log = open(os.path.join(working_dir, "files.log"), "w") + file_log = open(os.path.join(working_dir, "files.log"), "wb") for dump_file in dump_files: - with open(os.path.join(working_dir, dump_file), "r") as json_dump: + with open(dump_file, "r") as json_dump: data = json.load(json_dump) for record in data: legacy_record_files = record["files"] + recid = record["recid"] for legacy_record_file in legacy_record_files: + print("Processing {}".format(recid)) full_path = legacy_record_file["full_path"] # important: last slash path_to_replace = "/opt/cdsweb/var/data/files/" @@ -22,14 +24,17 @@ def copy_collection_file(dump_files, destination_prefix, working_dir): parent_dest_path = os.path.dirname(destination_path) if not os.path.exists(parent_dest_path): os.makedirs(parent_dest_path) - shutil.copy(full_path, destination_path) - file_log.writelines( - [ - f"RECID: {record['recid']}," - f" bibdocid: {legacy_record_file['bibdocid']}" - f" file: {legacy_record_file['full_name']}," - f" destination: {destination_path}" - ] + if not os.path.exists(destination_path): + shutil.copy(full_path, destination_path) + + filename = legacy_record_file['full_name'].encode("utf-8") + file_log.write( + u"RECID: %s bibdocid: %s file: %s, destination: %s \n" % ( + record['recid'], + legacy_record_file['bibdocid'], + filename, + destination_path.encode("utf-8") + ) ) file_log.close()