From 2eb68f4e64fd5097b745d8b405ce49701b5fc2c8 Mon Sep 17 00:00:00 2001 From: Karolina Przerwa Date: Thu, 24 Oct 2024 18:41:03 +0200 Subject: [PATCH] rules: implement more fields --- cds_migrator_kit/migration_config.py | 7 +- cds_migrator_kit/rdm/migration/load/load.py | 12 +-- cds_migrator_kit/rdm/migration/streams.yaml | 2 +- .../transform/models/summer_student_report.py | 45 ++++----- .../rdm/migration/transform/transform.py | 99 +++++++++++-------- .../xml_processing/quality/contributors.py | 16 ++- .../transform/xml_processing/rules/base.py | 91 +++++++---------- .../transform/xml_processing/rules/people.py | 1 - .../rules/summer_student_report.py | 54 +++++++++- scripts/copy_collection_files.py | 27 ++--- 10 files changed, 200 insertions(+), 154 deletions(-) diff --git a/cds_migrator_kit/migration_config.py b/cds_migrator_kit/migration_config.py index 9123dc9..5b7eaf9 100644 --- a/cds_migrator_kit/migration_config.py +++ b/cds_migrator_kit/migration_config.py @@ -388,7 +388,7 @@ def _(x): # needed to avoid start time failure with lazy strings CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/migration/streams.yaml" -from invenio_rdm_records.config import RDM_RECORDS_IDENTIFIERS_SCHEMES, always_valid +from invenio_rdm_records.config import RDM_RECORDS_IDENTIFIERS_SCHEMES, always_valid, RDM_RECORDS_PERSONORG_SCHEMES from cds_rdm import schemes RDM_RECORDS_IDENTIFIERS_SCHEMES = {**RDM_RECORDS_IDENTIFIERS_SCHEMES, @@ -402,6 +402,11 @@ def _(x): # needed to avoid start time failure with lazy strings "validator": schemes.is_inspire, "datacite": "INSPIRE"}}} +RDM_RECORDS_PERSONORG_SCHEMES = {**RDM_RECORDS_PERSONORG_SCHEMES, + **{"inspire": {"label": _("Inspire"), + "validator": schemes.is_inspire, + "datacite": "INSPIRE"}}} + CDS_MIGRATOR_KIT_RECORD_STATS_STREAM_CONFIG = dict( ####### Search ############## diff --git a/cds_migrator_kit/rdm/migration/load/load.py b/cds_migrator_kit/rdm/migration/load/load.py index 96b4b96..6bd51a4 100644 --- a/cds_migrator_kit/rdm/migration/load/load.py +++ b/cds_migrator_kit/rdm/migration/load/load.py @@ -155,21 +155,19 @@ def _load_versions(self, entry, logger): """Load other versions of the record.""" versions = entry["versions"] legacy_recid = entry["record"]["recid"] - record = entry["record"] - parent = entry["parent"] def publish_and_mint_recid(draft, version): - record = current_rdm_records_service.publish(system_identity, draft["id"]) + record_item = current_rdm_records_service.publish(system_identity, draft["id"]) # mint legacy ids for redirections if version == 1: - record._record.model.created = arrow.get( + record_item._record.model.created = arrow.get( entry["record"]["created"] ).datetime - record._record.commit() + record_item._record.commit() # it seems more intuitive if we mint the lrecid for parent # but then we get a double redirection - legacy_recid_minter(legacy_recid, record._record.parent.model.id) - return record + legacy_recid_minter(legacy_recid, record_item._record.parent.model.id) + return record_item identity = system_identity # TODO: load users instead ? 
diff --git a/cds_migrator_kit/rdm/migration/streams.yaml b/cds_migrator_kit/rdm/migration/streams.yaml index 44c180e..fdb0777 100644 --- a/cds_migrator_kit/rdm/migration/streams.yaml +++ b/cds_migrator_kit/rdm/migration/streams.yaml @@ -16,4 +16,4 @@ records: transform: files_dump_dir: cds_migrator_kit/rdm/migration/data/summer_student_reports/files/ missing_users: cds_migrator_kit/rdm/migration/data/users - community_id: 63448ca7-c814-4716-b099-a39766df6dbb + community_id: 94ae20a9-57f1-4a4e-bcac-4ef802b88e5d diff --git a/cds_migrator_kit/rdm/migration/transform/models/summer_student_report.py b/cds_migrator_kit/rdm/migration/transform/models/summer_student_report.py index 013cc50..d715303 100644 --- a/cds_migrator_kit/rdm/migration/transform/models/summer_student_report.py +++ b/cds_migrator_kit/rdm/migration/transform/models/summer_student_report.py @@ -33,20 +33,17 @@ class CMSSummerStudent(CdsOverdo): __ignore_keys__ = { # decided to ignore - # "0247_2", # DOI, summer student notes do not have it - # "0247_a", # DOI "0248_a", # oai identifier, not needed to migrate, TBD "0248_p", # oai identifier, not needed to migrate, TBD - "0248_q", # does appear in data, what is this field recid 2778897 + "0248_q", # full text tag 2778897 "100__m", # author's email <-- decided not to keep in RDM, "260__c", # Redundant (more detailed value is in 269__c imprint.pub_date) "270__m", # document contact email "595__a", # always value CERN EDS, not displayed, TODO: do we keep? "595__z", # SOME RECORD HAVE UNCL as value, do we keep it? what does UNCL mean + "700__m", # author's email <-- decided not to keep in RDM, "710__5", # department / organisation author "710__a", # organisation author - "700__m", # Contributors (email) - "700__m", # author's email <-- decided not to keep in RDM, "8564_8", # Files system field "8564_s", # Files system field "8564_u", # Files @@ -56,22 +53,18 @@ class CMSSummerStudent(CdsOverdo): "937__s", # modification person "960__a", # collection id? usually value 12, to confirm if we ignore "980__a", # collection tag - # "980__c", # MIGRATED/DELETED - it shouldn't even make it here - - # TO Implement (to remove from here) - "690C_a", # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field) - # "562__c", # note - # "700__0", # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID - # "693__b", # beams recid: 2640381 - # TO DECIDE # IMPLEMENTED # "001" # "003" + # "035__9", # Inspire schema + # "035__a", # Inspire id value + # "037__a", # (Report number) alternative identifiers -> scheme "CDS REFERENCE" # "041__a", # languages + # "088__a", # RN (manual introduced?) second report number (so the identifiers schemas are not unique!) + # "100__9", # #BEARD# tag # "100__a", # "100__u", # Author affiliation - # "100__9", # #BEARD# tag # "246__a", # "246__i", # abbreviation # "246__i", # abbreviation tag, applies to value of 246__A @@ -83,30 +76,32 @@ class CMSSummerStudent(CdsOverdo): # "65017a", # subject value # "6531_9", # keyword provenance # "6531_a", # keyword + # "690C_a", # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field) # "6931_9", # keyword # "6931_a", # keyword + # "693__a", # accelerator, do we create a custom field? + # "693__b", # beams recid: 2640381 # "693__e", # custom_fields.cern:experiments + # "693__f", # facility, do we create a custom field? 
+ # "693__p", # project, do we create a custom field? + # "693__s", # study, do we create a custom field? + # "700__0", # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID + # "700__9", # #BEARD# tag # "700__a", # Contributors (full name) # "700__u", # Contributors (affiliation) # "710__g", # Collaboration, OK to migrate as corporate contributor (not creator)? - # "700__9", # #BEARD# tag # "859__f", # creator's email, to be used to determine the owner + # "906__p", # names, is it supervisor? # "916__n", # "916__s", # "916__w", # "963__a", - # "693__a", # accelerator, do we create a custom field? - # "693__f", # facility, do we create a custom field? - # "693__p", # project, do we create a custom field? - # "693__s", # study, do we create a custom field? - # "906__p", # names, is it supervisor? # "970__a", # alternative identifier, scheme ALEPH - # "037__a", # (Report number) alternative identifiers -> scheme "CDS REFERENCE" - # "088__a", # RN (manual introduced?) second report number (so the identifiers schemas are not unique!) - # "035__9", # Inspire schema - # "035__a", # Inspire id value + "269__a", # imprint place # TODO + } + _default_fields = { + "resource_type": {"id": "publication-technicalnote"} } - _default_fields = None model = CMSSummerStudent( diff --git a/cds_migrator_kit/rdm/migration/transform/transform.py b/cds_migrator_kit/rdm/migration/transform/transform.py index fd90598..50cc2dd 100644 --- a/cds_migrator_kit/rdm/migration/transform/transform.py +++ b/cds_migrator_kit/rdm/migration/transform/transform.py @@ -84,9 +84,6 @@ def _recid(self, record_dump): """Returns the recid of the record.""" return str(record_dump.data["recid"]) - def _pids(self, json_entry): - return {} - def _bucket_id(self, json_entry): return @@ -196,62 +193,85 @@ def get_person_old_db(email): return user.id def _metadata(self, json_entry): - def creators(json, key="creators"): - _creators = deepcopy(json.get(key, [])) + + def affiliations(creator): vocab_type = "affiliations" service = current_service_registry.get(vocab_type) extra_filter = dsl.Q("term", type__id=vocab_type) - _creators = list(filter(lambda x: x is not None, _creators)) - for creator in _creators: - affiliations = creator.get("affiliations", []) - transformed_aff = [] - for affiliation_name in affiliations: + affiliations = creator.get("affiliations", []) + transformed_aff = [] - title = dsl.Q("match", **{f"title": affiliation_name}) - acronym = dsl.Q( - "match_phrase", **{f"acronym.keyword": affiliation_name} - ) - title_filter = dsl.query.Bool("should", should=[title, acronym]) - vocabulary_result = service.search( - system_identity, extra_filter=title_filter | extra_filter - ).to_dict() - if vocabulary_result["hits"]["total"]: - transformed_aff.append( - { - "name": affiliation_name, - "id": vocabulary_result["hits"]["hits"][0]["id"], - } - ) - else: - raise UnexpectedValue( - subfield="u", - value=affiliation_name, - field="author", - message=f"Affiliation {affiliation_name} not found.", - stage="vocabulary match", - ) + for affiliation_name in affiliations: + + title = dsl.Q("match", **{f"title": affiliation_name}) + acronym = dsl.Q( + "match_phrase", **{f"acronym.keyword": affiliation_name} + ) + title_filter = dsl.query.Bool("should", should=[title, acronym]) + + vocabulary_result = service.search( + system_identity, extra_filter=title_filter | extra_filter + ).to_dict() + if vocabulary_result["hits"]["total"]: + transformed_aff.append( + { + "name": affiliation_name, + "id": 
vocabulary_result["hits"]["hits"][0]["id"], + } + ) + else: + raise UnexpectedValue( + subfield="u", + value=affiliation_name, + field="author", + message=f"Affiliation {affiliation_name} not found.", + stage="vocabulary match", + ) creator["affiliations"] = transformed_aff + + def creator_identifiers(creator): + processed_identifiers = [] + inner_dict = creator.get("person_or_org", {}) + identifiers = inner_dict.get("identifiers", []) + for identifier in identifiers: + # TODO process CDS and CERN Ids when names vocabulary ready + if identifier["scheme"] == "inspire": + processed_identifiers.append(identifier) + if processed_identifiers: + + inner_dict["identifiers"] = processed_identifiers + else: + inner_dict.pop("identifiers", None) + + def creators(json, key="creators"): + _creators = deepcopy(json.get(key, [])) + _creators = list(filter(lambda x: x is not None, _creators)) + for creator in _creators: + affiliations(creator) + creator_identifiers(creator) return _creators - def _resource_type(data): - t = "publication-technicalnote" - st = None - return {"id": f"{t}-{st}"} if st else {"id": t} + def _resource_type(entry): + return entry["resource_type"] - return { + metadata = { "creators": creators(json_entry), "title": json_entry["title"], "resource_type": _resource_type(json_entry), "description": json_entry.get("description"), "publication_date": json_entry.get("publication_date"), "contributors": creators(json_entry, key="contributors"), - "notes": json_entry.get("internal_notes"), "subjects": json_entry.get("subjects"), "publisher": json_entry.get("publisher"), "additional_descriptions": json_entry.get("additional_descriptions"), "identifiers": json_entry.get("identifiers"), + "languages": json_entry.get("languages"), + "_internal_notes": json_entry.get("internal_notes"), + # "imprint": json_entry.get("imprint"), # TODO } + # filter empty keys + return {k: v for k, v in metadata.items() if v} def _custom_fields(self, json_entry): @@ -384,7 +404,6 @@ def transform(self, entry): json_output = { "created": self._created(json_data), "updated": self._updated(record_dump), - "pids": self._pids(json_data), "files": self._files(record_dump), "metadata": self._metadata(json_data), } diff --git a/cds_migrator_kit/rdm/migration/transform/xml_processing/quality/contributors.py b/cds_migrator_kit/rdm/migration/transform/xml_processing/quality/contributors.py index b4de792..a403f89 100644 --- a/cds_migrator_kit/rdm/migration/transform/xml_processing/quality/contributors.py +++ b/cds_migrator_kit/rdm/migration/transform/xml_processing/quality/contributors.py @@ -131,7 +131,6 @@ def get_contributor_role(subfield, role, raise_unexpected=False): def get_contributor_affiliations(info): - aff_results = [] u = info.get("u", "") if not u: return @@ -155,15 +154,12 @@ def extract_json_contributor_ids(info): author_ids = force_list(info.get("0", "")) for author_id in author_ids: match = regex.match(author_id) - # if match: - # ids.append( - # {"identifier": match.group(3), "scheme": SOURCES[match.group(1)]} - # ) - # pass - try: - ids.append({"identifier": info["inspireid"], "scheme": "inspire"}) - except KeyError: - pass + if match: + identifier = match.group(3) + identifier = identifier.replace("INSPIRE-", "") + ids.append( + {"identifier": identifier, "scheme": SOURCES[match.group(1)]} + ) author_orcid = info.get("k") if author_orcid: diff --git a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/base.py b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/base.py index 
861e139..17f04b5 100644 --- a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/base.py @@ -80,13 +80,15 @@ def created(self, key, value): @model.over("title", "^245__") def title(self, key, value): """Translates title.""" - return value.get("a", "TODO") + title = StringValue(value.get("a")) + title.required() + return title.parse() @model.over("description", "^520__") def description(self, key, value): """Translates description.""" - description_text = value.get("a") + description_text = StringValue(value.get("a")).parse() return description_text @@ -97,9 +99,9 @@ def description(self, key, value): def additional_descriptions(self, key, value): """Translates additional description.""" description_text = value.get("a") - + _additional_description = {} if key == "500__": - additional_description = { + _additional_description = { "description": description_text, "type": { "id": "other", # what's with the lang @@ -111,54 +113,15 @@ def additional_descriptions(self, key, value): _abbreviations.append(description_text) if is_abbreviation: - additional_description = { + _additional_description = { "description": "Abbreviations: " + "; ".join(_abbreviations), "type": { "id": "other", # what's with the lang }, } - - return additional_description - - -def publisher(self, key, value): - """Translates publisher.""" - publisher = value.get("b") - if publisher: - self["publisher"] = publisher - else: - raise IgnoreKey("publisher") - - -def publication_date(self, key, value): - """Translates publication_date.""" - publication_date_str = value.get("c") - try: - date_obj = parse(publication_date_str) - self["publication_date"] = date_obj.strftime("%Y-%m-%d") - return - except ParserError: - raise UnexpectedValue( - field="publication_date", - message=f"Can't parse provided publication date. 
Value: {publication_date_str}",
-        )
-
-
-@model.over("imprint", "^269__")
-def imprint(self, key, value):
-    """Translates imprint - WARNING - also publisher and publication_date."""
-
-    imprint = {
-        "place": value.get("a"),
-    }
-
-    if not self.get("publication_date"):
-        publication_date(self, key, value)
-
-    if not self.get("publisher"):
-        publisher(self, key, value)
-
-    return imprint
+    if _additional_description:
+        return _additional_description
+    raise IgnoreKey("additional_descriptions")
 
 
 @model.over("creators", "^100__")
@@ -166,7 +129,9 @@ def imprint(self, key, value):
 @require(["a"])
 def creators(self, key, value):
     """Translates the creators field."""
-    role = get_contributor_role("e", value.get("e", "author"))
+    role = value.get("e")
+    if role:
+        role = get_contributor_role("e", role)
     beard = value.get("9")
     if beard is not None and beard != "#BEARD#":
         # checking if anything else stored in this field
@@ -182,8 +147,9 @@ def creators(self, key, value):
         }
     }
     if role:
-        contributor.update({"role": {"id": role}})  # VOCABULARY ID
-    else:
+        contributor.update({"role": {"id": role}})
+    elif key == "700__":
+        # creators (100) do not need a role; contributors (700) default to "other"
         contributor.update({"role": {"id": "other"}})
 
     if affiliations:
@@ -193,7 +159,6 @@ def creators(self, key, value):
 
 
 @model.over("contributors", "^700__")
-# @for_each_value
 @require(["a"])
 def contributors(self, key, value):
     """Translates contributors."""
@@ -210,7 +175,7 @@ def languages(self, key, value):
     if lang:
         lang = lang.lower()
     try:
-        return pycountry.languages.lookup(lang).alpha_3.upper()
+        return {"id": pycountry.languages.lookup(lang).alpha_3.lower()}
     except (KeyError, AttributeError, LookupError):
         raise UnexpectedValue(field=key, subfield="a")
 
@@ -246,7 +211,7 @@ def subjects(self, key, value):
 @model.over("custom_fields", "(^693__)")
 def custom_fields(self, key, value):
     """Translates custom fields."""
-    _custom_fields = self.get("custom_fields", {})
+    _custom_fields = self.setdefault("custom_fields", {})
     experiments, accelerators, projects, facilities, studies = [], [], [], [], []
 
     if key == "693__":
@@ -260,6 +224,15 @@ def custom_fields(self, key, value):
             facilities += [StringValue(v).parse() for v in force_list(value.get("f"))]
         if "s" in value and value.get("s"):
             studies += [StringValue(v).parse() for v in force_list(value.get("s"))]
+        if "b" in value and value.get("b"):
+            # migrates the beams field (693__b) to subjects/keywords
+            _subjects = self.setdefault("subjects", [])
+            subject_value = StringValue(value.get("b")).parse()
+            subject = {
+                "subject": subject_value,
+            }
+            _subjects.append(subject)
+            raise IgnoreKey("custom_fields")
 
     _custom_fields["cern:experiments"] = experiments
     _custom_fields["cern:accelerators"] = accelerators
@@ -271,11 +244,13 @@ def custom_fields(self, key, value):
 
 @model.over("submitter", "(^859__)")
 def record_submitter(self, key, value):
+    """Translates the record submitter."""
     return value.get("f")
 
 
 @model.over("record_restriction", "(^963__)")
 def record_restriction(self, key, value):
+    """Translates the record restriction field."""
     restr = value.get("a")
     parsed = StringValue(restr).parse()
     if parsed == "PUBLIC":
@@ -300,6 +275,11 @@ def report_number(self, key, value):
 @model.over("identifiers", "^970__")
 @for_each_value
 def aleph_number(self, key, value):
+    """Translates identifiers: ALEPH.
+ + Attention: 035 might contain aleph number + https://github.com/CERNDocumentServer/cds-migrator-kit/issues/21 + """ aleph = StringValue(value.get("a")).parse() if aleph: return {"scheme": "aleph", "identifier": aleph} @@ -308,6 +288,11 @@ def aleph_number(self, key, value): @model.over("identifiers", "^035__") @for_each_value def inspire_number(self, key, value): + """Translates identifiers. + + Attention: might contain aleph number + https://github.com/CERNDocumentServer/cds-migrator-kit/issues/21 + """ id_value = StringValue(value.get("a")).parse() scheme = StringValue(value.get("9")).parse() diff --git a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/people.py b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/people.py index eae33ab..eb98e96 100644 --- a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/people.py +++ b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/people.py @@ -21,7 +21,6 @@ from cds_dojson.marc21.fields.utils import clean_val, out_strip from dojson.utils import force_list -from ..quality.decorators import for_each_value # ATTENTION when COPYING! important which model you use as decorator from ...models.people import model diff --git a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/summer_student_report.py b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/summer_student_report.py index e90d96f..54fb2e7 100644 --- a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/summer_student_report.py +++ b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/summer_student_report.py @@ -17,9 +17,9 @@ # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Common RDM fields.""" - -from cds_dojson.marc21.fields.utils import clean_val, out_strip +from dateutil.parser import ParserError, parse from dojson.errors import IgnoreKey +from dojson.utils import force_list from ..errors import UnexpectedValue, MissingRequiredField from ..quality.decorators import for_each_value @@ -31,6 +31,7 @@ @model.over("contributors", "^270__") @for_each_value def contact_person(self, key, value): + """Translates contact person.""" contributor = { "person_or_org": { "type": "personal", @@ -45,7 +46,8 @@ def contact_person(self, key, value): @model.over("contributors", "^906__") @for_each_value def supervisor(self, key, value): - supervisor = StringValue(value.get("p")) + """Translates supervisor.""" + supervisor = StringValue(value.get("p")).parse() if not supervisor: raise MissingRequiredField(field=key, subfield="p", priority="warning") @@ -64,6 +66,7 @@ def supervisor(self, key, value): @model.over("contributors", "^710__") @for_each_value def corporate_author(self, key, value): + """Translates corporate author.""" if "g" in value: contributor = { "person_or_org": { @@ -77,10 +80,51 @@ def corporate_author(self, key, value): if "5" in value: department = StringValue(value.get("5")).parse() self.get("custom_fields", {}).get("cern:departments", []).append(department) - raise IgnoreKey + raise IgnoreKey("contributors") @model.over("internal_notes", "^562__") @for_each_value def note(self, key, value): - return StringValue(value.get("c")).parse() + """Translates notes""" + return {"note": StringValue(value.get("c")).parse()} + + +@model.over("custom_fields", "^690C_") +def department(self, key, value): + """Translates department.""" + values = force_list(value.get("a")) + for v in values: + if "PUBL" in v: + department 
= v.replace("PUBL", "").strip() + departments = self.get("custom_fields", {}).get("cern:departments", []) + if department not in departments: + departments.append(department) + raise IgnoreKey("custom_fields") + + +@model.over("publication_date", "^269__") +def imprint_info(self, key, value): + """Translates imprint - WARNING - also publisher and publication_date. + + In case of summer student notes this field contains only date + but it needs to be reimplemented for the base set of rules - + it will contain also imprint place + """ + publication_date_str = value.get("c") + _publisher = value.get("b") + + if _publisher and not self.get("publisher"): + self["publisher"] = _publisher + + try: + date_obj = parse(publication_date_str) + return date_obj.strftime("%Y-%m-%d") + except ParserError: + raise UnexpectedValue( + field=key, + value=value, + message=f"Can't parse provided publication date. Value: {publication_date_str}", + ) + + diff --git a/scripts/copy_collection_files.py b/scripts/copy_collection_files.py index f681200..23aa76c 100644 --- a/scripts/copy_collection_files.py +++ b/scripts/copy_collection_files.py @@ -2,17 +2,19 @@ import json import os import shutil - +import io def copy_collection_file(dump_files, destination_prefix, working_dir): - file_log = open(os.path.join(working_dir, "files.log"), "w") + file_log = open(os.path.join(working_dir, "files.log"), "wb") for dump_file in dump_files: - with open(os.path.join(working_dir, dump_file), "r") as json_dump: + with open(dump_file, "r") as json_dump: data = json.load(json_dump) for record in data: legacy_record_files = record["files"] + recid = record["recid"] for legacy_record_file in legacy_record_files: + print("Processing {}".format(recid)) full_path = legacy_record_file["full_path"] # important: last slash path_to_replace = "/opt/cdsweb/var/data/files/" @@ -22,14 +24,17 @@ def copy_collection_file(dump_files, destination_prefix, working_dir): parent_dest_path = os.path.dirname(destination_path) if not os.path.exists(parent_dest_path): os.makedirs(parent_dest_path) - shutil.copy(full_path, destination_path) - file_log.writelines( - [ - f"RECID: {record['recid']}," - f" bibdocid: {legacy_record_file['bibdocid']}" - f" file: {legacy_record_file['full_name']}," - f" destination: {destination_path}" - ] + if not os.path.exists(destination_path): + shutil.copy(full_path, destination_path) + + filename = legacy_record_file['full_name'].encode("utf-8") + file_log.write( + u"RECID: %s bibdocid: %s file: %s, destination: %s \n" % ( + record['recid'], + legacy_record_file['bibdocid'], + filename, + destination_path.encode("utf-8") + ) ) file_log.close()