Skip to content

Commit

Permalink
transform: files upload prototype
Browse files Browse the repository at this point in the history
  • Loading branch information
kpsherva committed Jul 25, 2024
1 parent 980c32e commit 8d0e990
Show file tree
Hide file tree
Showing 7 changed files with 173 additions and 18 deletions.
1 change: 1 addition & 0 deletions cds_migrator_kit/migration_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,4 @@ def _(x): # needed to avoid start time failure with lazy strings
base_path = os.path.dirname(os.path.realpath(__file__))
logs_dir = os.path.join(base_path, "tmp/logs/")
CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
INVENIO_CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/migration/streams.yaml"
26 changes: 25 additions & 1 deletion cds_migrator_kit/rdm/migration/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,32 @@ current_rdm_records_service.indexer.bulk_index((rec.id for rec in records))
```


### To visualise the errors:
### To visualise the errors (locally):

```shell
gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app
```


### Full migration workflow of one collection

#### Legacy

```shell
ssh cds-wn-31 # inveniomigrator tool installed here
kinit cdsrdmeosdev
cd /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/dump
#cd /eos/media/cds/test/rdm/migration/summer-student-notes/dump
inveniomigrator dump records -q '980__:NOTE 037__:CERN-STUDENTS-Note-* -980:DELETED' --file-prefix summer-studends-notes --latest-only --chunk-size=1000
python copy_collection_files.py --dump-folder /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/dump --files-destination /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/files
#python copy_collection_files.py --dump-folder /eos/media/cds/test/rdm/migration/summer-student-notes/dump --files-destination /eos/media/cds/test/rdm/migration/summer-student-notes/files
```


#### Openshift migration pod

```shell
invenio migration run
```

Visit https://migration-cds-rdm-dev.app.cern.ch to see the migration report.
4 changes: 3 additions & 1 deletion cds_migrator_kit/rdm/migration/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import click
from flask.cli import with_appcontext
from flask import current_app

from cds_migrator_kit.rdm.migration.runner import Runner
from cds_migrator_kit.rdm.migration.streams import RecordStreamDefinition
Expand All @@ -26,8 +27,9 @@ def migration():
@with_appcontext
def run():
    """Run the migration stream described by the configured YAML file."""
    config_path = current_app.config["INVENIO_CDS_MIGRATOR_KIT_STREAM_CONFIG"]
    runner = Runner(
        stream_definitions=[RecordStreamDefinition],
        config_filepath=Path(config_path).absolute(),
    )
    runner.run()
30 changes: 29 additions & 1 deletion cds_migrator_kit/rdm/migration/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
from invenio_access.permissions import system_identity
from invenio_rdm_migrator.load.base import Load
from invenio_rdm_records.proxies import current_rdm_records_service
import os


def import_legacy_files(filepath):
    """Open a legacy file dump for reading.

    Returns a binary file object positioned at the start of the file.
    NOTE: the caller is responsible for closing the returned stream once
    the content has been consumed.
    """
    return open(filepath, "rb")


class CDSRecordServiceLoad(Load):
Expand All @@ -29,10 +36,31 @@ def _prepare(self, entry):

def _load(self, entry):
"""Use the services to load the entries."""
identity = system_identity # Should we create an idenity for the migration?
identity = system_identity # Should we create an identity for the migration?
draft = current_rdm_records_service.create(identity, entry["record"]["json"])
draft_files = entry["draft_files"]

for file in draft_files:
current_rdm_records_service.draft_files.init_files(
identity,
draft.id,
data=[
{"key": file["key"], "metadata": file["metadata"],
"access": {"hidden": False}}],
)
current_rdm_records_service.draft_files.set_file_content(
identity, draft.id, file["key"],
import_legacy_files(file["eos_tmp_path"])
)
result = current_rdm_records_service.draft_files.commit_file(identity,
draft.id,
file["key"])
legacy_checksum = f"md5:{file['checksum']}"
new_checksum = result.to_dict()["checksum"]
assert legacy_checksum == new_checksum
current_rdm_records_service.publish(system_identity, draft["id"])


def _cleanup(self, *args, **kwargs):
"""Cleanup the entries."""
pass
2 changes: 2 additions & 0 deletions cds_migrator_kit/rdm/migration/streams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ new_secret_key: CHANGE_ME
records:
extract:
dirpath: cds_migrator_kit/rdm/migration/data/summer_student_reports
transform:
files_dump_dir: cds_migrator_kit/rdm/migration/data/files/
75 changes: 60 additions & 15 deletions cds_migrator_kit/rdm/migration/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import datetime
import logging
from pathlib import Path

from invenio_rdm_migrator.streams.records.transform import (
RDMRecordEntry,
Expand Down Expand Up @@ -84,7 +85,8 @@ def _pids(self, json_entry):

def _files(self, record_dump):
"""Transform the files of a record."""
files = record_dump.prepare_files()
record_dump.prepare_files()
files = record_dump.files
return {"enabled": True if files else False}

def _communities(self, json_entry):
Expand Down Expand Up @@ -158,6 +160,11 @@ def transform(self, entry):
class CDSToRDMRecordTransform(RDMRecordTransform):
"""CDSToRDMRecordTransform."""

def __init__(self, workers=None, throw=False, files_dump_dir=None):
"""Constructor."""
self.files_dump_dir = Path(files_dump_dir).absolute().as_posix()
super().__init__(workers, throw)

def _community_id(self, entry, record):
communities = record.get("communities")
if communities:
Expand Down Expand Up @@ -201,26 +208,64 @@ def _transform(self, entry):
}

def _record(self, entry):
# could be in draft as well, depends on how we decide to publish
return CDSToRDMRecordEntry().transform(entry)

def _draft(self, entry):
return None

def _draft_files(self, entry):
return None
"""Point to temporary eos storage to import files from."""
_files = entry["files"]
draft_files = []
legacy_path_root = Path("/opt/cdsweb/var/data/files/")
tmp_eos_root = Path(self.files_dump_dir)

for file in _files:
full_path = Path(file["full_path"])
draft_files.append({
"eos_tmp_path": tmp_eos_root / full_path.relative_to(legacy_path_root),
"key": file["full_name"],
"metadata": {},
"mimetype": file["mime"],
"checksum": file["checksum"]
})
return draft_files

def _record_files(self, entry, record):
# files = entry["json"].get("_files", [])
# return [
# {
# "key": f["key"],
# "object_version": {
# "file": {
# "size": f["size"],
# "checksum": f["checksum"],
# },
# },
# }
# for f in files
# ]
"""Record files entries transform."""
# TO implement if we decide not to go via draft publish
return []

#
#
# "files": [
# {
# "comment": null,
# "status": "firerole: allow group \"council-full [CERN]\"\ndeny until \"1996-02-01\"\nallow all",
# "version": 1,
# "encoding": null,
# "creation_date": "2009-11-03T12:29:06+00:00",
# "bibdocid": 502379,
# "mime": "application/pdf",
# "full_name": "CM-P00080632-e.pdf",
# "superformat": ".pdf",
# "recids_doctype": [[32097, "Main", "CM-P00080632-e.pdf"]],
# "path": "/opt/cdsweb/var/data/files/g50/502379/CM-P00080632-e.pdf;1",
# "size": 5033532,
# "license": {},
# "modification_date": "2009-11-03T12:29:06+00:00",
# "copyright": {},
# "url": "http://cds.cern.ch/record/32097/files/CM-P00080632-e.pdf",
# "checksum": "ed797ce5d024dcff0040db79c3396da9",
# "description": "English",
# "format": ".pdf",
# "name": "CM-P00080632-e",
# "subformat": "",
# "etag": "\"502379.pdf1\"",
# "recid": 32097,
# "flags": [],
# "hidden": false,
# "type": "Main",
# "full_path": "/opt/cdsweb/var/data/files/g50/502379/CM-P00080632-e.pdf;1"
# },]
53 changes: 53 additions & 0 deletions scripts/copy_collection_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import json
import os
import shutil
import argparse


def copy_collection_file(dump_files, destination_prefix, working_dir,
                         path_to_replace="/opt/cdsweb/var/data/files/"):
    """Copy every file referenced by the record dumps to the destination.

    Reads each JSON dump, and for every file of every record copies the
    file from its legacy ``full_path`` to the same relative location under
    ``destination_prefix``. A ``files.log`` with one line per copied file
    is written into ``working_dir``.

    :param dump_files: paths of the JSON dump files (absolute, or relative
        to ``working_dir``).
    :param destination_prefix: destination root folder (e.g. on EOS).
    :param working_dir: folder containing the dumps; also receives
        ``files.log``.
    :param path_to_replace: legacy storage root stripped from each file's
        ``full_path`` to obtain its relative destination path. Important:
        the trailing slash matters.
    """
    # `with` guarantees the log is flushed/closed even if a copy fails.
    with open(os.path.join(working_dir, "files.log"), "w") as file_log:
        for dump_file in dump_files:
            with open(os.path.join(working_dir, dump_file), "r") as json_dump:
                data = json.load(json_dump)
            for record in data:
                for legacy_record_file in record["files"]:
                    full_path = legacy_record_file["full_path"]
                    rel_path = full_path.replace(path_to_replace, "")
                    destination_path = os.path.join(destination_prefix, rel_path)
                    # exist_ok avoids the check-then-create race of the
                    # previous os.path.exists() guard.
                    os.makedirs(os.path.dirname(destination_path), exist_ok=True)
                    shutil.copy(full_path, destination_path)
                    # One log line per file (the previous writelines() call
                    # emitted no newline, gluing all entries together).
                    file_log.write(
                        f"RECID: {record['recid']},"
                        f" bibdocid: {legacy_record_file['bibdocid']}"
                        f" file: {legacy_record_file['full_name']},"
                        f" destination: {destination_path}\n"
                    )


def get_dump_files_paths(working_dir):
    """Return the paths of all files found recursively under *working_dir*."""
    return [
        os.path.join(root, filename)
        for root, _dirs, files in os.walk(working_dir, topdown=True)
        for filename in files
    ]



if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Copy files over script')
    parser.add_argument('--dump-folder', metavar='path', required=True,
                        help='the path to dump folder')
    parser.add_argument('--files-destination', metavar='path', required=True,
                        help='path to destination folder on EOS')
    args = parser.parse_args()

    dump_folder = args.dump_folder

    collection_dump_file_list = get_dump_files_paths(dump_folder)

    # FIX: the dump list was computed but never used and --files-destination
    # was ignored — actually perform the copy.
    copy_collection_file(
        collection_dump_file_list, args.files_destination, dump_folder
    )

0 comments on commit 8d0e990

Please sign in to comment.