From 010a2f3e02b8460573a41a2ba7ac08a04328d9ba Mon Sep 17 00:00:00 2001 From: Vera Clemens Date: Tue, 28 Nov 2023 16:55:48 +0100 Subject: [PATCH 1/6] feat(k8s.dataverse): add CT.gov collection --- .../persona/nfdi4health/dataverses/ctgov.json | 12 ++++++++++++ k8s/dataverse/persona/nfdi4health/init.sh | 16 +++++++++------- 2 files changed, 21 insertions(+), 7 deletions(-) create mode 100644 k8s/dataverse/persona/nfdi4health/dataverses/ctgov.json diff --git a/k8s/dataverse/persona/nfdi4health/dataverses/ctgov.json b/k8s/dataverse/persona/nfdi4health/dataverses/ctgov.json new file mode 100644 index 0000000..d4e1bde --- /dev/null +++ b/k8s/dataverse/persona/nfdi4health/dataverses/ctgov.json @@ -0,0 +1,12 @@ +{ + "name": "CTgov", + "alias": "CTgov", + "dataverseContacts": [ + { + "contactEmail": "fb.studyhub@nfdi4health.de" + } + ], + "affiliation": "NFDI4Health", + "description": "Automatically imported from CT.gov", + "dataverseType": "UNCATEGORIZED" +} \ No newline at end of file diff --git a/k8s/dataverse/persona/nfdi4health/init.sh b/k8s/dataverse/persona/nfdi4health/init.sh index c84fef1..7399cfe 100644 --- a/k8s/dataverse/persona/nfdi4health/init.sh +++ b/k8s/dataverse/persona/nfdi4health/init.sh @@ -85,13 +85,15 @@ while IFS= read -r DATAVERSE; do curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": "@dataverseAdmin", "role": "admin"}' echo - echo "Adding :authenticated-users as dataset creators to dataverse $PARENT_DATAVERSE/$DATAVERSE_ID:" - curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": ":authenticated-users", "role": "dsContributor"}' - echo + if [[ $DATAVERSE_ID == "nfdi4health" ]]; then + echo "Adding :authenticated-users as dataset creators to dataverse $PARENT_DATAVERSE/$DATAVERSE_ID:" + curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H 
"Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": ":authenticated-users", "role": "dsContributor"}' + echo + fi - if [[ $PARENT_DATAVERSE != "root" ]]; then + if [[ $PARENT_DATAVERSE == "nfdi4health" ]]; then # We add the "Publish permission" for all users only to the sub-dataverses (collection dataverses, e.g. "COVID-19") - # where no datasets are created so it can only be used for linking, not publishing + # of "NFDI4Health" where no datasets are created so it can only be used for linking, not publishing # (only curators should be able to publish) echo "Adding :authenticated-users as dataset publisher to dataverse $PARENT_DATAVERSE/$DATAVERSE_ID:" curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": ":authenticated-users", "role": "dsPublisher"}' @@ -100,7 +102,7 @@ while IFS= read -r DATAVERSE; do # The import client and the admin are currently the only automatically configured curator user, all other curators # must be added manually echo "Creating curator group" - curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/groups -d '{"description": "Curator users", "displayName": "Curators", "aliasInOwner": "curators"}' + CURATOR_GROUP_ID=`curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/groups -d '{"description": "Curator users", "displayName": "Curators", "aliasInOwner": "curators"}' | jq .data.identifier -r` echo echo "Adding @service-account-import_client and @dataverseAdmin to curator group" @@ -108,7 +110,7 @@ while IFS= read -r DATAVERSE; do echo echo "Adding curator group as curator to dataverse $PARENT_DATAVERSE/$DATAVERSE_ID:" - curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" 
$DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": "&explicit/2-curators", "role": "curator"}' + curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d "{\"assignee\": \"$CURATOR_GROUP_ID\", \"role\": \"curator\"}" echo fi done <<< "${DATAVERSES}" From 8015cb743dfccc2d6beb62bf072572faddca3be3 Mon Sep 17 00:00:00 2001 From: Vera Clemens Date: Tue, 28 Nov 2023 17:41:01 +0100 Subject: [PATCH 2/6] feat(k8s.dataverse): add collections for other external data sources --- .../persona/nfdi4health/dataverses/drks.json | 12 ++++++++++++ .../persona/nfdi4health/dataverses/ictrp.json | 12 ++++++++++++ .../persona/nfdi4health/dataverses/mdm.json | 12 ++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 k8s/dataverse/persona/nfdi4health/dataverses/drks.json create mode 100644 k8s/dataverse/persona/nfdi4health/dataverses/ictrp.json create mode 100644 k8s/dataverse/persona/nfdi4health/dataverses/mdm.json diff --git a/k8s/dataverse/persona/nfdi4health/dataverses/drks.json b/k8s/dataverse/persona/nfdi4health/dataverses/drks.json new file mode 100644 index 0000000..916a620 --- /dev/null +++ b/k8s/dataverse/persona/nfdi4health/dataverses/drks.json @@ -0,0 +1,12 @@ +{ + "name": "DRKS", + "alias": "DRKS", + "dataverseContacts": [ + { + "contactEmail": "fb.studyhub@nfdi4health.de" + } + ], + "affiliation": "NFDI4Health", + "description": "Automatically imported from DRKS", + "dataverseType": "UNCATEGORIZED" +} \ No newline at end of file diff --git a/k8s/dataverse/persona/nfdi4health/dataverses/ictrp.json b/k8s/dataverse/persona/nfdi4health/dataverses/ictrp.json new file mode 100644 index 0000000..8398148 --- /dev/null +++ b/k8s/dataverse/persona/nfdi4health/dataverses/ictrp.json @@ -0,0 +1,12 @@ +{ + "name": "ICTRP", + "alias": "ICTRP", + "dataverseContacts": [ + { + "contactEmail": "fb.studyhub@nfdi4health.de" + } + ], + "affiliation": "NFDI4Health", + 
"description": "Automatically imported from ICTRP", + "dataverseType": "UNCATEGORIZED" +} \ No newline at end of file diff --git a/k8s/dataverse/persona/nfdi4health/dataverses/mdm.json b/k8s/dataverse/persona/nfdi4health/dataverses/mdm.json new file mode 100644 index 0000000..1e88b90 --- /dev/null +++ b/k8s/dataverse/persona/nfdi4health/dataverses/mdm.json @@ -0,0 +1,12 @@ +{ + "name": "MDM", + "alias": "MDM", + "dataverseContacts": [ + { + "contactEmail": "fb.studyhub@nfdi4health.de" + } + ], + "affiliation": "NFDI4Health", + "description": "Automatically imported from MDM", + "dataverseType": "UNCATEGORIZED" +} \ No newline at end of file From 046d53ca11ec88541afebf5744827404f2acf49a Mon Sep 17 00:00:00 2001 From: Vera Clemens Date: Wed, 17 Jul 2024 12:01:24 +0200 Subject: [PATCH 3/6] feat: add script for loading backup into Dataverse --- k8s/dataverse/README.md | 88 +++++--------------------------- scripts/load_dataverse_backup.sh | 61 ++++++++++++++++++++++ 2 files changed, 73 insertions(+), 76 deletions(-) create mode 100755 scripts/load_dataverse_backup.sh diff --git a/k8s/dataverse/README.md b/k8s/dataverse/README.md index f5464e4..69b3299 100644 --- a/k8s/dataverse/README.md +++ b/k8s/dataverse/README.md @@ -2,88 +2,24 @@ `helm install my-dataverse ./dataverse` # Backup & Restore -## Restore database backup - -### Get a logical backup -#### From S3 -Postgres is configured to automatically create and store a logical backup in S3. You can use the following to find the most recent one. -1. Find the newest backup - - `s3cmd ls s3://$LOGICAL_BACKUP_S3_BUCKET/spilo/$SCOPE$LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX/logical_backups/` - - The env variable values can be found using `kubectl describe pod` on one of the backup job pods. - -2. 
Copy the backup to your local computer - - `s3cmd get s3://$LOGICAL_BACKUP_S3_BUCKET/spilo/$SCOPE$LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX/logical_backups/1695726061.sql.gz .` - - (replace the file name with the name of the newest backup found in step 1) - -### Copy the backup into the postgres pod - -3. Copy a logical backup from local computer it into the postgres pod - - `kubectl cp 1695726061.sql.gz $POSTGRES_POD_NAME:/tmp/1695726061.sql.gz` - - (replace the file name) - -4. Extract the backup - - `kubectl exec -it $POSTGRES_POD_NAME -- /bin/bash` - - `gunzip /tmp/1695726061.sql.gz` - -5. Empty the database before loading the backup - `kubectl exec -it $POSTGRES_POD_NAME -- psql -U dataverse ` - ``` - -- Recreate the schema - DROP SCHEMA public CASCADE; - CREATE SCHEMA public; - -- Restore default permissions - GRANT ALL ON SCHEMA public TO postgres; - GRANT ALL ON SCHEMA public TO public; - ``` - - (source: https://stackoverflow.com/a/61221726) - -6. Load the backup into the database - - `kubectl exec -it $POSTGRES_POD_NAME -- bash` - - `psql -U dataverse -f /tmp/1690815661.sql template1` - - (replace the file name) - -7. Configure and sync postgres secrets with k8s - - The postgres deployment creates at least three k8s secrets. Since you just loaded a backup they (k8s secret) are out of sync. - Either those k8s secrets must be updated with the values from the just loaded backup or the database must be adapted to the values of the k8s secrets - We update the values within the db. First, we obtain list of accounts to update, then we obtain the passwords and update the db values. 
- - Get the list of accounts: - - `kubectl get secret | grep ${DEPLOYMENTNAME}-dataverse-postgres.credentials.postgresql.acid.zalan.do ` - - Repeat the following for each account: - - Get the password for the user `dataverse`: - - `kubectl get secrets/dataverse.${DEPLOYMENTNAME}-dataverse-postgres.credentials.postgresql.acid.zalan.do -o=jsonpath="{.data.password}" | base64 -d` - - Update the password for the user `dataverse`: - - `kubectl exec -it $POSTGRES_POD_NAME -- psql -U dataverse "ALTER USER dataverse WITH PASSWORD '...'"` +## Restore database backup -8. Restart the dataverse container +Postgres is configured to automatically create and store a logical backup in S3. You can use the script at +[`scripts/load_dataverse_backup.sh`][1] to load it into a Dataverse deployed on Kubernetes. -9. Start complete SOLR reindex +[1]: https://github.com/nfdi4health/csh-deployment/blob/main/scripts/load_dataverse_backup.sh - `curl http://localhost:8080/api/admin/index/clear` +Before running the script, you must set these env variables: - `curl http://localhost:8080/api/admin/index` +- `DESTINATION_DATAVERSE_NAME`, the deployment name of the destination Dataverse +- `LOGICAL_BACKUP_S3_BUCKET`, the S3 bucket where the backup is located +- `SCOPE` and `LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX`, define the directory inside the S3 bucket where the backup is + located +- (optional) `S3_CONFIG_FILE`, path to a s3cmd config file - (see https://guides.dataverse.org/en/latest/admin/solr-search-index.html#clear-and-reindex) +The values for `LOGICAL_BACKUP_S3_BUCKET`, `SCOPE` and `LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX` can be found using +`kubectl describe pod` on one of the backup job pods. 
## Creating a database backup diff --git a/scripts/load_dataverse_backup.sh b/scripts/load_dataverse_backup.sh new file mode 100755 index 0000000..8a553e7 --- /dev/null +++ b/scripts/load_dataverse_backup.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +S3_CONFIG_FILE="${S3_CONFIG_FILE:-$HOME/.s3cfg}" + +# Computed from env variables above +LAST_BACKUP_FILE=$(s3cmd ls s3://$LOGICAL_BACKUP_S3_BUCKET/spilo/$SCOPE$LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX/logical_backups/ -c "$S3_CONFIG_FILE" | sort | tail -n 1 | awk '{print $4}') +POSTGRES_POD_NAME=${DESTINATION_DATAVERSE_NAME}-dataverse-postgres-0 +DATAVERSE_POD_NAME=${DESTINATION_DATAVERSE_NAME}-dataverse-0 + +echo "Downloading backup from S3..." +s3cmd get $LAST_BACKUP_FILE . -c "$S3_CONFIG_FILE" --skip-existing + +echo "Copying backup to postgres pod..." +kubectl cp $(basename $LAST_BACKUP_FILE) $POSTGRES_POD_NAME:/tmp/ + +echo "Unzipping backup..." +kubectl exec $POSTGRES_POD_NAME -- gunzip /tmp/$(basename $LAST_BACKUP_FILE) + +echo "Emptying database..." +kubectl exec $POSTGRES_POD_NAME -- psql -P pager=off -U dataverse -c "-- Recreate the schema +DROP SCHEMA public CASCADE; +CREATE SCHEMA public; + +-- Restore default permissions +GRANT ALL ON SCHEMA public TO postgres; +GRANT ALL ON SCHEMA public TO public;" +# source: https://stackoverflow.com/a/61221726 + +echo "Loading backup into database..." +kubectl exec $POSTGRES_POD_NAME -- psql -P pager=off -U dataverse -f /tmp/$(basename $LAST_BACKUP_FILE .gz) template1 + + +echo "Updating database passwords..." +kubectl get secret | grep ${DESTINATION_DATAVERSE_NAME}-dataverse-postgres.credentials.postgresql.acid.zalan.do | awk '{print $1}' | while read SECRET; do kubectl exec $POSTGRES_POD_NAME -- psql -P pager=off -U dataverse -c "ALTER USER $(echo $SECRET | awk -F. '{print $1}') WITH PASSWORD '$(kubectl get secrets/$SECRET -o=jsonpath="{.data.password}" | base64 -d)';"; done + +echo "Restarting dataverse pod..." 
+kubectl delete pod $DATAVERSE_POD_NAME +kubectl wait --for=condition=Ready --timeout=-1s pod/$DATAVERSE_POD_NAME + +# Using port 8081 because 8080 is often already used if currently developing with Dataverse +DATAVERSE_LOCAL_PORT=8081 +DATAVERSE_REMOTE_PORT=8080 + +echo "Starting re-index..." +kubectl port-forward $DATAVERSE_POD_NAME $DATAVERSE_LOCAL_PORT:$DATAVERSE_REMOTE_PORT >/dev/null & +PORT_FORWARD_PID=$! +# Kill the port-forward when this script exits +trap '{ + kill $PORT_FORWARD_PID 2>/dev/null +}' EXIT +# Wait for port to be available +while ! nc -vz localhost $DATAVERSE_LOCAL_PORT >/dev/null 2>&1; do + sleep 0.1 +done +curl http://localhost:$DATAVERSE_LOCAL_PORT/api/admin/index/clear +echo +curl http://localhost:$DATAVERSE_LOCAL_PORT/api/admin/index +echo + +echo +echo "Done! Please wait for the re-indexing to finish, then the backup loading will be complete." From a2d01b2f77a95ed0631594fb92d4bc0bb5b748b6 Mon Sep 17 00:00:00 2001 From: Vera Clemens Date: Wed, 24 Jul 2024 15:23:38 +0200 Subject: [PATCH 4/6] feat(k8s.dataverse): increase storage for postgres and solr --- k8s/dataverse/templates/postgres.yaml | 2 +- k8s/dataverse/templates/solr.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/dataverse/templates/postgres.yaml b/k8s/dataverse/templates/postgres.yaml index 8e85a4f..6bae6b5 100644 --- a/k8s/dataverse/templates/postgres.yaml +++ b/k8s/dataverse/templates/postgres.yaml @@ -13,7 +13,7 @@ spec: enableLogicalBackup: true teamId: {{ .Release.Name }} volume: - size: 4Gi + size: 8Gi numberOfInstances: 2 enableConnectionPooler: true connectionPooler: diff --git a/k8s/dataverse/templates/solr.yaml b/k8s/dataverse/templates/solr.yaml index 8558cba..eacac43 100644 --- a/k8s/dataverse/templates/solr.yaml +++ b/k8s/dataverse/templates/solr.yaml @@ -29,7 +29,7 @@ spec: - ReadWriteOnce resources: requests: - storage: 100Mi + storage: 2Gi --- apiVersion: v1 kind: PersistentVolumeClaim From 9dceba30fa4cf5b2ae8c47789118054b74c93de4 
Mon Sep 17 00:00:00 2001 From: Vera Clemens Date: Fri, 26 Jul 2024 14:26:03 +0200 Subject: [PATCH 5/6] feat(k8s.dataverse): increase storage for postgres further --- k8s/dataverse/templates/postgres.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/dataverse/templates/postgres.yaml b/k8s/dataverse/templates/postgres.yaml index 6bae6b5..ca1cf80 100644 --- a/k8s/dataverse/templates/postgres.yaml +++ b/k8s/dataverse/templates/postgres.yaml @@ -13,7 +13,7 @@ spec: enableLogicalBackup: true teamId: {{ .Release.Name }} volume: - size: 8Gi + size: 16Gi numberOfInstances: 2 enableConnectionPooler: true connectionPooler: From 422db61731f6521b15c88298da483c905dd5802e Mon Sep 17 00:00:00 2001 From: Vera Clemens Date: Fri, 26 Jul 2024 14:43:23 +0200 Subject: [PATCH 6/6] feat(k8s.dataverse): increase storage for postgres further --- k8s/dataverse/templates/postgres.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/dataverse/templates/postgres.yaml b/k8s/dataverse/templates/postgres.yaml index ca1cf80..be316b6 100644 --- a/k8s/dataverse/templates/postgres.yaml +++ b/k8s/dataverse/templates/postgres.yaml @@ -13,7 +13,7 @@ spec: enableLogicalBackup: true teamId: {{ .Release.Name }} volume: - size: 16Gi + size: 32Gi numberOfInstances: 2 enableConnectionPooler: true connectionPooler: