
Merge pull request #23 from nfdi4health/feat/ctgov
feat: adaptations for ct.gov import
johannes-darms authored Jul 29, 2024
2 parents 5f88848 + 422db61 commit ab509d4
Showing 9 changed files with 135 additions and 88 deletions.
88 changes: 12 additions & 76 deletions k8s/dataverse/README.md
@@ -2,88 +2,24 @@
`helm install my-dataverse ./dataverse`

# Backup & Restore
-## Restore database backup
-
-### Get a logical backup
-#### From S3
-Postgres is configured to automatically create and store a logical backup in S3. You can use the following to find the most recent one.
-1. Find the newest backup
-
-`s3cmd ls s3://$LOGICAL_BACKUP_S3_BUCKET/spilo/$SCOPE$LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX/logical_backups/`
-
-The env variable values can be found using `kubectl describe pod` on one of the backup job pods.
-
-2. Copy the backup to your local computer
-
-`s3cmd get s3://$LOGICAL_BACKUP_S3_BUCKET/spilo/$SCOPE$LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX/logical_backups/1695726061.sql.gz .`
-
-(replace the file name with the name of the newest backup found in step 1)
-
-### Copy the backup into the postgres pod
-
-3. Copy the logical backup from your local computer into the postgres pod
-
-`kubectl cp 1695726061.sql.gz $POSTGRES_POD_NAME:/tmp/1695726061.sql.gz`
-
-(replace the file name)
-
-4. Extract the backup
-
-`kubectl exec -it $POSTGRES_POD_NAME -- /bin/bash`
-
-`gunzip /tmp/1695726061.sql.gz`
-
-5. Empty the database before loading the backup
-
-`kubectl exec -it $POSTGRES_POD_NAME -- psql -U dataverse`
-```
--- Recreate the schema
-DROP SCHEMA public CASCADE;
-CREATE SCHEMA public;
-
--- Restore default permissions
-GRANT ALL ON SCHEMA public TO postgres;
-GRANT ALL ON SCHEMA public TO public;
-```
-
-(source: https://stackoverflow.com/a/61221726)
-
-6. Load the backup into the database
-
-`kubectl exec -it $POSTGRES_POD_NAME -- bash`
-
-`psql -U dataverse -f /tmp/1690815661.sql template1`
-
-(replace the file name)
-
-7. Configure and sync postgres secrets with k8s
-
-The postgres deployment creates at least three k8s secrets. Since you just loaded a backup, these k8s secrets are out of sync:
-either the k8s secrets must be updated with the values from the just-loaded backup, or the database must be adapted to the values of the k8s secrets.
-Here we update the values within the db: first we obtain the list of accounts to update, then we obtain the passwords and update the db values.
-
-Get the list of accounts:
-
-`kubectl get secret | grep ${DEPLOYMENTNAME}-dataverse-postgres.credentials.postgresql.acid.zalan.do`
-
-Repeat the following for each account:
-
-Get the password for the user `dataverse`:
-
-`kubectl get secrets/dataverse.${DEPLOYMENTNAME}-dataverse-postgres.credentials.postgresql.acid.zalan.do -o=jsonpath="{.data.password}" | base64 -d`
-
-Update the password for the user `dataverse`:
-
-`kubectl exec -it $POSTGRES_POD_NAME -- psql -U dataverse -c "ALTER USER dataverse WITH PASSWORD '...'"`
-
-8. Restart the dataverse container
-
-9. Start a complete SOLR reindex
-
-`curl http://localhost:8080/api/admin/index/clear`
-
-`curl http://localhost:8080/api/admin/index`
-
-(see https://guides.dataverse.org/en/latest/admin/solr-search-index.html#clear-and-reindex)
+## Restore database backup
+
+Postgres is configured to automatically create and store a logical backup in S3. You can use the script at
+[`scripts/load_dataverse_backup.sh`][1] to load it into a Dataverse deployed on Kubernetes.
+
+[1]: https://github.com/nfdi4health/csh-deployment/blob/main/scripts/load_dataverse_backup.sh
+
+Before running the script, you must set these env variables:
+
+- `DESTINATION_DATAVERSE_NAME`, the deployment name of the destination Dataverse
+- `LOGICAL_BACKUP_S3_BUCKET`, the S3 bucket where the backup is located
+- `SCOPE` and `LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX`, which define the directory inside the S3 bucket where the backup is
+  located
+- (optional) `S3_CONFIG_FILE`, path to an s3cmd config file
+
+The values for `LOGICAL_BACKUP_S3_BUCKET`, `SCOPE` and `LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX` can be found using
+`kubectl describe pod` on one of the backup job pods.
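
For illustration (not part of this commit), a restore run with the new script could look like this; all values are placeholders:

```sh
# Placeholder values -- take the real ones from `kubectl describe pod`
# on one of the backup job pods, as described above
export DESTINATION_DATAVERSE_NAME=csh
export LOGICAL_BACKUP_S3_BUCKET=example-backup-bucket
export SCOPE=csh-dataverse-postgres
export LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX=""
export S3_CONFIG_FILE=~/.s3cfg  # optional
./scripts/load_dataverse_backup.sh
```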

## Creating a database backup

12 changes: 12 additions & 0 deletions k8s/dataverse/persona/nfdi4health/dataverses/ctgov.json
@@ -0,0 +1,12 @@
{
"name": "CTgov",
"alias": "CTgov",
"dataverseContacts": [
{
"contactEmail": "fb.studyhub@nfdi4health.de"
}
],
"affiliation": "NFDI4Health",
"description": "Automatically imported from CT.gov",
"dataverseType": "UNCATEGORIZED"
}
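
For reference: the same JSON shape (with only `name`, `alias`, and `description` changed) is reused for the DRKS, ICTRP and MDM collections below. As a sketch (not part of this diff), such a file can also be loaded by hand through the Dataverse native API's create-dataverse endpoint; `$API_TOKEN` and `$DATAVERSE_URL` are borrowed from the persona's `init.sh`, and the parent alias `nfdi4health` is an assumption:

```sh
curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" \
  "$DATAVERSE_URL/api/dataverses/nfdi4health" \
  --upload-file k8s/dataverse/persona/nfdi4health/dataverses/ctgov.json
```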
12 changes: 12 additions & 0 deletions k8s/dataverse/persona/nfdi4health/dataverses/drks.json
@@ -0,0 +1,12 @@
{
"name": "DRKS",
"alias": "DRKS",
"dataverseContacts": [
{
"contactEmail": "fb.studyhub@nfdi4health.de"
}
],
"affiliation": "NFDI4Health",
"description": "Automatically imported from DRKS",
"dataverseType": "UNCATEGORIZED"
}
12 changes: 12 additions & 0 deletions k8s/dataverse/persona/nfdi4health/dataverses/ictrp.json
@@ -0,0 +1,12 @@
{
"name": "ICTRP",
"alias": "ICTRP",
"dataverseContacts": [
{
"contactEmail": "fb.studyhub@nfdi4health.de"
}
],
"affiliation": "NFDI4Health",
"description": "Automatically imported from ICTRP",
"dataverseType": "UNCATEGORIZED"
}
12 changes: 12 additions & 0 deletions k8s/dataverse/persona/nfdi4health/dataverses/mdm.json
@@ -0,0 +1,12 @@
{
"name": "MDM",
"alias": "MDM",
"dataverseContacts": [
{
"contactEmail": "fb.studyhub@nfdi4health.de"
}
],
"affiliation": "NFDI4Health",
"description": "Automatically imported from MDM",
"dataverseType": "UNCATEGORIZED"
}
22 changes: 12 additions & 10 deletions k8s/dataverse/persona/nfdi4health/init.sh
@@ -96,17 +96,19 @@ while IFS= read -r DATAVERSE; do
  # curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": "@dataverseAdmin", "role": "admin"}'
  # echo

-  echo "Adding :authenticated-users as dataset creators to dataverse $PARENT_DATAVERSE/$DATAVERSE_ID:"
-  curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": ":authenticated-users", "role": "dsContributor"}'
-  echo
-
-  echo "Adding :authenticated-users as dataset permission admins to dataverse $PARENT_DATAVERSE/$DATAVERSE_ID:"
-  curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": ":authenticated-users", "role": "dsPermAdmin"}'
-  echo
+  if [[ $DATAVERSE_ID == "nfdi4health" ]]; then
+    echo "Adding :authenticated-users as dataset creators to dataverse $PARENT_DATAVERSE/$DATAVERSE_ID:"
+    curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": ":authenticated-users", "role": "dsContributor"}'
+    echo
+
+    echo "Adding :authenticated-users as dataset permission admins to dataverse $PARENT_DATAVERSE/$DATAVERSE_ID:"
+    curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": ":authenticated-users", "role": "dsPermAdmin"}'
+    echo
+  fi

-  if [[ $PARENT_DATAVERSE != "root" ]]; then
+  if [[ $PARENT_DATAVERSE == "nfdi4health" ]]; then
    # We add the "Publish permission" for all users only to the sub-dataverses (collection dataverses, e.g. "COVID-19")
-    # where no datasets are created so it can only be used for linking, not publishing
+    # of "NFDI4Health" where no datasets are created so it can only be used for linking, not publishing
    # (only curators should be able to publish)
    echo "Adding :authenticated-users as dataset publisher to dataverse $PARENT_DATAVERSE/$DATAVERSE_ID:"
    curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": ":authenticated-users", "role": "dsPublisher"}'
@@ -115,15 +117,15 @@ while IFS= read -r DATAVERSE; do
    # The import client and the admin are currently the only automatically configured curator users; all other curators
    # must be added manually
    echo "Creating curator group"
-    curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/groups -d '{"description": "Curator users", "displayName": "Curators", "aliasInOwner": "curators"}'
+    CURATOR_GROUP_ID=`curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/groups -d '{"description": "Curator users", "displayName": "Curators", "aliasInOwner": "curators"}' | jq .data.identifier -r`
    echo

    echo "Adding @service-account-import_client and @dataverseAdmin to curator group"
    curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/groups/curators/roleAssignees -d '["@service-account-import_client", "@dataverseAdmin"]'
    echo

    echo "Adding curator group as curator to dataverse $PARENT_DATAVERSE/$DATAVERSE_ID:"
-    curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d '{"assignee": "&explicit/2-curators", "role": "curator"}'
+    curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST -H "Content-Type: application/json" $DATAVERSE_URL/api/dataverses/$DATAVERSE_ID/assignments -d "{\"assignee\": \"$CURATOR_GROUP_ID\", \"role\": \"curator\"}"
    echo
  fi
done <<< "${DATAVERSES}"
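A note on the `jq` change above (illustration, not part of the diff): the old assignment hard-coded the group identifier `&explicit/2-curators`, which bakes a database id into the script; the new line instead captures the identifier from the group-creation response. A minimal sketch of the extraction, with the response shape assumed from the previously hard-coded value:

```sh
# Assumed response of the group-creation call; only .data.identifier is relied on
echo '{"status": "OK", "data": {"identifier": "&explicit/2-curators", "aliasInOwner": "curators"}}' \
  | jq .data.identifier -r
# prints: &explicit/2-curators
```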
2 changes: 1 addition & 1 deletion k8s/dataverse/templates/postgres.yaml
@@ -13,7 +13,7 @@ spec:
enableLogicalBackup: true
teamId: {{ .Release.Name }}
volume:
-    size: 4Gi
+    size: 32Gi
numberOfInstances: 2
enableConnectionPooler: true
connectionPooler:
2 changes: 1 addition & 1 deletion k8s/dataverse/templates/solr.yaml
@@ -29,7 +29,7 @@ spec:
- ReadWriteOnce
resources:
requests:
-      storage: 100Mi
+      storage: 2Gi
---
apiVersion: v1
kind: PersistentVolumeClaim
61 changes: 61 additions & 0 deletions scripts/load_dataverse_backup.sh
@@ -0,0 +1,61 @@
#!/bin/bash

S3_CONFIG_FILE="${S3_CONFIG_FILE:-$HOME/.s3cfg}"

# Computed from the required env variables (see k8s/dataverse/README.md)
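# `s3cmd ls` prints "date time size url"; because each line starts with the
# date, a plain sort orders by age, and column 4 of the last line is the URL
# of the newest backup.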
LAST_BACKUP_FILE=$(s3cmd ls s3://$LOGICAL_BACKUP_S3_BUCKET/spilo/$SCOPE$LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX/logical_backups/ -c $S3_CONFIG_FILE | sort | tail -n 1 | awk '{print $4}')
POSTGRES_POD_NAME=${DESTINATION_DATAVERSE_NAME}-dataverse-postgres-0
DATAVERSE_POD_NAME=${DESTINATION_DATAVERSE_NAME}-dataverse-0

echo "Downloading backup from S3..."
s3cmd get $LAST_BACKUP_FILE . -c $S3_CONFIG_FILE --skip-existing

echo "Copying backup to postgres pod..."
kubectl cp $(basename $LAST_BACKUP_FILE) $POSTGRES_POD_NAME:/tmp/

echo "Unzipping backup..."
kubectl exec $POSTGRES_POD_NAME -- gunzip /tmp/$(basename $LAST_BACKUP_FILE)

echo "Emptying database..."
kubectl exec $POSTGRES_POD_NAME -- psql -P pager=off -U dataverse -c "-- Recreate the schema
DROP SCHEMA public CASCADE;
CREATE SCHEMA public;
-- Restore default permissions
GRANT ALL ON SCHEMA public TO postgres;
GRANT ALL ON SCHEMA public TO public;"
# source: https://stackoverflow.com/a/61221726

echo "Loading backup into database..."
kubectl exec $POSTGRES_POD_NAME -- psql -P pager=off -U dataverse -f /tmp/$(basename $LAST_BACKUP_FILE .gz) template1


echo "Updating database passwords..."
kubectl get secret \
  | grep ${DESTINATION_DATAVERSE_NAME}-dataverse-postgres.credentials.postgresql.acid.zalan.do \
  | awk '{print $1}' \
  | while read SECRET; do
      DB_USER=$(echo $SECRET | awk -F. '{print $1}')
      DB_PASSWORD=$(kubectl get secrets/$SECRET -o=jsonpath="{.data.password}" | base64 -d)
      kubectl exec $POSTGRES_POD_NAME -- psql -P pager=off -U dataverse -c "ALTER USER $DB_USER WITH PASSWORD '$DB_PASSWORD';"
    done

echo "Restarting dataverse pod..."
kubectl delete pod $DATAVERSE_POD_NAME
kubectl wait --for=condition=Ready --timeout=-1s pod/$DATAVERSE_POD_NAME

# Using port 8081 because 8080 is often already used if currently developing with Dataverse
DATAVERSE_LOCAL_PORT=8081
DATAVERSE_REMOTE_PORT=8080

echo "Starting re-index..."
kubectl port-forward $DATAVERSE_POD_NAME $DATAVERSE_LOCAL_PORT:$DATAVERSE_REMOTE_PORT >/dev/null &
PORT_FORWARD_PID=$!
# Kill the port-forward when this script exits
trap '{
kill $PORT_FORWARD_PID 2>/dev/null
}' EXIT
# Wait for port to be available
while ! nc -vz localhost $DATAVERSE_LOCAL_PORT >/dev/null 2>&1; do
sleep 0.1
done
curl http://localhost:$DATAVERSE_LOCAL_PORT/api/admin/index/clear
echo
curl http://localhost:$DATAVERSE_LOCAL_PORT/api/admin/index
echo

echo
echo "Done! Please wait for the re-indexing to finish, then the backup loading will be complete."
