From 9423ed17b45c0ba9778a3bb144f0ac6d3d585103 Mon Sep 17 00:00:00 2001 From: menger5 <143822465+menger5@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:33:48 -0400 Subject: [PATCH 1/5] Create README.md Initial commit --- update_metadata_bdc/v1.0.0/README.md | 45 ++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 update_metadata_bdc/v1.0.0/README.md diff --git a/update_metadata_bdc/v1.0.0/README.md b/update_metadata_bdc/v1.0.0/README.md new file mode 100644 index 0000000..eacdab9 --- /dev/null +++ b/update_metadata_bdc/v1.0.0/README.md @@ -0,0 +1,45 @@ +# Update File Metadata on BioData Catalyst + +This Dockerfile sets up an environment for running an Rscript that generates a metadata manifest file based on all of the files found within a specified project on BioData Catalyst (BDC). + +## Overview + +**Metadata Manifest File** + +
+ +## Usage +The following command can be used to run the docker: +``` +docker pull rtibiocloud/update_metadata_bdc:<version_tag> +docker run -it rtibiocloud/update_metadata_bdc:<version_tag> -c "Rscript /opt/generate_metadata_manifest.R --help" +``` + +Example Docker run command with volume mounting: +```bash +docker run --rm -v ${PWD}:/data -w /data rtibiocloud/update_metadata_bdc:<version_tag> /bin/bash -c " Rscript /opt/generate_metadata_manifest.R -t <auth_token> -p /project_owner/project -o ." +``` + +If not running the docker from the directory with the data, replace `${PWD}` with the actual path to the data directory on your host system. + +<br>
+ +## Build +To build this Docker image, you can use the following command: +``` +docker build --rm -t rtibiocloud/update_metadata_bdc:v1.0.0 -f Dockerfile . +``` +Here's what each part of the command does: + +`docker build`: This command tells Docker to build an image. +`--rm`: This flag removes any intermediate containers that are created during the build process, helping to keep your system clean. +`-t rtibiocloud/update_metadata_bdc:v1.0.0`: The -t flag specifies the name and tag for the image. In this case, it's named update_metadata_bdc with version v1.0.0. +`-f Dockerfile`: This flag specifies the Dockerfile to use for building the image. You can replace Dockerfile with the actual name of your Dockerfile if it's different. +`.`: The dot at the end of the command indicates that the build context is the current directory, where the Dockerfile is located. +Running this command will build a Docker image with the name `rtibiocloud/update_metadata_bdc:v1.0.0`. Make sure you are in the directory containing the Dockerfile you want to use for building the image. + + +## Contact +For additional information or assistance, please contact Mike Enger (menger@rti.org). 
+ +################################################################# From 87fb0c4c0f5cf12e35a754b462b9c907264385fc Mon Sep 17 00:00:00 2001 From: menger5 <143822465+menger5@users.noreply.github.com> Date: Mon, 12 Aug 2024 15:34:09 -0400 Subject: [PATCH 2/5] Create generate_metadata_manifest.R Initial commit --- .../v1.0.0/generate_metadata_manifest.R | 284 ++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 update_metadata_bdc/v1.0.0/generate_metadata_manifest.R diff --git a/update_metadata_bdc/v1.0.0/generate_metadata_manifest.R b/update_metadata_bdc/v1.0.0/generate_metadata_manifest.R new file mode 100644 index 0000000..d02786d --- /dev/null +++ b/update_metadata_bdc/v1.0.0/generate_metadata_manifest.R @@ -0,0 +1,284 @@ +#----------------------------------------------------- +# Description: +# This script utilizes the SevenBridges +# API to generate a metadata manifest file for all +# files within a specified project. This manifest +# can then be used to update all files' metadata using +# the "Update Metadata using Manifest File" feature +# on the BDC user interface. 
+# +# +# Developer: Mike Enger +# Project: +# Date: 12AUG2024 +# +# +# Revisions +# v1.0 initial commit +# +#----------------------------------------------------- + +if(!require('getopt')){install.packages('getopt', dependencies = T); library(getopt)} +if(!require('dplyr')){install.packages('dplyr', dependencies = T); library(dplyr)} +if(!require('httr')){install.packages('httr', dependencies = T); library(httr)} +if(!require('stringr')){install.packages('stringr', dependencies = T); library(stringr)} +if(!require('lubridate')){install.packages('lubridate', dependencies = T); library(lubridate)} +if(!require('sevenbridges2')){install.packages('sevenbridges2', dependencies = T); library(sevenbridges2)} + +#----------------------------------------------------- +# Setup global arguments and command line use +#----------------------------------------------------- + +# Define usage message +usage <- paste("Usage: script_name.r + -- Required Parameters -- + [-t | --token] (Required) + [-p | --project_id] (Required) + [-o | --output_path] (Required) + -- Help Flag -- + [-h | --help] + Example: + script_name.r -t your_token -p project_id -o /path/to/output + \n", sep="") + +# Specify command-line arguments +spec <- matrix(c( + 'token', 't', 1, "character", + 'project_id', 'p', 1, "character", + 'output_path', 'o', 1, "character", + 'help', 'h', 0, "logical" +), byrow=TRUE, ncol=4) + +# Parse command-line arguments +args <- getopt(spec) + +# Display help message if needed +if (!is.null(args$help) || is.null(args$token) || is.null(args$project_id) || is.null(args$output_path)) { + cat(usage) + q(status = 1) +} + +# Assign arguments to variables +token <- args$token +project_id <- args$project_id +output_path <- args$output_path + + + +#----------------------------------------------------- +# Setup logging +#----------------------------------------------------- + +add_to_log <- function(lvl, func, message){ + timestamp <- paste0("[", Sys.time(), "]") + entry <- 
paste(timestamp, func, toupper(lvl), message, sep = " - ") + cat(paste0(entry, "\n")) +} + + + +#----------------------------------------------------- +# Required Functions and Dataframes +#----------------------------------------------------- + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Initialize file and folder dataframes +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +df_files_empty <- data.frame( + id = character(0), + name = character(0), + upload_directory = character(0), + upload_directory_id = character(0), + size = integer(0), + project = character(0), + type = character(0) +) + +df_folders_empty <- data.frame( + id = character(0), + name = character(0), + parent_id = character(0), + parent_name = character(0) +) + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Function to extract file and folder info +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +add_to_log("info", "setup", "Setting up recursive file and folder extraction function") +extract_file_folder_info <- function(project_id, parent_id=NA, parent_name="root", df_files, df_folders) { + + # initializing offset and combined subdirectory file listing + offset <- 0 + subdirectory_items_list <- list() + + # Make an initial call to the project/directory to see if there are any files + if (nchar(parent_id) == 0){ + subdirectory_items <- a$files$query(project = project_id, limit = 100, offset = offset) + } else{ + subdirectory_items <- a$files$query(parent = parent_id, limit = 100, offset = offset) + } + + # Loop to get all files in current directory + while (length(subdirectory_items$items) > 0) { + if (nchar(parent_id) == 0){ + subdirectory_items <- a$files$query(project = project_id, limit = 100, offset = offset) + } else{ + subdirectory_items <- a$files$query(parent = parent_id, limit = 100, offset = offset) + } + subdirectory_items_list <- c(subdirectory_items_list,subdirectory_items$items) + offset <- offset + length(subdirectory_items$items) + } + + for(item in subdirectory_items_list){ + + if 
(item$type == "file"){ + add_to_log("info", "extract", paste("Processing file:", item$name)) + file_data_row <- data.frame( + id = item$id, + name = item$name, + upload_directory = parent_name, + upload_directory_id = parent_id, + size = ifelse(is.null(item$size), 0, item$size), + project = item$project, + type = item$type + ) + df_files <- bind_rows(df_files,file_data_row) + } + + # Extracting folder info AND going into folder to extract subdirectory and file info + if (item$type == "folder") { + add_to_log("info", "extract", paste("Processing folder:", item$name)) + folder_data_row <- data.frame( + id = item$id, + name = item$name, + parent_id = item$parent, + parent_name = parent_name + ) + + df_folders <- bind_rows(df_folders, folder_data_row) + + list_df_files_folders <- extract_file_folder_info(project_id = project_id, + parent_id = item$id, + parent_name = item$name, + df_files,df_folders) + df_files <- list_df_files_folders[[1]] + df_folders <- list_df_files_folders[[2]] + } + } + return(list(df_files,df_folders)) +} + + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Function to create the final name column +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +create_final_name <- function(new_name, parent_name) { + if (is.na(parent_name)) { + return(new_name) # If parent_name is NA, return new_name + } + if (parent_name == "Root") { + return(new_name) # If parent_name is "Root", return new_name + } + if (startsWith(parent_name, "Root")) { + parent_name <- sub("^Root", "", parent_name) # Remove "Root/" prefix if present + } + return(paste(parent_name, new_name, sep = "/")) # Combine parent_name and new_name +} + + + +#----------------------------------------------------- +# Main +#----------------------------------------------------- + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Provide required URLs and authenticate +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# URLs required for API calls +api_endpoint <- "https://api.sb.biodatacatalyst.nhlbi.nih.gov/v2" 
+sb_platform_url <- "https://platform.sb.biodatacatalyst.nhlbi.nih.gov/u/" + +add_to_log("info", "setup", "Starting script and setting up API endpoint and platform URL") + +# Authenticate +add_to_log("info", "setup", "Authenticating with the API") +a <- Auth$new(token = token, url = api_endpoint) + +# Get a specific project and list its files at the root level +add_to_log("info", "get_project", "Retrieving project information") +p <- a$projects$get(id = project_id) +p_root_folder <- p$get_root_folder() + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Extract file and folder info +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +add_to_log("info", "main", "Starting extraction of files and folders") +list_df_files_folders_out <- extract_file_folder_info(project_id, p_root_folder$id, "Root", df_files_empty, df_folders_empty) +df_files <- list_df_files_folders_out[[1]] +df_folders <- list_df_files_folders_out[[2]] + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Build Metadata Manifest +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Join df_files with df_folders to get the parent directory name +df_files <- merge(df_files, df_folders[, c("id", "parent_name")], by.x = "upload_directory_id", by.y = "id", all.x = TRUE) + +# Create new name column in df_files with upload_directory appended +# Skip values with upload_directory of Root +df_files$name <- ifelse(df_files$upload_directory == "Root", + df_files$name, + paste(df_files$upload_directory, df_files$name, sep = "/")) + +# Apply the create_final_name function to compile complete file name +df_files$name <- mapply(create_final_name, df_files$name, df_files$parent_name) + +# Remove unnecessary columns +df_files <- df_files[ , !(names(df_files) %in% c("upload_directory_id", "upload_directory","parent_name", "type"))] + +# Add columns for possible metadata fields +df_files <- cbind( + df_files, + experimental_strategy = "", + library_id = "", + platform = "", + platform_unit_id = "", + file_segment_number = "", + quality_scale = 
"", + paired_end = "", + reference_genome = "", + investigation = "", + case_id = "", + case_uuid = "", + gender = "", + race = "", + ethnicity = "", + primary_site = "", + disease_type = "", + age_at_diagnosis = "", + vital_status = "", + days_to_death = "", + sample_id = "", + sample_uuid = "", + sample_type = "", + aliquote_id = "", + aliquot_uuid = "", + description = "" +) + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Output final manifest to csv +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +today <- format(Sys.Date(), "%Y%m%d") +output_file_name <- paste0("manifest_", today, ".csv") +output_file_path <- file.path(output_path, output_file_name) +write.csv(df_files, output_file_path, row.names = FALSE) + +add_to_log("info", "main", paste("CSV file written to", output_file_path)) +add_to_log("info", "main", "Metadata Manifest generated successfully") From cb398dc50b8f9f486d27552ba35bca453c1c0d74 Mon Sep 17 00:00:00 2001 From: menger5 <143822465+menger5@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:05:36 -0400 Subject: [PATCH 3/5] Create Dockerfile Initial commit --- update_metadata_bdc/v1.0.0/Dockerfile | 59 +++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 update_metadata_bdc/v1.0.0/Dockerfile diff --git a/update_metadata_bdc/v1.0.0/Dockerfile b/update_metadata_bdc/v1.0.0/Dockerfile new file mode 100644 index 0000000..bc4db2e --- /dev/null +++ b/update_metadata_bdc/v1.0.0/Dockerfile @@ -0,0 +1,59 @@ +#---------------------------------------------------------------- +# Use the official R image as a base +#---------------------------------------------------------------- +FROM r-base:4.3.2 + +#---------------------------------------------------------------- +# Container Metadata +#---------------------------------------------------------------- +LABEL base.image="r-base:v4.3.2" +LABEL maintainer="Mike Enger " +LABEL description="BDC Metadata Manifest Generator" +LABEL software="R, dplyr, httr, lubridate, stringr, sevenbridges2, getopt" 
+LABEL software-website="https://www.r-project.org/ https://dplyr.tidyverse.org/ https://httr.r-lib.org/ https://github.com/tidyverse/lubridate https://stringr.tidyverse.org/ https://cran.r-project.org/web/packages/sevenbridges2/index.html https://cran.r-project.org/web/packages/getopt/index.html" +LABEL software.version="1.0.0" +LABEL license="GPL-2 | GPL-3 " +LABEL about.tags="RMIP" + +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 + +#---------------------------------------------------------------- +# Install required command line tools and packages +#---------------------------------------------------------------- +ENV DEBIAN_FRONTEND noninteractive + +RUN apt-get -qq update && apt-get -y upgrade && \ + apt-get install -y --no-install-recommends \ + curl \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + build-essential && \ + apt-get clean && \ + apt-get autoremove + +#---------------------------------------------------------------- +# Install R Packages +#---------------------------------------------------------------- +ENV R_VERSION 4.3.2 + +# Configure CRAN for package retrieval +RUN echo "r <- getOption('repos'); r['CRAN'] <- 'http://cran.us.r-project.org'; options(repos = r);" > ~/.Rprofile +RUN Rscript -e "install.packages('getopt', dependencies = T)" +RUN Rscript -e "install.packages(c('dplyr', 'httr', 'stringr','lubridate','sevenbridges2'), dependencies = T)" +RUN Rscript -e "library('getopt');##### R SESSION INFORMATION #####; sessionInfo()" + +#---------------------------------------------------------------- +# Copy over analysis scripts +#---------------------------------------------------------------- +ADD generate_metadata_manifest.R /opt/generate_metadata_manifest.R + +#---------------------------------------------------------------- +# Set working dir +#---------------------------------------------------------------- +WORKDIR /data/ + +#---------------------------------------------------------------- +# Set default command or entrypoint if needed 
+#---------------------------------------------------------------- +CMD ["Rscript", "/opt/generate_metadata_manifest.R"] From 62062c999ba91ced6e02015046835df58f1d49ae Mon Sep 17 00:00:00 2001 From: menger5 <143822465+menger5@users.noreply.github.com> Date: Wed, 14 Aug 2024 11:28:44 -0400 Subject: [PATCH 4/5] Add function to obtain current metadata Added function to pull current metadata for all listed files and add to metadata manifest. This will avoid losing any metadata already added to the file. --- .../v1.0.0/generate_metadata_manifest.R | 102 ++++++++++++------ 1 file changed, 70 insertions(+), 32 deletions(-) diff --git a/update_metadata_bdc/v1.0.0/generate_metadata_manifest.R b/update_metadata_bdc/v1.0.0/generate_metadata_manifest.R index d02786d..2bb3e8d 100644 --- a/update_metadata_bdc/v1.0.0/generate_metadata_manifest.R +++ b/update_metadata_bdc/v1.0.0/generate_metadata_manifest.R @@ -24,6 +24,7 @@ if(!require('httr')){install.packages('httr', dependencies = T); library(httr)} if(!require('stringr')){install.packages('stringr', dependencies = T); library(stringr)} if(!require('lubridate')){install.packages('lubridate', dependencies = T); library(lubridate)} if(!require('sevenbridges2')){install.packages('sevenbridges2', dependencies = T); library(sevenbridges2)} +if(!require('jsonlite')){install.packages('jsonlite', dependencies = T); library(jsonlite)} #----------------------------------------------------- # Setup global arguments and command line use @@ -82,7 +83,7 @@ add_to_log <- function(lvl, func, message){ #----------------------------------------------------- #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Initialize file and folder dataframes +# Initialize empty dataframes and lists #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ df_files_empty <- data.frame( @@ -102,6 +103,8 @@ df_folders_empty <- data.frame( parent_name = character(0) ) +metadata_list <- list() + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Function to extract file and folder info 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -188,7 +191,25 @@ create_final_name <- function(new_name, parent_name) { return(paste(parent_name, new_name, sep = "/")) # Combine parent_name and new_name } - +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Function to get existing metadata +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +get_metadata <- function(file_id, auth_token) { + url <- paste0("https://api.sb.biodatacatalyst.nhlbi.nih.gov/v2/files/", file_id, "/metadata") + + response <- VERB( + "GET", + url, + add_headers('X-SBG-Auth-Token' = auth_token), + content_type("application/json"), + accept("application/json") + ) + + # Parse the response content + metadata <- fromJSON(content(response, "text", encoding = "UTF-8"), flatten = TRUE) + metadata$id <- file_id + return(metadata) +} #----------------------------------------------------- # Main @@ -223,7 +244,7 @@ df_files <- list_df_files_folders_out[[1]] df_folders <- list_df_files_folders_out[[2]] #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Build Metadata Manifest +# Build empty Metadata Manifest #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Join df_files with df_folders to get the parent directory name @@ -241,36 +262,53 @@ df_files$name <- mapply(create_final_name, df_files$name, df_files$parent_name) # Remove unnecessary columns df_files <- df_files[ , !(names(df_files) %in% c("upload_directory_id", "upload_directory","parent_name", "type"))] -# Add columns for possible metadata fields -df_files <- cbind( - df_files, - experimental_strategy = "", - library_id = "", - platform = "", - platform_unit_id = "", - file_segment_number = "", - quality_scale = "", - paired_end = "", - reference_genome = "", - investigation = "", - case_id = "", - case_uuid = "", - gender = "", - race = "", - ethnicity = "", - primary_site = "", - disease_type = "", - age_at_diagnosis = "", - vital_status = "", - days_to_death = "", - sample_id = "", - sample_uuid = "", - sample_type = "", - aliquote_id = "", - aliquot_uuid = "", - 
description = "" +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Get existing metadata for all files and add to empty metadata manifest +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Loop through the files dataframe and get metadata for each file +for (i in 1:nrow(df_files)) { + file_id <- df_files$id[i] + metadata <- get_metadata(file_id, token) + metadata_list[[i]] <- metadata +} + +# Find all unique columns from all metadata +all_columns <- unique(unlist(lapply(metadata_list, names))) + +# Ensure each metadata dataframe has all columns +metadata_list <- lapply(metadata_list, function(df) { + missing_cols <- setdiff(all_columns, names(df)) + df[missing_cols] <- "" + return(df) +}) + +# Combine the metadata with the original dataframe +metadata_df <- do.call(rbind, lapply(metadata_list, as.data.frame)) + +# Combine existing metadata with empty metadata manifest +df_combined <- left_join(df_files, metadata_df, by = "id") + +# List of all required metadata fields (in correct order) +required_columns <- c( + "id", "name", "size","project", "experimental_strategy", "library_id", "platform", "platform_unit_id", + "file_segment_number", "quality_scale", "paired_end", "reference_genome", + "investigation", "case_id", "case_uuid", "gender", "race", "ethnicity", + "primary_site", "disease_type", "age_at_diagnosis", "vital_status", + "days_to_death", "sample_id", "sample_uuid", "sample_type", "aliquote_id", + "aliquot_uuid", "description" ) +# Add missing columns with blank (empty string) values +for (col in required_columns) { + if (!(col %in% names(df_combined))) { + df_combined[[col]] <- "" + } +} + +# Reorder the columns in df_combined to desired order +df_combined <- df_combined %>% + select(all_of(required_columns)) + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Output final manifest to csv #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -278,7 +316,7 @@ df_files <- cbind( today <- format(Sys.Date(), "%Y%m%d") output_file_name <- paste0("manifest_", today, ".csv") output_file_path <- 
file.path(output_path, output_file_name) -write.csv(df_files, output_file_path, row.names = FALSE) +write.csv(df_combined, output_file_path, row.names = FALSE) add_to_log("info", "main", paste("CSV file written to", output_file_path)) add_to_log("info", "main", "Metadata Manifest generated successfully") From ae623f57f8e5406f6b72fe601bcfcab4ac728a54 Mon Sep 17 00:00:00 2001 From: menger5 <143822465+menger5@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:06:19 -0400 Subject: [PATCH 5/5] Update README.md Finished README and added logs from testruns --- update_metadata_bdc/v1.0.0/README.md | 82 +++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/update_metadata_bdc/v1.0.0/README.md b/update_metadata_bdc/v1.0.0/README.md index eacdab9..d5aadb0 100644 --- a/update_metadata_bdc/v1.0.0/README.md +++ b/update_metadata_bdc/v1.0.0/README.md @@ -1,11 +1,14 @@ -# Update File Metadata on BioData Catalyst +# Generate Metadata Manifest on BioData Catalyst -This Dockerfile sets up an environment for running an Rscript that generates a metadata manifest file based on all of the files found within a specified project on BioData Catalyst (BDC). +This Dockerfile sets up an environment for running an Rscript that generates a metadata manifest for all files within a specified project on BioData Catalyst (BDC). ## Overview **Metadata Manifest File** +The metadata manifest file allows for batch updating metadata across multiple files using the BioData Catalyst (BDC) user interface. + +A key challenge in the current system is the inability to generate a metadata manifest for all files within a project, especially when the project contains nested subdirectories. This application overcomes this limitation by recursively identifying and extracting metadata for all files within the specified project, regardless of their directory structure. 
It pulls the existing metadata for each file, consolidates it into a single manifest file, and outputs it in a format ready for updates. Once updated, this manifest file can be uploaded back to the platform via the user interface, enabling seamless and accurate metadata updates across the entire project.
## Usage @@ -38,8 +41,81 @@ Here's what each part of the command does: `.`: The dot at the end of the command indicates that the build context is the current directory, where the Dockerfile is located. Running this command will build a Docker image with the name `rtibiocloud/update_metadata_bdc:v1.0.0`. Make sure you are in the directory containing the Dockerfile you want to use for building the image. +## Rscript Inputs +| Short Flag | Long Flag | Description | +|:-----:|:--------:|--------------------------------| +| -t | --token | Authentication token | +| -p | --project_id | Project ID | +| -o | --output_path | Path to save the output manifest | +| -h | --help | Display the function usage statement | + +## Rscript Outputs + +The output of this application is a CSV formatted manifest file. The manifest file contains 4 required columns for identifying the correct file in the project. The remaining columns are prespecified metadata fields. Details on these fields and the overall metadata schema can be found within the following webpage: https://sb-biodatacatalyst.readme.io/docs/metadata-schema + +As mentioned above, this output can be updated and uploaded to BDC, using the user interface, to update metadata fields for files in the specified project. + +## Perform a testrun + +`docker run -v ${PWD}/:/data -t rtibiocloud/update_metadata_bdc:v1.0.0 /bin/bash -c "Rscript /opt/generate_metadata_manifest.R -t -p -o ."` + +
+ +``` +root@0a407da6d20a:/data# Rscript /opt/generate_metadata_manifest.R -t -p -o . + + +Loading required package: getopt +Loading required package: dplyr + +Attaching package: ‘dplyr’ + +The following objects are masked from ‘package:stats’: + + filter, lag + +The following objects are masked from ‘package:base’: + + intersect, setdiff, setequal, union + +Loading required package: httr +Loading required package: stringr +Loading required package: lubridate + +Attaching package: ‘lubridate’ + +The following objects are masked from ‘package:base’: + + date, intersect, setdiff, union + +Loading required package: sevenbridges2 +Loading required package: jsonlite +[2024-08-22 18:07:17.506371] - setup - INFO - Setting up recursive file and folder extraction function +[2024-08-22 18:07:17.513716] - setup - INFO - Starting script and setting up API endpoint and platform URL +[2024-08-22 18:07:17.513962] - setup - INFO - Authenticating with the API +[2024-08-22 18:07:17.539544] - get_project - INFO - Retrieving project information +[2024-08-22 18:07:19.057607] - main - INFO - Starting extraction of files and folders +[2024-08-22 18:07:19.405385] - extract - INFO - Processing folder: Harmonized_Data +[2024-08-22 18:07:19.794829] - extract - INFO - Processing folder: RMIP_000_CyTOF +[2024-08-22 18:07:19.98858] - extract - INFO - Processing file: RMIP_000_001_A_001_A.txt +[2024-08-22 18:07:19.989302] - extract - INFO - Processing file: RMIP_000_002_A_001_A.txt +[2024-08-22 18:07:19.989939] - extract - INFO - Processing folder: RMIP_000_scRNA +[2024-08-22 18:07:20.080253] - extract - INFO - Processing folder: RMIP_000_viability +[2024-08-22 18:07:20.126575] - extract - INFO - Processing folder: Raw_Data +[2024-08-22 18:07:20.356785] - extract - INFO - Processing folder: RMIP_000_CyTOF +[2024-08-22 18:07:20.5892] - extract - INFO - Processing file: RMIP_000_001_A_001_A.txt +[2024-08-22 18:07:20.590099] - extract - INFO - Processing file: RMIP_000_002_A_001_A.txt +[2024-08-22 
18:07:20.590935] - extract - INFO - Processing folder: RMIP_000_scRNA +[2024-08-22 18:07:20.639246] - extract - INFO - Processing folder: RMIP_000_viability +[2024-08-22 18:07:20.729023] - extract - INFO - Processing folder: Study_Documentation +[2024-08-22 18:07:20.777902] - extract - INFO - Processing folder: Templates +[2024-08-22 18:07:21.013212] - extract - INFO - Processing file: README.md +[2024-08-22 18:07:21.222711] - main - INFO - CSV file written to ./manifest_20240822.csv +[2024-08-22 18:07:21.22298] - main - INFO - Metadata Manifest generated successfully + +``` +
## Contact For additional information or assistance, please contact Mike Enger (menger@rti.org). -#################################################################