diff --git a/.Rbuildignore b/.Rbuildignore index 28829a6..a7f5b35 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,3 +9,4 @@ ^\.env$ ^doc$ ^Meta$ +^scripts$ diff --git a/DESCRIPTION b/DESCRIPTION index 6849728..babf18e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ohcleandat Type: Package Title: One Health Data Cleaning and Quality Checking Package -Version: 0.3.11 +Version: 0.3.12 Authors@R: c( person("Collin", "Schwantes", email = "schwantes@ecohealthalliance.org", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-4014-4896")), person("Johana", "Teigen", email = "teigen@ecohealthalliance.org", role = "aut", comment = c(ORCID = "0000-0002-6209-2321")), diff --git a/NEWS.md b/NEWS.md index 5dcb9ef..ab8e724 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# ohcleandat 0.3.12 + +* `expand_frictionless_metadata` can add and remove fields from the metadata depending +on the structural metadata supplied. + # ohcleandat 0.3.11 * obfuscate gps can now handle NAs diff --git a/R/create_structural_metadata.R b/R/create_structural_metadata.R index a207b41..546ad92 100644 --- a/R/create_structural_metadata.R +++ b/R/create_structural_metadata.R @@ -15,13 +15,13 @@ #' #' The metadata table produced has the following elements #' -#' `name` = The name of the field. This is taken as is from `data`. -#' `description` = Description of that field. May be provided by controlled vocabulary -#' `units` = Units of measure for that field. May or may not apply -#' `term_uri` = Universal Resource Identifier for a term from a controlled vocabulary or schema -#' `comments` = Free text providing additional details about the field -#' `primary_key` = `TRUE` or `FALSE`, Uniquely identifies each record in the data -#' `foreign_key` = `TRUE` or `FALSE`, Allows for linkages between data sets. Uniquely identifies +#' - `name` = The name of the field. This is taken as is from `data`. +#' - `description` = Description of that field. May be provided by controlled vocabulary +#' - `units` = Units of measure for that field. May or may not apply +#' - `term_uri` = Universal Resource Identifier for a term from a controlled vocabulary or schema +#' - `comments` = Free text providing additional details about the field +#' - `primary_key` = `TRUE` or `FALSE`, Uniquely identifies each record in the data +#' - `foreign_key` = `TRUE` or `FALSE`, Allows for linkages between data sets. Uniquely identifies #' records in a different data set #' #' diff --git a/R/modify_frictionless_metadata.R b/R/modify_frictionless_metadata.R index 7d54c20..d487861 100644 --- a/R/modify_frictionless_metadata.R +++ b/R/modify_frictionless_metadata.R @@ -1,7 +1,9 @@ #' Expand Frictionless Metadata with structural metadata #' -#' Loops over elements in the structural metadata and adds them to frictionless -#' metadata schema. Will overwrite existing values. +#' Loops over elements in the structural metadata and adds +#' them to the frictionless metadata schema. Will overwrite existing values and +#' remove any fields from the datapackage metadata not listed in the structural +#' metadata. #' #' @param structural_metadata Dataframe. Structural metadata from #' `create_structural_metadata` or `update_structural_metadata` @@ -27,7 +29,7 @@ #' # update structural metadata #' write.csv(data_codebook,"my/codebook.csv", row.names = FALSE) #' -#' data_codebook_updated <- read.csv(""my/codebook.csv"") +#' data_codebook_updated <- read.csv("my/codebook.csv") #' #' # create frictionless package - this is done automatically with the #' # deposits package @@ -61,9 +63,29 @@ expand_frictionless_metadata <- function(structural_metadata, ## build up schema based on structural metadata - for(idx in 1:length(my_data_schema$fields)){ + ## drop fields that were removed from the structural metadata + if(nrow(structural_metadata) <= length(my_data_schema$fields)){ + my_data_schema$fields <- my_data_schema$fields[1:nrow(structural_metadata)] + } + + # for each row, update the schema + for(idx in 1:nrow(structural_metadata)){ # item to build out - x <- my_data_schema$fields[[idx]] + ## row may not exist in the original data. + x <- tryCatch( + expr = { + ## get the fields item we want to update + my_data_schema$fields[[idx]] + }, + error = function(e){ + ## use the first index item + msg<- sprintf("Adding %s to frictionless metadata",structural_metadata$name[[idx]]) + message(msg) + my_data_schema$fields[[1]] + } + ) + + for(idy in 1:length(structural_metadata)){ y <- structural_metadata[idx,idy][[1]] @@ -85,6 +107,8 @@ expand_frictionless_metadata <- function(structural_metadata, my_data_schema$fields[[idx]] <- x } + + ## prune the properties of items in the schema, does not remove fields if(prune_datapackage){ my_data_schema <- prune_datapackage(my_data_schema,structural_metadata) } @@ -104,7 +128,7 @@ expand_frictionless_metadata <- function(structural_metadata, } -#' Prune data pacakge +#' Prune field properties in a data package #' #' method to remove properties from the metadata for a dataset in a datapackage #' @@ -136,3 +160,5 @@ prune_datapackage <- function(my_data_schema, structural_metadata){ return(my_data_schema_pruned) } + + diff --git a/man/create_structural_metadata.Rd b/man/create_structural_metadata.Rd index 9445cc4..d6d3ceb 100644 --- a/man/create_structural_metadata.Rd +++ b/man/create_structural_metadata.Rd @@ -31,16 +31,17 @@ generated then joined to pre-existing metadata via field names. } \details{ The metadata table produced has the following elements - -\code{name} = The name of the field. This is taken as is from \code{data}. -\code{description} = Description of that field. May be provided by controlled vocabulary -\code{units} = Units of measure for that field. May or may not apply -\code{term_uri} = Universal Resource Identifier for a term from a controlled vocabulary or schema -\code{comments} = Free text providing additional details about the field -\code{primary_key} = \code{TRUE} or \code{FALSE}, Uniquely identifies each record in the data -\code{foreign_key} = \code{TRUE} or \code{FALSE}, Allows for linkages between data sets. Uniquely identifies +\itemize{ +\item \code{name} = The name of the field. This is taken as is from \code{data}. +\item \code{description} = Description of that field. May be provided by controlled vocabulary +\item \code{units} = Units of measure for that field. May or may not apply +\item \code{term_uri} = Universal Resource Identifier for a term from a controlled vocabulary or schema +\item \code{comments} = Free text providing additional details about the field +\item \code{primary_key} = \code{TRUE} or \code{FALSE}, Uniquely identifies each record in the data +\item \code{foreign_key} = \code{TRUE} or \code{FALSE}, Allows for linkages between data sets. Uniquely identifies records in a different data set } +} \examples{ \dontrun{ df <- data.frame(a = 1:10, b = letters[1:10]) diff --git a/man/expand_frictionless_metadata.Rd b/man/expand_frictionless_metadata.Rd index c207064..5adf538 100644 --- a/man/expand_frictionless_metadata.Rd +++ b/man/expand_frictionless_metadata.Rd @@ -29,8 +29,10 @@ be removed?} Updates the datapackage, returns nothing } \description{ -Loops over elements in the structural metadata and adds them to frictionless -metadata schema. Will overwrite existing values. +Loops over elements in the structural metadata and adds +them to the frictionless metadata schema. Will overwrite existing values and +remove any fields from the datapackage metadata not listed in the structural +metadata. } \examples{ \dontrun{ @@ -45,7 +47,7 @@ data_codebook <- create_structural_metadata(data) # update structural metadata write.csv(data_codebook,"my/codebook.csv", row.names = FALSE) -data_codebook_updated <- read.csv(""my/codebook.csv"") +data_codebook_updated <- read.csv("my/codebook.csv") # create frictionless package - this is done automatically with the # deposits package diff --git a/man/prune_datapackage.Rd b/man/prune_datapackage.Rd index 17dab6e..acd14d8 100644 --- a/man/prune_datapackage.Rd +++ b/man/prune_datapackage.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/modify_frictionless_metadata.R \name{prune_datapackage} \alias{prune_datapackage} -\title{Prune data pacakge} +\title{Prune field properties in a data package} \usage{ prune_datapackage(my_data_schema, structural_metadata) } diff --git a/scripts/debug_expand_df.R b/scripts/debug_expand_df.R new file mode 100644 index 0000000..a6e63df --- /dev/null +++ b/scripts/debug_expand_df.R @@ -0,0 +1,46 @@ +devtools::load_all() + + +# read in data + + + +df <- read.csv("vignettes/data_examples/my_data.csv") + +# create codebook + +structural_metadata <- create_structural_metadata(df,primary_key = "key", + foreign_key = c("measured_by","site_name")) + +# create data package + + +dp <- frictionless::create_package() |> + frictionless::add_resource(resource_name = "my_data", + data = df) + +frictionless::write_package(package = dp, + directory = "vignettes/data_examples") + +expand_frictionless_metadata(structural_metadata = structural_metadata, + resource_name = "my_data", + resource_path = "vignettes/data_examples/my_data.csv", + data_package_path = "vignettes/data_examples/datapackage.json" + ) + +## drop a measured_by column + +df <- read.csv("vignettes/data_examples/my_data.csv") + +# create codebook + +structural_metadata_2 <- create_structural_metadata(df, + primary_key = "key", + foreign_key = c("site_name")) + + +expand_frictionless_metadata(structural_metadata = structural_metadata_2, + resource_name = "my_data", + resource_path = "vignettes/data_examples/my_data.csv", + data_package_path = "vignettes/data_examples/datapackage.json" +) diff --git a/vignettes/data_examples/my_data.csv b/vignettes/data_examples/my_data.csv index 8846be9..b849f8c 100644 --- a/vignettes/data_examples/my_data.csv +++ b/vignettes/data_examples/my_data.csv @@ -1,11 +1,11 @@ -"date","measurement","measured_by","site_name","key" -2024-08-26,43,"Johana","c",1 -2024-08-27,9,"Johana","c",2 -2024-08-28,79,"Johana","c",3 -2024-08-29,17,"Collin","a",4 -2024-08-30,61,"Johana","e",5 -2024-08-31,30,"Collin","b",6 -2024-09-01,58,"Collin","a",7 -2024-09-02,27,"Johana","d",8 -2024-09-03,52,"Johana","d",9 -2024-09-04,82,"Collin","e",10 +date,measurement,measured_by,site_name,key +2024-08-26,43,Johana,c,1 +2024-08-27,9,Johana,c,2 +2024-08-28,79,Johana,c,3 +2024-08-29,17,Collin,a,4 +2024-08-30,61,Johana,e,5 +2024-08-31,30,Collin,b,6 +2024-09-01,58,Collin,a,7 +2024-09-02,27,Johana,d,8 +2024-09-03,52,Johana,d,9 +2024-09-04,82,Collin,e,10