diff --git a/R/LoadSyntheaTables.r b/R/LoadSyntheaTables.r index 2f1341b..e3be52b 100644 --- a/R/LoadSyntheaTables.r +++ b/R/LoadSyntheaTables.r @@ -33,80 +33,87 @@ LoadSyntheaTables <- syntheaFileLoc, bulkLoad = FALSE) { - csvList <- list.files(syntheaFileLoc, pattern = "*.csv") - conn <- DatabaseConnector::connect(connectionDetails) + if (file.exists(syntheaFileLoc)) { + csvList <- list.files(syntheaFileLoc, pattern = "*.csv") - for (csv in csvList) { - syntheaTable <- - data.table::fread( - file = paste0(syntheaFileLoc, "/", csv), - stringsAsFactors = FALSE, - header = TRUE, - sep = ",", - na.strings = "" - ) + conn <- DatabaseConnector::connect(connectionDetails) - writeLines(paste0("Loading: ", csv)) + for (csv in csvList) { + syntheaTable <- + data.table::fread( + file = paste0(syntheaFileLoc, "/", csv), + stringsAsFactors = FALSE, + header = TRUE, + sep = ",", + na.strings = "" + ) - # experiencing type conversion errors and need to explicitly case some columns - if ("START" %in% colnames(syntheaTable)) { - syntheaTable$START <- - as.Date(syntheaTable$START, format = "%Y-%m-%d") - } - if ("STOP" %in% colnames(syntheaTable)) { - syntheaTable$STOP <- - as.Date(syntheaTable$STOP, format = "%Y-%m-%d") - } - if ("DATE" %in% colnames(syntheaTable)) { - syntheaTable$DATE <- - as.Date(syntheaTable$DATE, format = "%Y-%m-%d") - } - if ("START_DATE" %in% colnames(syntheaTable)) { - syntheaTable$START_DATE <- - as.Date(syntheaTable$START_DATE, format = "%Y-%m-%d") - } - if ("END_DATE" %in% colnames(syntheaTable)) { - syntheaTable$END_DATE <- - as.Date(syntheaTable$END_DATE, format = "%Y-%m-%d") - } - if ("BIRTHDATE" %in% colnames(syntheaTable)) { - syntheaTable$BIRTHDATE <- - as.Date(syntheaTable$BIRTHDATE, format = "%Y-%m-%d") - } - if ("DEATHDATE" %in% colnames(syntheaTable)) { - syntheaTable$DEATHDATE <- - as.Date(syntheaTable$DEATHDATE, format = "%Y-%m-%d") - } - if ("CODE" %in% colnames(syntheaTable)) { - syntheaTable$CODE <- as.character(syntheaTable$CODE) - } - 
if ("REASONCODE" %in% colnames(syntheaTable)) { - syntheaTable$REASONCODE <- - as.character(syntheaTable$REASONCODE) - } - if ("PHONE" %in% colnames(syntheaTable)) { - syntheaTable$PHONE <- - as.character(syntheaTable$PHONE) - } - if ("UTILIZATION" %in% colnames(syntheaTable)) { - syntheaTable$UTILIZATION <- - as.numeric(syntheaTable$UTILIZATION) + writeLines(paste0("Loading: ", csv)) + + # experiencing type conversion errors and need to explicitly cast some columns + if ("START" %in% colnames(syntheaTable)) { + syntheaTable$START <- + as.Date(syntheaTable$START, format = "%Y-%m-%d") + } + if ("STOP" %in% colnames(syntheaTable)) { + syntheaTable$STOP <- + as.Date(syntheaTable$STOP, format = "%Y-%m-%d") + } + if ("DATE" %in% colnames(syntheaTable)) { + syntheaTable$DATE <- + as.Date(syntheaTable$DATE, format = "%Y-%m-%d") + } + if ("START_DATE" %in% colnames(syntheaTable)) { + syntheaTable$START_DATE <- + as.Date(syntheaTable$START_DATE, format = "%Y-%m-%d") + } + if ("END_DATE" %in% colnames(syntheaTable)) { + syntheaTable$END_DATE <- + as.Date(syntheaTable$END_DATE, format = "%Y-%m-%d") + } + if ("BIRTHDATE" %in% colnames(syntheaTable)) { + syntheaTable$BIRTHDATE <- + as.Date(syntheaTable$BIRTHDATE, format = "%Y-%m-%d") + } + if ("DEATHDATE" %in% colnames(syntheaTable)) { + syntheaTable$DEATHDATE <- + as.Date(syntheaTable$DEATHDATE, format = "%Y-%m-%d") + } + if ("CODE" %in% colnames(syntheaTable)) { + syntheaTable$CODE <- as.character(syntheaTable$CODE) + } + if ("REASONCODE" %in% colnames(syntheaTable)) { + syntheaTable$REASONCODE <- + as.character(syntheaTable$REASONCODE) + } + if ("PHONE" %in% colnames(syntheaTable)) { + syntheaTable$PHONE <- + as.character(syntheaTable$PHONE) + } + if ("UTILIZATION" %in% colnames(syntheaTable)) { + syntheaTable$UTILIZATION <- + as.numeric(syntheaTable$UTILIZATION) + } + + suppressWarnings({ + DatabaseConnector::insertTable( + conn, + tableName = paste0(syntheaSchema, ".", strsplit(csv, "[.]")[[1]][1]), + data = 
as.data.frame(syntheaTable), + dropTableIfExists = FALSE, + createTable = FALSE, + bulkLoad = bulkLoad, + progressBar = TRUE + ) + }) } - suppressWarnings({ - DatabaseConnector::insertTable( - conn, - tableName = paste0(syntheaSchema, ".", strsplit(csv, "[.]")[[1]][1]), - data = as.data.frame(syntheaTable), - dropTableIfExists = FALSE, - createTable = FALSE, - bulkLoad = bulkLoad, - progressBar = TRUE - ) - }) + on.exit(DatabaseConnector::disconnect(conn)) + }else { + stop( + paste0("Synthea File Location specified is invalid: ", syntheaFileLoc, ". Please provide a valid fully qualified (absolute) path to the directory.") + ) } - on.exit(DatabaseConnector::disconnect(conn)) - } diff --git a/R/LoadVocabFromCsv.r b/R/LoadVocabFromCsv.r index 336149e..be58948 100644 --- a/R/LoadVocabFromCsv.r +++ b/R/LoadVocabFromCsv.r @@ -37,111 +37,119 @@ LoadVocabFromCsv <- "drug_strength.csv" ) - fileList <- list.files(vocabFileLoc) + if (file.exists(vocabFileLoc)) { - fileList <- fileList[which(tolower(fileList) %in% csvList)] + fileList <- list.files(vocabFileLoc) - conn <- DatabaseConnector::connect(connectionDetails) + fileList <- fileList[which(tolower(fileList) %in% csvList)] - for (csv in fileList) { - writeLines(paste0("Working on file ", paste0(vocabFileLoc, "/", csv))) + conn <- DatabaseConnector::connect(connectionDetails) - writeLines(" - reading file ") - vocabTable <- - data.table::fread( - file = paste0(vocabFileLoc, "/", csv), - stringsAsFactors = FALSE, - header = TRUE, - sep = delimiter, - na.strings = "" - ) - - if (tolower(csv) == "concept.csv" || tolower(csv) == "concept_relationship.csv" || tolower(csv) == "drug_strength.csv") { - writeLines(" - handling dates") - vocabTable$valid_start_date <- - as.Date(as.character(vocabTable$valid_start_date), "%Y%m%d") - vocabTable$valid_end_date <- - as.Date(as.character(vocabTable$valid_end_date), "%Y%m%d") - vocabTable <- dplyr::tibble(vocabTable) - } + for (csv in fileList) { + writeLines(paste0("Working on file ", 
paste0(vocabFileLoc, "/", csv))) - writeLines(" - type converting") - vocabTable <- readr::type_convert(df = vocabTable, - col_types = readr::cols(), - na = c("")) %>% - dplyr::tibble() - - if (tolower(csv) == "drug_strength.csv") { - vocabTable <- vocabTable %>% - mutate_at( - vars( - "amount_value", - "amount_unit_concept_id", - "numerator_value", - "numerator_unit_concept_id", - "denominator_value", - "denominator_unit_concept_id", - "box_size" - ), - ~ replace(., is.na(.), 0) + writeLines(" - reading file ") + vocabTable <- + data.table::fread( + file = paste0(vocabFileLoc, "/", csv), + stringsAsFactors = FALSE, + header = TRUE, + sep = delimiter, + na.strings = "" ) - } - chunkSize <- 1e7 - numberOfRowsInVocabTable <- nrow(vocabTable) - numberOfChunks <- - ceiling(x = numberOfRowsInVocabTable / chunkSize) - - writeLines( - paste0( - " - uploading ", - numberOfRowsInVocabTable, - " rows of data in ", - numberOfChunks, - " chunks." - ) - ) + if (tolower(csv) == "concept.csv" || tolower(csv) == "concept_relationship.csv" || tolower(csv) == "drug_strength.csv") { + writeLines(" - handling dates") + vocabTable$valid_start_date <- + as.Date(as.character(vocabTable$valid_start_date), "%Y%m%d") + vocabTable$valid_end_date <- + as.Date(as.character(vocabTable$valid_end_date), "%Y%m%d") + vocabTable <- dplyr::tibble(vocabTable) + } - sql <- - "DELETE FROM @table_name;" - DatabaseConnector::renderTranslateExecuteSql( - connection = conn, - sql = sql, - table_name = paste0(cdmSchema, ".", strsplit(csv, "[.]")[[1]][1]) - ) + writeLines(" - type converting") + vocabTable <- readr::type_convert(df = vocabTable, + col_types = readr::cols(), + na = c("")) %>% + dplyr::tibble() - startRow <- 1 - for (j in (1:numberOfChunks)) { - if (numberOfRowsInVocabTable >= startRow) { - maxRows <- min(numberOfRowsInVocabTable, - startRow + chunkSize) - chunk <- vocabTable[startRow:maxRows, ] - writeLines( - paste0( - " - chunk uploading started on ", - Sys.time(), - " for rows ", - 
startRow, - " to ", - maxRows + if (tolower(csv) == "drug_strength.csv") { + vocabTable <- vocabTable %>% + mutate_at( + vars( + "amount_value", + "amount_unit_concept_id", + "numerator_value", + "numerator_unit_concept_id", + "denominator_value", + "denominator_unit_concept_id", + "box_size" + ), + ~ replace(., is.na(.), 0) ) + } + + chunkSize <- 1e7 + numberOfRowsInVocabTable <- nrow(vocabTable) + numberOfChunks <- + ceiling(x = numberOfRowsInVocabTable / chunkSize) + + writeLines( + paste0( + " - uploading ", + numberOfRowsInVocabTable, + " rows of data in ", + numberOfChunks, + " chunks." ) - suppressWarnings({ - DatabaseConnector::insertTable( - connection = conn, - tableName = paste0(cdmSchema, ".", strsplit(csv, "[.]")[[1]][1]), - data = chunk, - dropTableIfExists = FALSE, - createTable = FALSE, - bulkLoad = bulkLoad, - progressBar = TRUE + ) + + sql <- + "DELETE FROM @table_name;" + DatabaseConnector::renderTranslateExecuteSql( + connection = conn, + sql = sql, + table_name = paste0(cdmSchema, ".", strsplit(csv, "[.]")[[1]][1]) + ) + + startRow <- 1 + for (j in (1:numberOfChunks)) { + if (numberOfRowsInVocabTable >= startRow) { + maxRows <- min(numberOfRowsInVocabTable, + startRow + chunkSize) + chunk <- vocabTable[startRow:maxRows, ] + writeLines( + paste0( + " - chunk uploading started on ", + Sys.time(), + " for rows ", + startRow, + " to ", + maxRows + ) ) - }) - startRow <- maxRows + 1 + suppressWarnings({ + DatabaseConnector::insertTable( + connection = conn, + tableName = paste0(cdmSchema, ".", strsplit(csv, "[.]")[[1]][1]), + data = chunk, + dropTableIfExists = FALSE, + createTable = FALSE, + bulkLoad = bulkLoad, + progressBar = TRUE + ) + }) + startRow <- maxRows + 1 + } } + writeLines(" - Success") } - writeLines(" - Success") - } - on.exit(DatabaseConnector::disconnect(conn)) + on.exit(DatabaseConnector::disconnect(conn)) + } + else { + stop( + paste0("Vocabulary File Location specified is invalid: ", vocabFileLoc, ". 
Please provide a valid fully qualified (absolute) path to the directory.") + ) + } } diff --git a/inst/sql/sql_server/synthea_version/v270/create_synthea_tables.sql b/inst/sql/sql_server/synthea_version/v270/create_synthea_tables.sql index 3388f11..83ad2ac 100644 --- a/inst/sql/sql_server/synthea_version/v270/create_synthea_tables.sql +++ b/inst/sql/sql_server/synthea_version/v270/create_synthea_tables.sql @@ -1,7 +1,7 @@ --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.allergies ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -12,7 +12,7 @@ description varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.careplans ( id varchar(1000), -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -24,7 +24,7 @@ reasondescription varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.conditions ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -35,7 +35,7 @@ description varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.encounters ( id varchar(1000), -start date, +"start" date, stop date, patient varchar(1000), organization varchar(1000), @@ -80,7 +80,7 @@ procedure_code varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.medications ( -start date, +"start" date, stop date, patient varchar(1000), payer varchar(1000), @@ -181,7 +181,7 @@ utilization numeric --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.devices ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -194,7 +194,7 @@ udi varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.payer_transitions ( patient varchar(1000), - start_year numeric, + start_year numeric, end_year numeric, payer varchar(1000), ownership varchar(1000) diff --git a/inst/sql/sql_server/synthea_version/v300/create_synthea_tables.sql 
b/inst/sql/sql_server/synthea_version/v300/create_synthea_tables.sql index c59c5d5..ad3e1ec 100644 --- a/inst/sql/sql_server/synthea_version/v300/create_synthea_tables.sql +++ b/inst/sql/sql_server/synthea_version/v300/create_synthea_tables.sql @@ -1,7 +1,7 @@ --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.allergies ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -21,7 +21,7 @@ severity2 varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.careplans ( id varchar(1000), -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -33,7 +33,7 @@ reasondescription varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.conditions ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -44,7 +44,7 @@ description varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.encounters ( id varchar(1000), -start date, +"start" date, stop date, patient varchar(1000), organization varchar(1000), @@ -89,7 +89,7 @@ procedure_code varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.medications ( -start date, +"start" date, stop date, patient varchar(1000), payer varchar(1000), @@ -163,7 +163,7 @@ healthcare_coverage numeric --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.procedures ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -192,7 +192,7 @@ utilization numeric --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.devices ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -277,7 +277,7 @@ create table @synthea_schema.claims_transactions ( create table @synthea_schema.payer_transitions ( patient varchar(1000), memberid varchar(1000), - start_year date, + start_year date, end_year date, payer varchar(1000), secondary_payer varchar(1000), diff --git 
a/inst/sql/sql_server/synthea_version/v310/create_synthea_tables.sql b/inst/sql/sql_server/synthea_version/v310/create_synthea_tables.sql index 984c16b..95dff91 100644 --- a/inst/sql/sql_server/synthea_version/v310/create_synthea_tables.sql +++ b/inst/sql/sql_server/synthea_version/v310/create_synthea_tables.sql @@ -1,7 +1,7 @@ --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.allergies ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -21,7 +21,7 @@ severity2 varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.careplans ( id varchar(1000), -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -33,7 +33,7 @@ reasondescription varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.conditions ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -44,7 +44,7 @@ description varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.encounters ( id varchar(1000), -start date, +"start" date, stop date, patient varchar(1000), organization varchar(1000), @@ -89,7 +89,7 @@ procedure_code varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.medications ( -start date, +"start" date, stop date, patient varchar(1000), payer varchar(1000), @@ -165,7 +165,7 @@ income int --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.procedures ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -195,7 +195,7 @@ encounters int, --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.devices ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -280,7 +280,7 @@ create table @synthea_schema.claims_transactions ( create table @synthea_schema.payer_transitions ( patient varchar(1000), memberid varchar(1000), - start_date date, + start_date date, end_date date, payer varchar(1000), secondary_payer varchar(1000), diff --git 
a/inst/sql/sql_server/synthea_version/v320/create_synthea_tables.sql b/inst/sql/sql_server/synthea_version/v320/create_synthea_tables.sql index 984c16b..95dff91 100644 --- a/inst/sql/sql_server/synthea_version/v320/create_synthea_tables.sql +++ b/inst/sql/sql_server/synthea_version/v320/create_synthea_tables.sql @@ -1,7 +1,7 @@ --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.allergies ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -21,7 +21,7 @@ severity2 varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.careplans ( id varchar(1000), -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -33,7 +33,7 @@ reasondescription varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.conditions ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -44,7 +44,7 @@ description varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.encounters ( id varchar(1000), -start date, +"start" date, stop date, patient varchar(1000), organization varchar(1000), @@ -89,7 +89,7 @@ procedure_code varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.medications ( -start date, +"start" date, stop date, patient varchar(1000), payer varchar(1000), @@ -165,7 +165,7 @@ income int --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.procedures ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -195,7 +195,7 @@ encounters int, --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.devices ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -280,7 +280,7 @@ create table @synthea_schema.claims_transactions ( create table @synthea_schema.payer_transitions ( patient varchar(1000), memberid varchar(1000), - start_date date, + start_date date, end_date date, payer varchar(1000), secondary_payer varchar(1000), diff --git 
a/inst/sql/sql_server/synthea_version/v330/create_synthea_tables.sql b/inst/sql/sql_server/synthea_version/v330/create_synthea_tables.sql index 1eedc46..819231d 100644 --- a/inst/sql/sql_server/synthea_version/v330/create_synthea_tables.sql +++ b/inst/sql/sql_server/synthea_version/v330/create_synthea_tables.sql @@ -1,7 +1,7 @@ --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.allergies ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -21,7 +21,7 @@ severity2 varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.careplans ( id varchar(1000), -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -33,7 +33,7 @@ reasondescription varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.conditions ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -45,7 +45,7 @@ description varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.encounters ( id varchar(1000), -start date, +"start" date, stop date, patient varchar(1000), organization varchar(1000), @@ -90,7 +90,7 @@ procedure_code varchar(255) --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.medications ( -start date, +"start" date, stop date, patient varchar(1000), payer varchar(1000), @@ -167,7 +167,7 @@ income int --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.procedures ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -198,7 +198,7 @@ encounters int, --HINT DISTRIBUTE_ON_RANDOM create table @synthea_schema.devices ( -start date, +"start" date, stop date, patient varchar(1000), encounter varchar(1000), @@ -283,7 +283,7 @@ create table @synthea_schema.claims_transactions ( create table @synthea_schema.payer_transitions ( patient varchar(1000), memberid varchar(1000), - start_date date, + start_date date, end_date date, payer varchar(1000), secondary_payer varchar(1000),