From 7c8f30695745aa304b73b91bfe8c397cb8eb7677 Mon Sep 17 00:00:00 2001
From: Eli Daniels <90638398+eli-daniels@users.noreply.github.com>
Date: Mon, 28 Oct 2024 18:51:16 +0100
Subject: [PATCH] feat: Add `col.types` argument to `duckdb_read_csv()` (#445)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add col.types to duckdb_read_csv
* ref: duckdb_read_csv
* feat: add dates test to read_csv_duckdb
* post check addition
* refactor tests for more insightful test messages
* Apply suggestions from code review
* fix: name to names
* doc: add link to duckdb data type docs
* Formatting
* Document

---------

Co-authored-by: Kirill Müller
Co-authored-by: Kirill Müller
---
 R/csv.R                    | 108 +++++++++++++++++++++++++------
 man/duckdb_read_csv.Rd     |  26 ++++++++
 tests/testthat/test-read.R | 129 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 243 insertions(+), 20 deletions(-)

diff --git a/R/csv.R b/R/csv.R
index 5debde7ac..e8bcc244b 100644
--- a/R/csv.R
+++ b/R/csv.R
@@ -3,6 +3,8 @@
 #' Directly reads a CSV file into DuckDB, tries to detect and create the correct schema for it.
 #' This usually is much faster than reading the data into R and writing it to DuckDB.
 #'
+#' If the table already exists in the database, the CSV data is appended to it. Otherwise, the table is created.
+#'
 #' @inheritParams duckdb_register
 #' @param files One or more CSV file names, should all have the same structure though
 #' @param ... Reserved for future extensions, must be empty.
@@ -12,6 +14,9 @@
 #' @param delim Which field separator should be used
 #' @param quote Which quote character is used for columns in the CSV file
 #' @param col.names Override the detected or generated column names
+#' @param col.types Character vector of column types, in the same order as the columns in the file (or as `col.names`),
+#'   or a named character vector where the names are column names and the values are their types.
+#'   Valid types are \href{https://duckdb.org/docs/sql/data_types/overview.html}{DuckDB data types}, e.g. VARCHAR, DOUBLE, DATE, BIGINT, BOOLEAN.
 #' @param lower.case.names Transform column names to lower case
 #' @param sep Alias for delim for compatibility
 #' @param transaction Should a transaction be used for the entire operation
@@ -30,6 +35,25 @@
 #' dbReadTable(con, "data")
 #'
 #' dbDisconnect(con)
+#'
+#'
+#' # Providing data types for columns
+#' path <- tempfile(fileext = ".csv")
+#' write.csv(iris, path, row.names = FALSE)
+#'
+#' con <- dbConnect(duckdb())
+#' duckdb_read_csv(con, "iris", path,
+#'   col.types = c(
+#'     Sepal.Length = "DOUBLE",
+#'     Sepal.Width = "DOUBLE",
+#'     Petal.Length = "DOUBLE",
+#'     Petal.Width = "DOUBLE",
+#'     Species = "VARCHAR"
+#'   )
+#' )
+#' dbReadTable(con, "iris")
+#' dbDisconnect(con)
 duckdb_read_csv <- function(
     conn,
     name,
@@ -41,6 +65,7 @@ duckdb_read_csv <- function(
     delim = ",",
     quote = "\"",
     col.names = NULL,
+    col.types = NULL,
     lower.case.names = FALSE,
     sep = delim,
     transaction = TRUE,
@@ -51,16 +76,25 @@ duckdb_read_csv <- function(
   if (length(na.strings) > 1) stop("na.strings must be of length 1")
   if (!missing(sep)) delim <- sep
 
-  headers <- lapply(files, utils::read.csv, sep = delim, na.strings = na.strings, quote = quote, nrows = nrow.check, header = header, ...)
+  headers <- lapply(files, utils::read.csv,
+    sep = delim, na.strings = na.strings,
+    quote = quote, nrows = nrow.check, header = header, ...
+  )
 
   if (length(files) > 1) {
     nn <- sapply(headers, ncol)
     if (!all(nn == nn[1])) stop("Files have different numbers of columns")
     nms <- sapply(headers, names)
-    if (!all(nms == nms[, 1])) stop("Files have different variable names")
-    types <- sapply(headers, function(df) sapply(df, dbDataType, dbObj = conn))
-    if (!all(types == types[, 1])) stop("Files have different variable types")
+    if (!all(nms == nms[, 1])) stop("Files have different variable names or order")
+    if (is.null(col.types)) {
+      types <- sapply(headers, function(df) sapply(df, dbDataType, dbObj = conn))
+      if (!all(types == types[, 1])) stop("Files have different variable types")
+    }
   }
 
+  fields <- set_csv_fields(found = headers[[1]][FALSE, , drop = FALSE], col.names, col.types)
+
+  if (lower.case.names) {
+    names(fields) <- tolower(names(fields))
+  }
+
   if (transaction) {
     dbBegin(conn)
     on.exit(tryCatch(dbRollback(conn), error = function(e) {}))
@@ -69,21 +103,7 @@
   tablename <- dbQuoteIdentifier(conn, name)
 
   if (!dbExistsTable(conn, tablename)) {
-    if (lower.case.names) names(headers[[1]]) <- tolower(names(headers[[1]]))
-    if (!is.null(col.names)) {
-      if (lower.case.names) {
-        warning("Ignoring lower.case.names parameter as overriding col.names are supplied.")
-      }
-      col.names <- as.character(col.names)
-      if (length(unique(col.names)) != length(names(headers[[1]]))) {
-        stop(
-          "You supplied ", length(unique(col.names)), " unique column names, but file has ",
-          length(names(headers[[1]])), " columns."
-        )
-      }
-      names(headers[[1]]) <- col.names
-    }
-    dbCreateTable(conn, tablename, headers[[1]], temporary = temporary)
+    dbCreateTable(conn, tablename, fields, temporary = temporary)
   }
 
   for (i in seq_along(files)) {
@@ -100,6 +120,56 @@
   invisible(out)
 }
 
+#' Column names and types logic for duckdb_read_csv()
+#'
+#' @param found the detected header and column types from `utils::read.csv`,
+#'   as a zero-row data frame
+#' @param col.names user-provided column names
+#' @param col.types user-provided column types, optionally named by column
+#'
+#' @noRd
+#' @return a valid `fields` argument for `dbCreateTable()`
+set_csv_fields <- function(found, col.names, col.types) {
+  if (is.null(col.names) && is.null(col.types)) {
+    return(found)
+  }
+
+  if (is.null(col.types)) {
+    # Only col.names supplied: keep the detected types, rename the columns
+    col.names <- as.character(col.names)
+    if (length(unique(col.names)) != ncol(found)) {
+      stop(
+        "You supplied ", length(unique(col.names)), " unique column names, but file has ",
+        ncol(found), " columns."
+      )
+    }
+    names(found) <- col.names
+    return(found)
+  }
+
+  if (length(col.types) != ncol(found)) {
+    stop(
+      "You supplied ", length(col.types), " values to `col.types`, but file has ",
+      ncol(found), " columns."
+    )
+  }
+
+  if (!is.null(names(col.types))) {
+    if (!is.null(col.names)) {
+      warning("Ignoring `col.names` as column names are provided by `col.types`.")
+    }
+    return(col.types)
+  }
+
+  # Unnamed col.types: take the names from col.names if given, else from the file
+  fields <- col.types
+  names(fields) <- if (is.null(col.names)) names(found) else col.names
+  fields
+}
+
 #' Deprecated functions
 #'
 #' `read_csv_duckdb()` has been superseded by `duckdb_read_csv()`.
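
A minimal sketch of how the three accepted `col.names`/`col.types` combinations resolve
to the `fields` argument passed to `dbCreateTable()`, using the `set_csv_fields()` helper
above. Here `found` stands in for the zero-row header data frame detected from the file;
the column names and types are illustrative only.

    # `found` mimics the detected header of a two-column CSV
    found <- data.frame(a = character(), n = numeric())

    # 1. Unnamed col.types, no col.names: names come from the file
    set_csv_fields(found, col.names = NULL, col.types = c("VARCHAR", "DOUBLE"))
    #>         a        n
    #> "VARCHAR" "DOUBLE"

    # 2. Unnamed col.types plus col.names: col.names supplies the names
    set_csv_fields(found, c("x", "y"), c("VARCHAR", "DOUBLE"))
    #>         x        y
    #> "VARCHAR" "DOUBLE"

    # 3. Named col.types wins over col.names (with a warning)
    set_csv_fields(found, c("x", "y"), c(a = "VARCHAR", n = "DOUBLE"))
    #>         a        n
    #> "VARCHAR" "DOUBLE"
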
diff --git a/man/duckdb_read_csv.Rd b/man/duckdb_read_csv.Rd
index 0508ae1cd..24707a8cf 100644
--- a/man/duckdb_read_csv.Rd
+++ b/man/duckdb_read_csv.Rd
@@ -15,6 +15,7 @@ duckdb_read_csv(
   delim = ",",
   quote = "\\"",
   col.names = NULL,
+  col.types = NULL,
   lower.case.names = FALSE,
   sep = delim,
   transaction = TRUE,
@@ -42,6 +43,10 @@ duckdb_read_csv(
 \item{col.names}{Override the detected or generated column names}
 
+\item{col.types}{Character vector of column types, in the same order as the columns in the file (or as \code{col.names}),
+or a named character vector where the names are column names and the values are their types.
+Valid types are \href{https://duckdb.org/docs/sql/data_types/overview.html}{DuckDB data types}, e.g. VARCHAR, DOUBLE, DATE, BIGINT, BOOLEAN.}
+
 \item{lower.case.names}{Transform column names to lower case}
 
 \item{sep}{Alias for delim for compatibility}
@@ -57,6 +62,9 @@ The number of rows in the resulted table, invisibly.
 Directly reads a CSV file into DuckDB, tries to detect and create the correct schema for it.
 This usually is much faster than reading the data into R and writing it to DuckDB.
 }
+\details{
+If the table already exists in the database, the CSV data is appended to it. Otherwise, the table is created.
+}
 \examples{
 \dontshow{if (duckdb:::TEST_RE2) withAutoprint(\{ # examplesIf}
 con <- dbConnect(duckdb())
@@ -70,5 +78,23 @@ duckdb_read_csv(con, "data", path)
 dbReadTable(con, "data")
 
 dbDisconnect(con)
+
+
+# Providing data types for columns
+path <- tempfile(fileext = ".csv")
+write.csv(iris, path, row.names = FALSE)
+
+con <- dbConnect(duckdb())
+duckdb_read_csv(con, "iris", path,
+  col.types = c(
+    Sepal.Length = "DOUBLE",
+    Sepal.Width = "DOUBLE",
+    Petal.Length = "DOUBLE",
+    Petal.Width = "DOUBLE",
+    Species = "VARCHAR"
+  )
+)
+dbReadTable(con, "iris")
+dbDisconnect(con)
 \dontshow{\}) # examplesIf}
 }
diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R
index 8f1c82627..dff183496 100644
--- a/tests/testthat/test-read.R
+++ b/tests/testthat/test-read.R
@@ -4,6 +4,7 @@ test_that("duckdb_read_csv() works as expected", {
   con <- dbConnect(duckdb())
 
   tf <- tempfile()
+  tf2 <- tempfile()
 
   # default case
   write.csv(iris, tf, row.names = FALSE)
@@ -117,7 +118,7 @@ test_that("duckdb_read_csv() works as expected", {
     '"num","char","logi","lisst.1","lisst.2","lisst.3","lisst.NA"',
     '0.5,"yes",TRUE,1,2,3,NA',
     '2,"no",FALSE,1,2,3,NA',
-    'NA,NA,NA,1,2,3,NA'
+    "NA,NA,NA,1,2,3,NA"
   )
   writeLines(csv, tf3)
   duckdb_read_csv(con, "na_table", tf3, na.strings = "-")
@@ -140,3 +141,129 @@ test_that("duckdb_read_csv() works as expected", {
 
   dbDisconnect(con, shutdown = TRUE)
 })
+
+describe("duckdb_read_csv", {
+  skip_if_not(TEST_RE2)
+
+  tf <- tempfile()
+  con <- dbConnect(duckdb())
+
+  it("col.types arg works with vector of types and inferred colnames", {
+    write.csv(iris, tf, row.names = FALSE)
+    duckdb_read_csv(con, "iris", tf,
+      col.types = c(
+        "DOUBLE",
+        "DOUBLE",
+        "DOUBLE",
+        "DOUBLE",
+        "VARCHAR"
+      )
+    )
+
+    res <- dbReadTable(con, "iris")
+    res$Species <- as.factor(res$Species)
+    expect_identical(res, iris)
+    dbRemoveTable(con, "iris")
+  })
+
+  it("col.types and col.names work together when unnamed", {
+    write.csv(iris, tf, row.names = FALSE)
+    duckdb_read_csv(
+      con, "iris", tf,
+      col.names = c("S.Length", "S.Width", "P.Length", "P.Width", "Species"),
+      col.types = c("DOUBLE", "DOUBLE", "DOUBLE", "DOUBLE", "VARCHAR")
+    )
+
+    res <- dbReadTable(con, "iris")
+    res$Species <- as.factor(res$Species)
+    iris_renamed <- setNames(iris, c("S.Length", "S.Width", "P.Length", "P.Width", "Species"))
+    expect_identical(res, iris_renamed)
+    dbRemoveTable(con, "iris")
+  })
+
+  it("col.types overrides col.names when col.types is named", {
+    write.csv(iris, tf, row.names = FALSE)
+    expect_warning(
+      duckdb_read_csv(con, "iris", tf,
+        col.names = c("A", "B", "C", "D", "E"),
+        col.types = c(
+          Sepal.Length = "DOUBLE",
+          Sepal.Width = "DOUBLE",
+          Petal.Length = "DOUBLE",
+          Petal.Width = "DOUBLE",
+          Species = "VARCHAR"
+        )
+      )
+    )
+
+    res <- dbReadTable(con, "iris")
+    res$Species <- as.factor(res$Species)
+    expect_identical(res, iris)
+    dbRemoveTable(con, "iris")
+  })
+
+  it("lower.case.names works as expected with col.types named vector", {
+    write.csv(iris, tf, row.names = FALSE)
+    duckdb_read_csv(con, "iris", tf,
+      col.types = c(
+        S.lEngth = "DOUBLE",
+        S.wiDth = "DOUBLE",
+        p.leNgth = "DOUBLE",
+        p.Width = "DOUBLE",
+        spEc = "VARCHAR"
+      ),
+      lower.case.names = TRUE
+    )
+
+    res <- dbReadTable(con, "iris")
+    res$spec <- as.factor(res$spec)
+    iris_renamed <- setNames(iris, c("s.length", "s.width", "p.length", "p.width", "spec"))
+    expect_identical(res, iris_renamed)
+    dbRemoveTable(con, "iris")
+  })
+
+  it("errors when col.types length does not match the number of columns in the file", {
+    write.csv(iris, tf, row.names = FALSE)
+    expect_error(duckdb_read_csv(con, "iris", tf, col.types = rep("VARCHAR", 4)))
+  })
+
+  it("invalid col.types gives error", {
+    write.csv(iris, tf, row.names = FALSE)
+    expect_error(
+      duckdb_read_csv(con, "iris", tf,
+        col.types = c(
+          Sepal.Length = "DOUBLE",
+          Sepal.Width = "DOUBLE",
+          Petal.Length = "DOUBLE",
+          Petal.Width = "DOUBLE",
+          Species = "DOUBLE"
+        )
+      )
+    )
+  })
+
+  it("date types work as expected", {
+    dates_df <- data.frame(dates = as.Date(1:10, origin = "2020-01-01"))
+    write.csv(dates_df, tf, row.names = FALSE)
+    duckdb_read_csv(con, "dates_test", tf, col.types = c(dates = "DATE"))
+
+    res <- dbReadTable(con, "dates_test")
+    expect_identical(res, dates_df)
+    dbRemoveTable(con, "dates_test")
+  })
+
+  dbDisconnect(con, shutdown = TRUE)
+})