feat: Add col.types argument to duckdb_read_csv() (#445)
* add col.types to duckdb_read_csv

* ref: duckdb_read_csv

* feat: add dates test to read_csv_duckdb

* post check addition

* refactor tests for more insightful test messages

* Apply suggestions from code review

* fix: name to names

* doc: add link to duckdb data type docs

* Formatting

* Document

---------

Co-authored-by: Kirill Müller <krlmlr@users.noreply.github.com>
Co-authored-by: Kirill Müller <kirill@cynkra.com>
3 people authored Oct 28, 2024
1 parent 1e300e7 commit 7c8f306
Showing 3 changed files with 243 additions and 20 deletions.
108 changes: 89 additions & 19 deletions R/csv.R
@@ -3,6 +3,8 @@
#' Directly reads a CSV file into DuckDB and tries to detect and create the correct schema for it.
#' This is usually much faster than reading the data into R and writing it to DuckDB.
#'
#' If the table already exists in the database, the CSV is appended to it. Otherwise, the table is created.
#'
#' @inheritParams duckdb_register
#' @param files One or more CSV file names; all files should have the same structure
#' @param ... Reserved for future extensions, must be empty.
@@ -12,6 +14,9 @@
#' @param delim Which field separator should be used
#' @param quote Which quote character is used for columns in the CSV file
#' @param col.names Override the detected or generated column names
#' @param col.types Character vector of column types, in the same order as the columns
#'   in the file, or a named character vector whose names are column names and whose
#'   values are the corresponding types.
#'   Valid types are \href{https://duckdb.org/docs/sql/data_types/overview.html}{DuckDB data types},
#'   e.g. VARCHAR, DOUBLE, DATE, BIGINT, or BOOLEAN.
#' @param lower.case.names Transform column names to lower case
#' @param sep Alias for delim for compatibility
#' @param transaction Should a transaction be used for the entire operation
@@ -30,6 +35,25 @@
#' dbReadTable(con, "data")
#'
#' dbDisconnect(con)
#'
#'
#' # Providing data types for columns
#' path <- tempfile(fileext = ".csv")
#' write.csv(iris, path, row.names = FALSE)
#'
#' con <- dbConnect(duckdb())
#' duckdb_read_csv(con, "iris", path,
#' col.types = c(
#' Sepal.Length = "DOUBLE",
#' Sepal.Width = "DOUBLE",
#' Petal.Length = "DOUBLE",
#' Petal.Width = "DOUBLE",
#' Species = "VARCHAR"
#' )
#' )
#' dbReadTable(con, "iris")
#' dbDisconnect(con)
#'
duckdb_read_csv <- function(
conn,
name,
@@ -41,6 +65,7 @@ duckdb_read_csv <- function(
delim = ",",
quote = "\"",
col.names = NULL,
col.types = NULL,
lower.case.names = FALSE,
sep = delim,
transaction = TRUE,
@@ -51,16 +76,25 @@
if (length(na.strings) > 1) stop("na.strings must be of length 1")
if (!missing(sep)) delim <- sep

headers <- lapply(files, utils::read.csv,
sep = delim, na.strings = na.strings,
quote = quote, nrows = nrow.check, header = header, ...
)
if (length(files) > 1) {
nn <- sapply(headers, ncol)
if (!all(nn == nn[1])) stop("Files have different numbers of columns")
nms <- sapply(headers, names)
if (!all(nms == nms[, 1])) stop("Files have different variable names")
types <- sapply(headers, function(df) sapply(df, dbDataType, dbObj = conn))
if (!all(types == types[, 1])) stop("Files have different variable types")
if (!all(nms == nms[, 1])) stop("Files have different variable names or order")
    # Detected types only need to agree across files when the user has not
    # supplied col.types explicitly
    if (is.null(col.types)) {
types <- sapply(headers, function(df) sapply(df, dbDataType, dbObj = conn))
if (!all(types == types[, 1])) stop("Files have different variable types")
}
}

  # A zero-row slice of the first header keeps the detected column names and
  # classes; set_csv_fields() reconciles them with col.names and col.types
  fields <- set_csv_fields(found = headers[[1]][FALSE, , drop = FALSE], col.names, col.types)

  if (lower.case.names) names(fields) <- tolower(names(fields))

if (transaction) {
dbBegin(conn)
on.exit(tryCatch(dbRollback(conn), error = function(e) {}))
@@ -69,21 +103,7 @@
tablename <- dbQuoteIdentifier(conn, name)

if (!dbExistsTable(conn, tablename)) {
dbCreateTable(conn, tablename, fields, temporary = temporary)
}

for (i in seq_along(files)) {
@@ -100,6 +120,56 @@
invisible(out)
}


#' Column names and types logic for duckdb_read_csv()
#'
#' @param found the detected (found) header and types from `utils::read.csv`
#' @param col.names user-provided column names
#' @param col.types user-provided column types, optionally named with column names
#'
#' @noRd
#' @return a valid `fields` argument for `dbCreateTable()`
set_csv_fields <- function(found, col.names, col.types) {
  if (is.null(col.names) && is.null(col.types)) {
return(found)
}

if (!is.null(names(col.types)) && !is.null(col.names)) {
warning("Ignoring `col.names` as column names provided by `col.types` parameter")
return(col.types)
}

  if (!is.null(col.types)) {
    if (length(col.types) != ncol(found)) {
      stop(
        "You supplied ", length(col.types), " values to `col.types`, but file has ",
        ncol(found), " columns."
      )
    }

    if (!is.null(names(col.types))) {
      return(col.types)
    }

    # Unnamed col.types: pair the types with col.names if given,
    # otherwise with the detected names
    fields <- col.types
    names(fields) <- if (is.null(col.names)) names(found) else col.names
    return(fields)
  }

  # Only col.names provided: keep the detected types, override the names
  if (length(col.names) != ncol(found)) {
    stop(
      "You supplied ", length(col.names), " values to `col.names`, but file has ",
      ncol(found), " columns."
    )
  }
  fields <- found
  names(fields) <- col.names
  fields
}
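
# Illustration (editor's note, not part of the package source): how the rules
# above resolve for a file whose detected header is `a`, `b`:
#
#   set_csv_fields(found, NULL, NULL)
#   #> detected names and types, unchanged
#   set_csv_fields(found, NULL, c("DATE", "VARCHAR"))
#   #> detected names `a`, `b` with the given types
#   set_csv_fields(found, c("x", "y"), c("DATE", "VARCHAR"))
#   #> names `x`, `y` with the given types
#   set_csv_fields(found, NULL, c(a = "DATE", b = "VARCHAR"))
#   #> names and types both taken from col.types
#   set_csv_fields(found, c("x", "y"), c(a = "DATE", b = "VARCHAR"))
#   #> warning; col.names ignored, col.types wins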




#' Deprecated functions
#'
#' `read_csv_duckdb()` has been superseded by `duckdb_read_csv()`.
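As a usage sketch (editor's illustration, not part of the commit): the unnamed
form of `col.types` pairs positionally with `col.names`, complementing the named
form shown in the roxygen example above. This assumes only base R and the duckdb
package; the table name and path are illustrative.

library(duckdb)

path <- tempfile(fileext = ".csv")
write.csv(mtcars[, c("mpg", "cyl", "disp")], path, row.names = FALSE)

con <- dbConnect(duckdb())
# Unnamed col.types are matched to columns by position;
# col.names overrides the header detected in the file.
duckdb_read_csv(
  con, "cars", path,
  col.names = c("miles_per_gallon", "cylinders", "displacement"),
  col.types = c("DOUBLE", "INTEGER", "DOUBLE")
)
dbReadTable(con, "cars")
dbDisconnect(con, shutdown = TRUE)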
26 changes: 26 additions & 0 deletions man/duckdb_read_csv.Rd


129 changes: 128 additions & 1 deletion tests/testthat/test-read.R
@@ -4,6 +4,7 @@ test_that("duckdb_read_csv() works as expected", {
con <- dbConnect(duckdb())

tf <- tempfile()
tf2 <- tempfile()

# default case
write.csv(iris, tf, row.names = FALSE)
@@ -117,7 +118,7 @@
'"num","char","logi","lisst.1","lisst.2","lisst.3","lisst.NA"',
'0.5,"yes",TRUE,1,2,3,NA',
'2,"no",FALSE,1,2,3,NA',
"NA,NA,NA,1,2,3,NA"
)
writeLines(csv, tf3)
duckdb_read_csv(con, "na_table", tf3, na.strings = "-")
@@ -140,3 +141,129 @@

dbDisconnect(con, shutdown = TRUE)
})

describe("duckdb_read_csv", {

skip_if_not(TEST_RE2)
tf <- tempfile()
con <- dbConnect(duckdb())

it("col.types arg works with vector of types and inferred colnames", {

# Case with col.types as character vector
write.csv(iris, tf, row.names = FALSE)
duckdb_read_csv(con, "iris", tf,
col.types = c(
"DOUBLE",
"DOUBLE",
"DOUBLE",
"DOUBLE",
"VARCHAR"
)
)

res <- dbReadTable(con, "iris")
res$Species <- as.factor(res$Species)
expect_true(identical(res, iris))
dbRemoveTable(con, "iris")

})

it("col.types and col.names work together when unnamed", {

write.csv(iris, tf, row.names = FALSE)
duckdb_read_csv(
con, "iris", tf,
col.names = c("S.Length", "S.Width", "P.Length", "P.Width", "Species"),
col.types = c("DOUBLE", "DOUBLE", "DOUBLE", "DOUBLE", "VARCHAR")
)

res <- dbReadTable(con, "iris")
res$Species <- as.factor(res$Species)
iris_renamed <- setNames(iris, c("S.Length", "S.Width", "P.Length", "P.Width", "Species"))
expect_true(identical(res, iris_renamed))
dbRemoveTable(con, "iris")

})

it("col.types overwrites col.names when col.types is named", {

write.csv(iris, tf, row.names = FALSE)
expect_warning(
duckdb_read_csv(con, "iris", tf,
col.names = c("A", "B", "C", "D", "E"),
col.types = c(
Sepal.Length = "DOUBLE",
Sepal.Width = "DOUBLE",
Petal.Length = "DOUBLE",
Petal.Width = "DOUBLE",
Species = "VARCHAR"
)
)
)
res <- dbReadTable(con, "iris")
res$Species <- as.factor(res$Species)
expect_true(identical(res, iris))
dbRemoveTable(con, "iris")

})

it("lower.case.names works as expected with col.types named vector", {

write.csv(iris, tf, row.names = FALSE)
duckdb_read_csv(con, "iris", tf,
col.types = c(
S.lEngth = "DOUBLE",
S.wiDth = "DOUBLE",
p.leNgth = "DOUBLE",
p.Width = "DOUBLE",
spEc = "VARCHAR"
), lower.case.names = TRUE)

res <- dbReadTable(con, "iris")
res$spec <- as.factor(res$spec)
iris_renamed <- setNames(iris, tolower(c("s.length", "s.width", "p.length", "p.width", "spec")))
expect_true(identical(res, iris_renamed))
dbRemoveTable(con, "iris")

})

it("error when col.types length not equal to number of cols in file", {

write.csv(iris, tf, row.names = FALSE)
expect_error(duckdb_read_csv(con, "iris", tf, col.types = c(rep("VARCHAR", 4))))
dbRemoveTable(con, "iris")

})

it("invalid col.types gives error", {

write.csv(iris, tf, row.names = FALSE)
expect_error(
duckdb_read_csv(con, "iris", tf,
col.types = c(
Sepal.Length = "DOUBLE",
Sepal.Width = "DOUBLE",
Petal.Length = "DOUBLE",
Petal.Width = "DOUBLE",
Species = "DOUBLE"
)
)
)
})

it("test date types works as expected", {

    dates_df <- data.frame(dates = as.Date(1:10, origin = "2020-01-01"))
    write.csv(dates_df, tf, row.names = FALSE)
    duckdb_read_csv(con, "dates_test", tf, col.types = c(dates = "DATE"))

res <- dbReadTable(con, "dates_test")
expect_true(identical(res, dates_df))
dbRemoveTable(con, "dates_test") # Corrected table name

})

dbDisconnect(con, shutdown = TRUE)

})
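
One documented behavior the suite above does not exercise is the append path;
a minimal sketch (editor's illustration, not part of the commit):

library(duckdb)

tf <- tempfile(fileext = ".csv")
write.csv(iris, tf, row.names = FALSE)

con <- dbConnect(duckdb())
duckdb_read_csv(con, "iris", tf) # table does not exist yet: created, 150 rows
duckdb_read_csv(con, "iris", tf) # table exists: appended, now 300 rows
nrow(dbReadTable(con, "iris"))
dbDisconnect(con, shutdown = TRUE)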
