From 7c8f30695745aa304b73b91bfe8c397cb8eb7677 Mon Sep 17 00:00:00 2001
From: Eli Daniels <90638398+eli-daniels@users.noreply.github.com>
Date: Mon, 28 Oct 2024 18:51:16 +0100
Subject: [PATCH] feat: Add `col.types` argument to `duckdb_read_csv()` (#445)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add col.types to duckdb_read_csv
* ref: duckdb_read_csv
* feat: add dates test to read_csv_duckdb
* post check addition
* refactor tests for more insightful test messages
* Apply suggestions from code review
* fix: name to names
* doc: add link to duckdb data type docs
* Formatting
* Document

---------

Co-authored-by: Kirill Müller
Co-authored-by: Kirill Müller
---
 R/csv.R                    | 108 +++++++++++++++++++++++++------
 man/duckdb_read_csv.Rd     |  26 ++++++++
 tests/testthat/test-read.R | 129 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 243 insertions(+), 20 deletions(-)

diff --git a/R/csv.R b/R/csv.R
index 5debde7ac..e8bcc244b 100644
--- a/R/csv.R
+++ b/R/csv.R
@@ -3,6 +3,8 @@
 #' Directly reads a CSV file into DuckDB, tries to detect and create the correct schema for it.
 #' This usually is much faster than reading the data into R and writing it to DuckDB.
 #'
+#' If the table already exists in the database, the CSV data is appended to it. Otherwise, the table is created.
+#'
 #' @inheritParams duckdb_register
 #' @param files One or more CSV file names, should all have the same structure though
 #' @param ... Reserved for future extensions, must be empty.
@@ -12,6 +14,9 @@
 #' @param delim Which field separator should be used
 #' @param quote Which quote character is used for columns in the CSV file
 #' @param col.names Override the detected or generated column names
+#' @param col.types Character vector of column types, in the same order as the columns in the file (or as `col.names`),
+#'   or a named character vector where the names are column names and the values are their types.
+#'   Valid types are \href{https://duckdb.org/docs/sql/data_types/overview.html}{DuckDB data types}, e.g. VARCHAR, DOUBLE, DATE, BIGINT, BOOLEAN.
 #' @param lower.case.names Transform column names to lower case
 #' @param sep Alias for delim for compatibility
 #' @param transaction Should a transaction be used for the entire operation
@@ -30,6 +35,25 @@
 #' dbReadTable(con, "data")
 #'
 #' dbDisconnect(con)
+#'
+#'
+#' # Providing data types for columns
+#' path <- tempfile(fileext = ".csv")
+#' write.csv(iris, path, row.names = FALSE)
+#'
+#' con <- dbConnect(duckdb())
+#' duckdb_read_csv(con, "iris", path,
+#'   col.types = c(
+#'     Sepal.Length = "DOUBLE",
+#'     Sepal.Width = "DOUBLE",
+#'     Petal.Length = "DOUBLE",
+#'     Petal.Width = "DOUBLE",
+#'     Species = "VARCHAR"
+#'   )
+#' )
+#' dbReadTable(con, "iris")
+#' dbDisconnect(con)
 duckdb_read_csv <- function(
     conn,
     name,
@@ -41,6 +65,7 @@ duckdb_read_csv <- function(
     delim = ",",
     quote = "\"",
     col.names = NULL,
+    col.types = NULL,
     lower.case.names = FALSE,
     sep = delim,
     transaction = TRUE,
@@ -51,16 +76,25 @@ duckdb_read_csv <- function(
   if (length(na.strings) > 1) stop("na.strings must be of length 1")
   if (!missing(sep)) delim <- sep
 
-  headers <- lapply(files, utils::read.csv, sep = delim, na.strings = na.strings, quote = quote, nrows = nrow.check, header = header, ...)
+  headers <- lapply(files, utils::read.csv,
+    sep = delim, na.strings = na.strings,
+    quote = quote, nrows = nrow.check, header = header, ...
+  )
 
   if (length(files) > 1) {
     nn <- sapply(headers, ncol)
     if (!all(nn == nn[1])) stop("Files have different numbers of columns")
     nms <- sapply(headers, names)
-    if (!all(nms == nms[, 1])) stop("Files have different variable names")
-    types <- sapply(headers, function(df) sapply(df, dbDataType, dbObj = conn))
-    if (!all(types == types[, 1])) stop("Files have different variable types")
+    if (!all(nms == nms[, 1])) stop("Files have different variable names or order")
+    if (is.null(col.types)) {
+      types <- sapply(headers, function(df) sapply(df, dbDataType, dbObj = conn))
+      if (!all(types == types[, 1])) stop("Files have different variable types")
+    }
   }
 
+  fields <- set_csv_fields(found = headers[[1]][FALSE, , drop = FALSE], col.names, col.types)
+
+  if (lower.case.names) {
+    names(fields) <- tolower(names(fields))
+  }
+
   if (transaction) {
     dbBegin(conn)
     on.exit(tryCatch(dbRollback(conn), error = function(e) {}))
@@ -69,21 +103,7 @@
   tablename <- dbQuoteIdentifier(conn, name)
 
   if (!dbExistsTable(conn, tablename)) {
-    if (lower.case.names) names(headers[[1]]) <- tolower(names(headers[[1]]))
-    if (!is.null(col.names)) {
-      if (lower.case.names) {
-        warning("Ignoring lower.case.names parameter as overriding col.names are supplied.")
-      }
-      col.names <- as.character(col.names)
-      if (length(unique(col.names)) != length(names(headers[[1]]))) {
-        stop(
-          "You supplied ", length(unique(col.names)), " unique column names, but file has ",
-          length(names(headers[[1]])), " columns."
-        )
-      }
-      names(headers[[1]]) <- col.names
-    }
-    dbCreateTable(conn, tablename, headers[[1]], temporary = temporary)
+    dbCreateTable(conn, tablename, fields, temporary = temporary)
   }
 
   for (i in seq_along(files)) {
@@ -100,6 +120,56 @@
   invisible(out)
 }
 
+#' Column names and types logic for duckdb_read_csv()
+#'
+#' @param found the detected header and column types from `utils::read.csv`,
+#'   as a zero-row data frame
+#' @param col.names user-provided column names
+#' @param col.types user-provided column types, optionally named by column
+#'
+#' @noRd
+#' @return a valid `fields` argument for `dbCreateTable()`
+set_csv_fields <- function(found, col.names, col.types) {
+  if (is.null(col.names) && is.null(col.types)) {
+    return(found)
+  }
+
+  if (is.null(col.types)) {
+    # Only col.names supplied: keep the detected types, rename the columns
+    col.names <- as.character(col.names)
+    if (length(unique(col.names)) != ncol(found)) {
+      stop(
+        "You supplied ", length(unique(col.names)), " unique column names, but file has ",
+        ncol(found), " columns."
+      )
+    }
+    names(found) <- col.names
+    return(found)
+  }
+
+  if (length(col.types) != ncol(found)) {
+    stop(
+      "You supplied ", length(col.types), " values to `col.types`, but file has ",
+      ncol(found), " columns."
+    )
+  }
+
+  if (!is.null(names(col.types))) {
+    if (!is.null(col.names)) {
+      warning("Ignoring `col.names` as column names are provided by `col.types`.")
+    }
+    return(col.types)
+  }
+
+  # Unnamed col.types: take the names from col.names if given, else from the file
+  fields <- col.types
+  names(fields) <- if (is.null(col.names)) names(found) else col.names
+  fields
+}
+
 #' Deprecated functions
 #'
 #' `read_csv_duckdb()` has been superseded by `duckdb_read_csv()`.
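
A minimal sketch of how the three accepted `col.names`/`col.types` combinations resolve
to the `fields` argument passed to `dbCreateTable()`, using the `set_csv_fields()` helper
above. Here `found` stands in for the zero-row header data frame detected from the file;
the column names and types are illustrative only.

    # `found` mimics the detected header of a two-column CSV
    found <- data.frame(a = character(), n = numeric())

    # 1. Unnamed col.types, no col.names: names come from the file
    set_csv_fields(found, col.names = NULL, col.types = c("VARCHAR", "DOUBLE"))
    #>         a        n
    #> "VARCHAR" "DOUBLE"

    # 2. Unnamed col.types plus col.names: col.names supplies the names
    set_csv_fields(found, c("x", "y"), c("VARCHAR", "DOUBLE"))
    #>         x        y
    #> "VARCHAR" "DOUBLE"

    # 3. Named col.types wins over col.names (with a warning)
    set_csv_fields(found, c("x", "y"), c(a = "VARCHAR", n = "DOUBLE"))
    #>         a        n
    #> "VARCHAR" "DOUBLE"
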
diff --git a/man/duckdb_read_csv.Rd b/man/duckdb_read_csv.Rd
index 0508ae1cd..24707a8cf 100644
--- a/man/duckdb_read_csv.Rd
+++ b/man/duckdb_read_csv.Rd
@@ -15,6 +15,7 @@ duckdb_read_csv(
   delim = ",",
   quote = "\\"",
   col.names = NULL,
+  col.types = NULL,
   lower.case.names = FALSE,
   sep = delim,
   transaction = TRUE,
@@ -42,6 +43,10 @@ duckdb_read_csv(
 \item{col.names}{Override the detected or generated column names}
 
+\item{col.types}{Character vector of column types, in the same order as the columns in the file (or as \code{col.names}),
+or a named character vector where the names are column names and the values are their types.
+Valid types are \href{https://duckdb.org/docs/sql/data_types/overview.html}{DuckDB data types}, e.g. VARCHAR, DOUBLE, DATE, BIGINT, BOOLEAN.}
+
 \item{lower.case.names}{Transform column names to lower case}
 
 \item{sep}{Alias for delim for compatibility}
@@ -57,6 +62,9 @@ The number of rows in the resulted table, invisibly.
 Directly reads a CSV file into DuckDB, tries to detect and create the correct schema for it.
 This usually is much faster than reading the data into R and writing it to DuckDB.
 }
+\details{
+If the table already exists in the database, the CSV data is appended to it. Otherwise, the table is created.
+}
 \examples{
 \dontshow{if (duckdb:::TEST_RE2) withAutoprint(\{ # examplesIf}
 con <- dbConnect(duckdb())
@@ -70,5 +78,23 @@ duckdb_read_csv(con, "data", path)
 dbReadTable(con, "data")
 
 dbDisconnect(con)
+
+
+# Providing data types for columns
+path <- tempfile(fileext = ".csv")
+write.csv(iris, path, row.names = FALSE)
+
+con <- dbConnect(duckdb())
+duckdb_read_csv(con, "iris", path,
+  col.types = c(
+    Sepal.Length = "DOUBLE",
+    Sepal.Width = "DOUBLE",
+    Petal.Length = "DOUBLE",
+    Petal.Width = "DOUBLE",
+    Species = "VARCHAR"
+  )
+)
+dbReadTable(con, "iris")
+dbDisconnect(con)
 \dontshow{\}) # examplesIf}
 }
diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R
index 8f1c82627..dff183496 100644
--- a/tests/testthat/test-read.R
+++ b/tests/testthat/test-read.R
@@ -4,6 +4,7 @@ test_that("duckdb_read_csv() works as expected", {
   con <- dbConnect(duckdb())
 
   tf <- tempfile()
+  tf2 <- tempfile()
 
   # default case
   write.csv(iris, tf, row.names = FALSE)
@@ -117,7 +118,7 @@ test_that("duckdb_read_csv() works as expected", {
     '"num","char","logi","lisst.1","lisst.2","lisst.3","lisst.NA"',
     '0.5,"yes",TRUE,1,2,3,NA',
     '2,"no",FALSE,1,2,3,NA',
-    'NA,NA,NA,1,2,3,NA'
+    "NA,NA,NA,1,2,3,NA"
   )
   writeLines(csv, tf3)
   duckdb_read_csv(con, "na_table", tf3, na.strings = "-")
@@ -140,3 +141,129 @@ test_that("duckdb_read_csv() works as expected", {
 
   dbDisconnect(con, shutdown = TRUE)
 })
+
+describe("duckdb_read_csv", {
+  skip_if_not(TEST_RE2)
+
+  tf <- tempfile()
+  con <- dbConnect(duckdb())
+
+  it("col.types arg works with vector of types and inferred colnames", {
+    write.csv(iris, tf, row.names = FALSE)
+    duckdb_read_csv(con, "iris", tf,
+      col.types = c(
+        "DOUBLE",
+        "DOUBLE",
+        "DOUBLE",
+        "DOUBLE",
+        "VARCHAR"
+      )
+    )
+
+    res <- dbReadTable(con, "iris")
+    res$Species <- as.factor(res$Species)
+    expect_identical(res, iris)
+    dbRemoveTable(con, "iris")
+  })
+
+  it("col.types and col.names work together when unnamed", {
+    write.csv(iris, tf, row.names = FALSE)
+    duckdb_read_csv(
+      con, "iris", tf,
+      col.names = c("S.Length", "S.Width", "P.Length", "P.Width", "Species"),
+      col.types = c("DOUBLE", "DOUBLE", "DOUBLE", "DOUBLE", "VARCHAR")
+    )
+
+    res <- dbReadTable(con, "iris")
+    res$Species <- as.factor(res$Species)
+    iris_renamed <- setNames(iris, c("S.Length", "S.Width", "P.Length", "P.Width", "Species"))
+    expect_identical(res, iris_renamed)
+    dbRemoveTable(con, "iris")
+  })
+
+  it("col.types overrides col.names when col.types is named", {
+    write.csv(iris, tf, row.names = FALSE)
+    expect_warning(
+      duckdb_read_csv(con, "iris", tf,
+        col.names = c("A", "B", "C", "D", "E"),
+        col.types = c(
+          Sepal.Length = "DOUBLE",
+          Sepal.Width = "DOUBLE",
+          Petal.Length = "DOUBLE",
+          Petal.Width = "DOUBLE",
+          Species = "VARCHAR"
+        )
+      )
+    )
+
+    res <- dbReadTable(con, "iris")
+    res$Species <- as.factor(res$Species)
+    expect_identical(res, iris)
+    dbRemoveTable(con, "iris")
+  })
+
+  it("lower.case.names works as expected with col.types named vector", {
+    write.csv(iris, tf, row.names = FALSE)
+    duckdb_read_csv(con, "iris", tf,
+      col.types = c(
+        S.lEngth = "DOUBLE",
+        S.wiDth = "DOUBLE",
+        p.leNgth = "DOUBLE",
+        p.Width = "DOUBLE",
+        spEc = "VARCHAR"
+      ),
+      lower.case.names = TRUE
+    )
+
+    res <- dbReadTable(con, "iris")
+    res$spec <- as.factor(res$spec)
+    iris_renamed <- setNames(iris, c("s.length", "s.width", "p.length", "p.width", "spec"))
+    expect_identical(res, iris_renamed)
+    dbRemoveTable(con, "iris")
+  })
+
+  it("errors when col.types length does not match the number of columns in the file", {
+    write.csv(iris, tf, row.names = FALSE)
+    expect_error(duckdb_read_csv(con, "iris", tf, col.types = rep("VARCHAR", 4)))
+  })
+
+  it("invalid col.types gives error", {
+    write.csv(iris, tf, row.names = FALSE)
+    expect_error(
+      duckdb_read_csv(con, "iris", tf,
+        col.types = c(
+          Sepal.Length = "DOUBLE",
+          Sepal.Width = "DOUBLE",
+          Petal.Length = "DOUBLE",
+          Petal.Width = "DOUBLE",
+          Species = "DOUBLE"
+        )
+      )
+    )
+  })
+
+  it("date types work as expected", {
+    dates_df <- data.frame(dates = as.Date(1:10, origin = "2020-01-01"))
+    write.csv(dates_df, tf, row.names = FALSE)
+    duckdb_read_csv(con, "dates_test", tf, col.types = c(dates = "DATE"))
+
+    res <- dbReadTable(con, "dates_test")
+    expect_identical(res, dates_df)
+    dbRemoveTable(con, "dates_test")
+  })
+
+  dbDisconnect(con, shutdown = TRUE)
+})