feat: Add col.types argument to duckdb_read_csv() (#445)
* add col.types to duckdb_read_csv

* ref: duckdb_read_csv

* feat: add dates test to read_csv_duckdb

* post check addition

* refactor tests for more insightful test messages

* Apply suggestions from code review

* fix: name to names

* doc: add link to duckdb data type docs

* Formatting

* Document

---------

Co-authored-by: Kirill Müller <krlmlr@users.noreply.github.com>
Co-authored-by: Kirill Müller <kirill@cynkra.com>
3 people authored Oct 28, 2024
1 parent 1e300e7 commit 7c8f306
Showing 3 changed files with 243 additions and 20 deletions.
108 changes: 89 additions & 19 deletions R/csv.R
@@ -3,6 +3,8 @@
#' Directly reads a CSV file into DuckDB and tries to detect and create the correct schema for it.
#' This is usually much faster than reading the data into R and writing it to DuckDB.
#'
#' If the table already exists in the database, the CSV is appended to it. Otherwise, the table is created.
#'
#' @inheritParams duckdb_register
#' @param files One or more CSV file names; all files should have the same structure
#' @param ... Reserved for future extensions, must be empty.
@@ -12,6 +14,9 @@
#' @param delim Which field separator should be used
#' @param quote Which quote character is used for columns in the CSV file
#' @param col.names Override the detected or generated column names
#' @param col.types Character vector of column types, in the same order as the columns
#'   in the file, or a named character vector whose names are column names and whose
#'   values are the corresponding types.
#'   Valid types are \href{https://duckdb.org/docs/sql/data_types/overview.html}{DuckDB data types},
#'   e.g. VARCHAR, DOUBLE, DATE, BIGINT, or BOOLEAN.
#' @param lower.case.names Transform column names to lower case
#' @param sep Alias for delim for compatibility
#' @param transaction Should a transaction be used for the entire operation
@@ -30,6 +35,25 @@
#' dbReadTable(con, "data")
#'
#' dbDisconnect(con)
#'
#'
#' # Providing data types for columns
#' path <- tempfile(fileext = ".csv")
#' write.csv(iris, path, row.names = FALSE)
#'
#' con <- dbConnect(duckdb())
#' duckdb_read_csv(con, "iris", path,
#' col.types = c(
#' Sepal.Length = "DOUBLE",
#' Sepal.Width = "DOUBLE",
#' Petal.Length = "DOUBLE",
#' Petal.Width = "DOUBLE",
#' Species = "VARCHAR"
#' )
#' )
#' dbReadTable(con, "iris")
#' dbDisconnect(con)
#'
duckdb_read_csv <- function(
conn,
name,
@@ -41,6 +65,7 @@ duckdb_read_csv <- function(
delim = ",",
quote = "\"",
col.names = NULL,
col.types = NULL,
lower.case.names = FALSE,
sep = delim,
transaction = TRUE,
@@ -51,16 +76,25 @@
if (length(na.strings) > 1) stop("na.strings must be of length 1")
if (!missing(sep)) delim <- sep

headers <- lapply(files, utils::read.csv,
sep = delim, na.strings = na.strings,
quote = quote, nrows = nrow.check, header = header, ...
)
if (length(files) > 1) {
nn <- sapply(headers, ncol)
if (!all(nn == nn[1])) stop("Files have different numbers of columns")
nms <- sapply(headers, names)
if (!all(nms == nms[, 1])) stop("Files have different variable names")
types <- sapply(headers, function(df) sapply(df, dbDataType, dbObj = conn))
if (!all(types == types[, 1])) stop("Files have different variable types")
if (!all(nms == nms[, 1])) stop("Files have different variable names or order")
    # Detected types only need to agree across files when the user has not
    # supplied col.types explicitly
    if (is.null(col.types)) {
types <- sapply(headers, function(df) sapply(df, dbDataType, dbObj = conn))
if (!all(types == types[, 1])) stop("Files have different variable types")
}
}

  # A zero-row slice of the first header keeps the detected column names and
  # classes; set_csv_fields() reconciles them with col.names and col.types
  fields <- set_csv_fields(found = headers[[1]][FALSE, , drop = FALSE], col.names, col.types)

  if (lower.case.names) names(fields) <- tolower(names(fields))

if (transaction) {
dbBegin(conn)
on.exit(tryCatch(dbRollback(conn), error = function(e) {}))
@@ -69,21 +103,7 @@
tablename <- dbQuoteIdentifier(conn, name)

if (!dbExistsTable(conn, tablename)) {
dbCreateTable(conn, tablename, fields, temporary = temporary)
}

for (i in seq_along(files)) {
@@ -100,6 +120,56 @@
invisible(out)
}


#' Column names and types logic for duckdb_read_csv()
#'
#' @param found the detected (found) header and types from `utils::read.csv`
#' @param col.names user-provided column names
#' @param col.types user-provided column types, optionally named with column names
#'
#' @noRd
#' @return a valid `fields` argument for `dbCreateTable()`
set_csv_fields <- function(found, col.names, col.types) {
  if (is.null(col.names) && is.null(col.types)) {
return(found)
}

if (!is.null(names(col.types)) && !is.null(col.names)) {
warning("Ignoring `col.names` as column names provided by `col.types` parameter")
return(col.types)
}

  if (!is.null(col.types)) {
    if (length(col.types) != ncol(found)) {
      stop(
        "You supplied ", length(col.types), " values to `col.types`, but file has ",
        ncol(found), " columns."
      )
    }

    if (!is.null(names(col.types))) {
      return(col.types)
    }

    # Unnamed col.types: pair the types with col.names if given,
    # otherwise with the detected names
    fields <- col.types
    names(fields) <- if (is.null(col.names)) names(found) else col.names
    return(fields)
  }

  # Only col.names provided: keep the detected types, override the names
  if (length(col.names) != ncol(found)) {
    stop(
      "You supplied ", length(col.names), " values to `col.names`, but file has ",
      ncol(found), " columns."
    )
  }
  fields <- found
  names(fields) <- col.names
  fields
}
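
# Illustration (editor's note, not part of the package source): how the rules
# above resolve for a file whose detected header is `a`, `b`:
#
#   set_csv_fields(found, NULL, NULL)
#   #> detected names and types, unchanged
#   set_csv_fields(found, NULL, c("DATE", "VARCHAR"))
#   #> detected names `a`, `b` with the given types
#   set_csv_fields(found, c("x", "y"), c("DATE", "VARCHAR"))
#   #> names `x`, `y` with the given types
#   set_csv_fields(found, NULL, c(a = "DATE", b = "VARCHAR"))
#   #> names and types both taken from col.types
#   set_csv_fields(found, c("x", "y"), c(a = "DATE", b = "VARCHAR"))
#   #> warning; col.names ignored, col.types wins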




#' Deprecated functions
#'
#' `read_csv_duckdb()` has been superseded by `duckdb_read_csv()`.
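As a usage sketch (editor's illustration, not part of the commit): the unnamed
form of `col.types` pairs positionally with `col.names`, complementing the named
form shown in the roxygen example above. This assumes only base R and the duckdb
package; the table name and path are illustrative.

library(duckdb)

path <- tempfile(fileext = ".csv")
write.csv(mtcars[, c("mpg", "cyl", "disp")], path, row.names = FALSE)

con <- dbConnect(duckdb())
# Unnamed col.types are matched to columns by position;
# col.names overrides the header detected in the file.
duckdb_read_csv(
  con, "cars", path,
  col.names = c("miles_per_gallon", "cylinders", "displacement"),
  col.types = c("DOUBLE", "INTEGER", "DOUBLE")
)
dbReadTable(con, "cars")
dbDisconnect(con, shutdown = TRUE)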
26 changes: 26 additions & 0 deletions man/duckdb_read_csv.Rd


129 changes: 128 additions & 1 deletion tests/testthat/test-read.R
@@ -4,6 +4,7 @@ test_that("duckdb_read_csv() works as expected", {
con <- dbConnect(duckdb())

tf <- tempfile()
tf2 <- tempfile()

# default case
write.csv(iris, tf, row.names = FALSE)
@@ -117,7 +118,7 @@
'"num","char","logi","lisst.1","lisst.2","lisst.3","lisst.NA"',
'0.5,"yes",TRUE,1,2,3,NA',
'2,"no",FALSE,1,2,3,NA',
"NA,NA,NA,1,2,3,NA"
)
writeLines(csv, tf3)
duckdb_read_csv(con, "na_table", tf3, na.strings = "-")
@@ -140,3 +141,129 @@

dbDisconnect(con, shutdown = TRUE)
})

describe("duckdb_read_csv", {

skip_if_not(TEST_RE2)
tf <- tempfile()
con <- dbConnect(duckdb())

it("col.types arg works with vector of types and inferred colnames", {

# Case with col.types as character vector
write.csv(iris, tf, row.names = FALSE)
duckdb_read_csv(con, "iris", tf,
col.types = c(
"DOUBLE",
"DOUBLE",
"DOUBLE",
"DOUBLE",
"VARCHAR"
)
)

res <- dbReadTable(con, "iris")
res$Species <- as.factor(res$Species)
expect_true(identical(res, iris))
dbRemoveTable(con, "iris")

})

it("col.types and col.names work together when unnamed", {

write.csv(iris, tf, row.names = FALSE)
duckdb_read_csv(
con, "iris", tf,
col.names = c("S.Length", "S.Width", "P.Length", "P.Width", "Species"),
col.types = c("DOUBLE", "DOUBLE", "DOUBLE", "DOUBLE", "VARCHAR")
)

res <- dbReadTable(con, "iris")
res$Species <- as.factor(res$Species)
iris_renamed <- setNames(iris, c("S.Length", "S.Width", "P.Length", "P.Width", "Species"))
expect_true(identical(res, iris_renamed))
dbRemoveTable(con, "iris")

})

it("col.types overwrites col.names when col.types is named", {

write.csv(iris, tf, row.names = FALSE)
expect_warning(
duckdb_read_csv(con, "iris", tf,
col.names = c("A", "B", "C", "D", "E"),
col.types = c(
Sepal.Length = "DOUBLE",
Sepal.Width = "DOUBLE",
Petal.Length = "DOUBLE",
Petal.Width = "DOUBLE",
Species = "VARCHAR"
)
)
)
res <- dbReadTable(con, "iris")
res$Species <- as.factor(res$Species)
expect_true(identical(res, iris))
dbRemoveTable(con, "iris")

})

it("lower.case.names works as expected with col.types named vector", {

write.csv(iris, tf, row.names = FALSE)
duckdb_read_csv(con, "iris", tf,
col.types = c(
S.lEngth = "DOUBLE",
S.wiDth = "DOUBLE",
p.leNgth = "DOUBLE",
p.Width = "DOUBLE",
spEc = "VARCHAR"
), lower.case.names = TRUE)

res <- dbReadTable(con, "iris")
res$spec <- as.factor(res$spec)
iris_renamed <- setNames(iris, tolower(c("s.length", "s.width", "p.length", "p.width", "spec")))
expect_true(identical(res, iris_renamed))
dbRemoveTable(con, "iris")

})

it("error when col.types length not equal to number of cols in file", {

write.csv(iris, tf, row.names = FALSE)
expect_error(duckdb_read_csv(con, "iris", tf, col.types = c(rep("VARCHAR", 4))))
dbRemoveTable(con, "iris")

})

it("invalid col.types gives error", {

write.csv(iris, tf, row.names = FALSE)
expect_error(
duckdb_read_csv(con, "iris", tf,
col.types = c(
Sepal.Length = "DOUBLE",
Sepal.Width = "DOUBLE",
Petal.Length = "DOUBLE",
Petal.Width = "DOUBLE",
Species = "DOUBLE"
)
)
)
})

it("test date types works as expected", {

    dates_df <- data.frame(dates = as.Date(1:10, origin = "2020-01-01"))
    write.csv(dates_df, tf, row.names = FALSE)
    duckdb_read_csv(con, "dates_test", tf, col.types = c(dates = "DATE"))

res <- dbReadTable(con, "dates_test")
expect_true(identical(res, dates_df))
dbRemoveTable(con, "dates_test") # Corrected table name

})

dbDisconnect(con, shutdown = TRUE)

})
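
One documented behavior the suite above does not exercise is the append path;
a minimal sketch (editor's illustration, not part of the commit):

library(duckdb)

tf <- tempfile(fileext = ".csv")
write.csv(iris, tf, row.names = FALSE)

con <- dbConnect(duckdb())
duckdb_read_csv(con, "iris", tf) # table does not exist yet: created, 150 rows
duckdb_read_csv(con, "iris", tf) # table exists: appended, now 300 rows
nrow(dbReadTable(con, "iris"))
dbDisconnect(con, shutdown = TRUE)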
