Skip to content

Commit

Permalink
Merge pull request #211 from duckdb/f-133-tbl
Browse files Browse the repository at this point in the history
  • Loading branch information
krlmlr authored Aug 17, 2024
2 parents 7c94631 + 79b6462 commit e77abbd
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 56 deletions.
1 change: 0 additions & 1 deletion .github/dep-suggests-matrix.json
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
{"package":["adbcdrivermanager","arrow","bit64","clock","DBItest","dbplyr","dplyr"]}
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ export(duckdb_unregister_arrow)
export(read_csv_duckdb)
export(simulate_duckdb)
export(tbl_file)
export(tbl_function)
export(tbl_query)
export(translate_duckdb)
exportClasses(duckdb_connection)
exportClasses(duckdb_driver)
exportClasses(duckdb_explain)
Expand Down
74 changes: 45 additions & 29 deletions R/backend-dbplyr__duckdb_connection.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,35 +11,26 @@
#' library(dplyr, warn.conflicts = FALSE)
#' con <- DBI::dbConnect(duckdb(), dbdir = ":memory:")
#'
#' dbiris <- copy_to(con, iris, overwrite = TRUE)
#' db <- copy_to(con, data.frame(a = 1:3, b = letters[2:4]))
#'
#' dbiris %>%
#' select(Petal.Length, Petal.Width) %>%
#' filter(Petal.Length > 1.5) %>%
#' head(5)
#' db %>%
#' filter(a > 1) %>%
#' select(b)
#'
#' path <- tempfile(fileext = ".csv")
#' write.csv(data.frame(a = 1:3, b = letters[2:4]), path, row.names = FALSE)
#'
#' db_csv <- tbl_file(con, path)
#' db_csv %>%
#' summarize(sum_a = sum(a))
#'
#' db_csv_fun <- tbl_function(con, paste0("read_csv_auto('", path, "')"))
#' db_csv_fun %>%
#' count()
#'
#' DBI::dbDisconnect(con, shutdown = TRUE)
NULL

#' Connection object for simulating SQL generation without an actual database.
#' dbplyr overrides database-specific identifier and string quotes.
#' @param ... Any parameters to be forwarded
#' @export
#' @rdname backend-duckdb
simulate_duckdb <- function(...) {
  # Class vector of the dummy connection object.
  sim_classes <- c("duckdb_connection", "TestConnection", "DBIConnection")
  # An empty list carries the classes; anything in `...` is attached as
  # additional attributes by structure().
  structure(list(), ..., class = sim_classes)
}

#' Connection object for simulating SQL generation without an actual database.
#' This version keeps the database-specific identifier and string quotes, i.e.
#' it allows translating to the DuckDB SQL dialect.
#' @param ... Any parameters to be forwarded
#' @export
#' @rdname backend-duckdb
translate_duckdb <- function(...) {
  # Same dummy object as a simulated connection, but without the
  # "TestConnection" class in the class vector.
  translate_classes <- c("duckdb_connection", "DBIConnection")
  # Anything in `...` is attached as additional attributes by structure().
  structure(list(), ..., class = translate_classes)
}

# Declare which version of dbplyr API is being called.
# @param con A \code{\link{dbConnect}} object, as returned by \code{dbConnect()}
# @name dbplyr_edition
Expand Down Expand Up @@ -437,7 +428,7 @@ tbl.duckdb_connection <- function(src, from, ..., cache = FALSE) {
NextMethod("tbl")
}

#' Create a lazy table from a Parquet or SQL file
#' Create a lazy table from a Parquet file or SQL query
#'
#' `tbl_file()` is an experimental variant of [dplyr::tbl()] to directly access files on disk.
#' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the request,
Expand All @@ -458,26 +449,51 @@ tbl_file <- function(src, path, ..., cache = FALSE) {
if (grepl("'", path)) {
stop("File '", path, "' contains a single quote, this is not supported", call. = FALSE)
}
tbl_query(src, paste0("'", path, "'"), cache = cache)
tbl_function(src, paste0("'", path, "'"), cache = cache)
}

#' Create a lazy table from a query
#'
#' `tbl_query()` is an experimental variant of [dplyr::tbl()]
#' @description
#' `tbl_function()` is an experimental variant of [dplyr::tbl()]
#' to create a lazy table from a table-generating function,
#' useful for reading nonstandard CSV files or other data sources.
#' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the query.
#' Use `dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))` for custom SQL queries.
#' See <https://duckdb.org/docs/data/overview> for details on data importing functions.
#'
#' As an alternative, use `dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))` for custom SQL queries.
#'
#' @param query SQL code, omitting the `FROM` clause
#' @export
#' @rdname backend-duckdb
tbl_query <- function(src, query, ..., cache = FALSE) {
tbl_function <- function(src, query, ..., cache = FALSE) {
  # Optionally switch on the object cache on this connection before the
  # lazy table is created.
  if (cache) {
    DBI::dbExecute(src, "PRAGMA enable_object_cache")
  }

  # Prefix the caller's expression with "FROM " and mark it as literal SQL
  # so that dplyr::tbl() passes it through unchanged.
  from_sql <- dplyr::sql(paste0("FROM ", query))
  dplyr::tbl(src, from_sql)
}

#' Deprecated
#'
#' `tbl_query()` is deprecated in favor of `tbl_function()`.
#' @export
#' @rdname backend-duckdb
tbl_query <- function(src, query, ...) {
  # Signal the standard deprecation warning naming the replacement.
  .Deprecated(new = "tbl_function")

  # Delegate to the replacement; extra arguments are forwarded through `...`.
  lazy_tbl <- tbl_function(src, query, ...)
  lazy_tbl
}

#' Connection object for simulating SQL generation without an actual database.
#' dbplyr overrides database-specific identifier and string quotes.
#'
#' Use `simulate_duckdb()` with `lazy_frame()`
#' to see simulated SQL without opening a DuckDB connection.
#' @param ... Any parameters to be forwarded
#' @export
#' @rdname backend-duckdb
simulate_duckdb <- function(...) {
  # Class vector of the dummy connection object.
  sim_classes <- c("duckdb_connection", "TestConnection", "DBIConnection")
  # An empty list carries the classes; anything in `...` is attached as
  # additional attributes by structure().
  structure(list(), ..., class = sim_classes)
}


# Declare the names that appear as bare symbols in sql_expr() calls so that
# R CMD check does not emit notes about them.
utils::globalVariables(c(
  "REGEXP_MATCHES", "CAST", "%AS%", "INTEGER", "XOR", "%<<%", "%>>%",
  "LN", "LOG", "ROUND", "ROUND_EVEN", "EXTRACT", "%FROM%", "MONTH",
  "STRFTIME", "QUARTER", "YEAR", "DATE_TRUNC", "DATE", "DOY",
  "TO_SECONDS", "BIGINT", "TO_MINUTES", "TO_HOURS", "TO_DAYS", "TO_WEEKS",
  "TO_MONTHS", "TO_YEARS", "STRPOS", "NOT", "REGEXP_REPLACE", "TRIM",
  "LPAD", "RPAD", "%||%", "REPEAT", "LENGTH", "STRING_AGG", "GREATEST",
  "LIST_EXTRACT", "LOG10", "LOG2", "STRING_SPLIT_REGEX", "FLOOR", "FMOD",
  "FDIV"
))
46 changes: 31 additions & 15 deletions man/backend-duckdb.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 10 additions & 10 deletions tests/testthat/test-tbl__duckdb_connection.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ test_that("Parquet files can be registered with dplyr::tbl()", {
expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000)
})

test_that("Parquet files can be registered with tbl_file() and tbl_query()", {
test_that("Parquet files can be registered with tbl_file() and tbl_function()", {
skip_if_not_installed("dbplyr")

con <- DBI::dbConnect(duckdb())
Expand All @@ -38,15 +38,15 @@ test_that("Parquet files can be registered with tbl_file() and tbl_query()", {
expect_true(inherits(tab0, "tbl_duckdb_connection"))
expect_true(tab0 %>% dplyr::count() %>% dplyr::collect() == 1000)

tab1 <- tbl_query(con, "read_parquet(['data/userdata1.parquet'])")
tab1 <- tbl_function(con, "read_parquet(['data/userdata1.parquet'])")
expect_true(inherits(tab1, "tbl_duckdb_connection"))
expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 1000)

tab2 <- tbl_query(con, "'data/userdata1.parquet'")
tab2 <- tbl_function(con, "'data/userdata1.parquet'")
expect_true(inherits(tab2, "tbl_duckdb_connection"))
expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 1000)

tab3 <- tbl_query(con, "parquet_scan(['data/userdata1.parquet'])")
tab3 <- tbl_function(con, "parquet_scan(['data/userdata1.parquet'])")
expect_true(inherits(tab3, "tbl_duckdb_connection"))
expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000)
})
Expand All @@ -69,7 +69,7 @@ test_that("Object cache can be enabled for parquet files with dplyr::tbl()", {
expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false")
})

test_that("Object cache can be enabled for parquet files with tbl_file() and tbl_query()", {
test_that("Object cache can be enabled for parquet files with tbl_file() and tbl_function()", {
skip_if_not_installed("dbplyr")
# https://github.com/tidyverse/dbplyr/issues/1384
skip_if(packageVersion("dbplyr") >= "2.4.0")
Expand All @@ -82,7 +82,7 @@ test_that("Object cache can be enabled for parquet files with tbl_file() and tbl
expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "true")

DBI::dbExecute(con, "SET enable_object_cache=False;")
tab2 <- tbl_query(con, "'data/userdata1.parquet'", cache = FALSE)
tab2 <- tbl_function(con, "'data/userdata1.parquet'", cache = FALSE)
expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false")
})

Expand All @@ -108,7 +108,7 @@ test_that("CSV files can be registered with dplyr::tbl()", {
expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150)
})

test_that("CSV files can be registered with tbl_file() and tbl_query()", {
test_that("CSV files can be registered with tbl_file() and tbl_function()", {
skip_if_not_installed("dbplyr")

path <- file.path(tempdir(), "duckdbtest.csv")
Expand All @@ -122,7 +122,7 @@ test_that("CSV files can be registered with tbl_file() and tbl_query()", {
expect_true(inherits(tab1, "tbl_duckdb_connection"))
expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 150)

tab2 <- tbl_query(con, paste0("read_csv_auto('", path, "')"))
tab2 <- tbl_function(con, paste0("read_csv_auto('", path, "')"))
expect_true(inherits(tab2, "tbl_duckdb_connection"))
expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150)
})
Expand All @@ -141,13 +141,13 @@ test_that("Other replacement scans or functions can be registered with dplyr::tb
expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1)
})

test_that("Other replacement scans or functions can be registered with tbl_query()", {
test_that("Other replacement scans or functions can be registered with tbl_function()", {
  skip_if_not_installed("dbplyr")

  # Fresh connection for this test; closed and shut down when the test exits.
  con <- DBI::dbConnect(duckdb())
  on.exit(DBI::dbDisconnect(con, shutdown = TRUE))

  # Build a lazy table from a table function call.
  keywords_tbl <- tbl_function(con, "duckdb_keywords()")
  expect_true(inherits(keywords_tbl, "tbl_duckdb_connection"))

  # Exactly one row is expected for the keyword "all".
  n_all <- keywords_tbl %>%
    dplyr::filter(keyword_name == "all") %>%
    dplyr::count() %>%
    dplyr::collect()
  expect_true(n_all == 1)
})
Expand Down

0 comments on commit e77abbd

Please sign in to comment.