Merge pull request #211 from duckdb/f-133-tbl

duckdb · Aug 17, 2024 · e77abbd · e77abbd
2 parents 7c94631 + 79b6462
commit e77abbd
Show file tree

Hide file tree

Showing 5 changed files with 87 additions and 56 deletions.
diff --git a/.github/dep-suggests-matrix.json b/.github/dep-suggests-matrix.json
@@ -1 +0,0 @@
-{"package":["adbcdrivermanager","arrow","bit64","clock","DBItest","dbplyr","dplyr"]}

diff --git a/NAMESPACE b/NAMESPACE
@@ -24,8 +24,8 @@ export(duckdb_unregister_arrow)
 export(read_csv_duckdb)
 export(simulate_duckdb)
 export(tbl_file)
+export(tbl_function)
 export(tbl_query)
-export(translate_duckdb)
 exportClasses(duckdb_connection)
 exportClasses(duckdb_driver)
 exportClasses(duckdb_explain)

diff --git a/R/backend-dbplyr__duckdb_connection.R b/R/backend-dbplyr__duckdb_connection.R
@@ -11,35 +11,26 @@
 #' library(dplyr, warn.conflicts = FALSE)
 #' con <- DBI::dbConnect(duckdb(), path = ":memory:")
 #'
-#' dbiris <- copy_to(con, iris, overwrite = TRUE)
+#' db <- copy_to(con, data.frame(a = 1:3, b = letters[2:4]))
 #'
-#' dbiris %>%
-#'   select(Petal.Length, Petal.Width) %>%
-#'   filter(Petal.Length > 1.5) %>%
-#'   head(5)
+#' db %>%
+#'   filter(a > 1) %>%
+#'   select(b)
+#'
+#' path <- tempfile(fileext = ".csv")
+#' write.csv(data.frame(a = 1:3, b = letters[2:4]), path, row.names = FALSE)
+#'
+#' db_csv <- tbl_file(con, path)
+#' db_csv %>%
+#'   summarize(sum_a = sum(a))
+#'
+#' db_csv_fun <- tbl_function(con, paste0("read_csv_auto('", path, "')"))
+#' db_csv_fun %>%
+#'   count()
 #'
 #' DBI::dbDisconnect(con, shutdown = TRUE)
 NULL
 
-#' Connection object for simulation of the SQL generation without actual database.
-#' dbplyr overrides database specific identifier and string quotes
-#' @param ... Any parameters to be forwarded
-#' @export
-#' @rdname backend-duckdb
-simulate_duckdb <- function(...) {
-  structure(list(), ..., class = c("duckdb_connection", "TestConnection", "DBIConnection"))
-}
-
-#' Connection object for simulation of the SQL generation without actual database.
-#' This version keeps the database specific identifier and string quotes, i.e.
-#' allows to translate to DuckDB SQL dialect.
-#' @param ... Any parameters to be forwarded
-#' @export
-#' @rdname backend-duckdb
-translate_duckdb <- function(...) {
-  structure(list(), ..., class = c("duckdb_connection", "DBIConnection"))
-}
-
 # Declare which version of dbplyr API is being called.
 # @param con A \code{\link{dbConnect}} object, as returned by \code{dbConnect()}
 # @name dbplyr_edition
@@ -437,7 +428,7 @@ tbl.duckdb_connection <- function(src, from, ..., cache = FALSE) {
   NextMethod("tbl")
 }
 
-#' Create a lazy table from a Parquet or SQL file
+#' Create a lazy table from a Parquet file or SQL query
 #'
 #' `tbl_file()` is an experimental variant of [dplyr::tbl()] to directly access files on disk.
 #' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the request,
@@ -458,26 +449,51 @@ tbl_file <- function(src, path, ..., cache = FALSE) {
   if (grepl("'", path)) {
     stop("File '", path, "' contains a single quote, this is not supported", call. = FALSE)
   }
-  tbl_query(src, paste0("'", path, "'"), cache = cache)
+  tbl_function(src, paste0("'", path, "'"), cache = cache)
 }
 
 #' Create a lazy table from a query
 #'
-#' `tbl_query()` is an experimental variant of [dplyr::tbl()]
+#' @description
+#' `tbl_function()` is an experimental variant of [dplyr::tbl()]
 #' to create a lazy table from a table-generating function,
 #' useful for reading nonstandard CSV files or other data sources.
 #' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the query.
-#' Use `dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))` for custom SQL queries.
 #' See <https://duckdb.org/docs/data/overview> for details on data importing functions.
 #'
+#' As an alternative, use `dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))` for custom SQL queries.
+#'
 #' @param query SQL code, omitting the `FROM` clause
 #' @export
 #' @rdname backend-duckdb
-tbl_query <- function(src, query, ..., cache = FALSE) {
+tbl_function <- function(src, query, ..., cache = FALSE) {
   if (cache) DBI::dbExecute(src, "PRAGMA enable_object_cache")
   table <- dplyr::sql(paste0("FROM ", query))
   dplyr::tbl(src, table)
 }
 
+#' Deprecated
+#'
+#' `tbl_query()` is deprecated in favor of `tbl_function()`.
+#' @export
+#' @rdname backend-duckdb
+tbl_query <- function(src, query, ...) {
+  .Deprecated("tbl_function")
+  tbl_function(src, query, ...)
+}
+
+#' Connection object for simulating SQL generation without an actual database.
+#' dbplyr overrides database-specific identifier and string quotes.
+#'
+#' Use `simulate_duckdb()` with `lazy_frame()`
+#' to see simulated SQL without opening a DuckDB connection.
+#' @param ... Any parameters to be forwarded
+#' @export
+#' @rdname backend-duckdb
+simulate_duckdb <- function(...) {
+  structure(list(), ..., class = c("duckdb_connection", "TestConnection", "DBIConnection"))
+}
+
+
 # Needed to suppress the R CHECK notes (due to the use of sql_expr)
 utils::globalVariables(c("REGEXP_MATCHES", "CAST", "%AS%", "INTEGER", "XOR", "%<<%", "%>>%", "LN", "LOG", "ROUND", "ROUND_EVEN", "EXTRACT", "%FROM%", "MONTH", "STRFTIME", "QUARTER", "YEAR", "DATE_TRUNC", "DATE", "DOY", "TO_SECONDS", "BIGINT", "TO_MINUTES", "TO_HOURS", "TO_DAYS", "TO_WEEKS", "TO_MONTHS", "TO_YEARS", "STRPOS", "NOT", "REGEXP_REPLACE", "TRIM", "LPAD", "RPAD", "%||%", "REPEAT", "LENGTH", "STRING_AGG", "GREATEST", "LIST_EXTRACT", "LOG10", "LOG2", "STRING_SPLIT_REGEX", "FLOOR", "FMOD", "FDIV"))
diff --git a/man/backend-duckdb.Rd b/man/backend-duckdb.Rd
diff --git a/tests/testthat/test-tbl__duckdb_connection.R b/tests/testthat/test-tbl__duckdb_connection.R
@@ -28,7 +28,7 @@ test_that("Parquet files can be registered with dplyr::tbl()", {
   expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000)
 })
 
-test_that("Parquet files can be registered with tbl_file() and tbl_query()", {
+test_that("Parquet files can be registered with tbl_file() and tbl_function()", {
   skip_if_not_installed("dbplyr")
 
   con <- DBI::dbConnect(duckdb())
@@ -38,15 +38,15 @@ test_that("Parquet files can be registered with tbl_file() and tbl_query()", {
   expect_true(inherits(tab0, "tbl_duckdb_connection"))
   expect_true(tab0 %>% dplyr::count() %>% dplyr::collect() == 1000)
 
-  tab1 <- tbl_query(con, "read_parquet(['data/userdata1.parquet'])")
+  tab1 <- tbl_function(con, "read_parquet(['data/userdata1.parquet'])")
   expect_true(inherits(tab1, "tbl_duckdb_connection"))
   expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 1000)
 
-  tab2 <- tbl_query(con, "'data/userdata1.parquet'")
+  tab2 <- tbl_function(con, "'data/userdata1.parquet'")
   expect_true(inherits(tab2, "tbl_duckdb_connection"))
   expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 1000)
 
-  tab3 <- tbl_query(con, "parquet_scan(['data/userdata1.parquet'])")
+  tab3 <- tbl_function(con, "parquet_scan(['data/userdata1.parquet'])")
   expect_true(inherits(tab3, "tbl_duckdb_connection"))
   expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000)
 })
@@ -69,7 +69,7 @@ test_that("Object cache can be enabled for parquet files with dplyr::tbl()", {
   expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false")
 })
 
-test_that("Object cache can be enabled for parquet files with tbl_file() and tbl_query()", {
+test_that("Object cache can be enabled for parquet files with tbl_file() and tbl_function()", {
   skip_if_not_installed("dbplyr")
   # https://github.com/tidyverse/dbplyr/issues/1384
   skip_if(packageVersion("dbplyr") >= "2.4.0")
@@ -82,7 +82,7 @@ test_that("Object cache can be enabled for parquet files with tbl_file() and tbl
   expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "true")
 
   DBI::dbExecute(con, "SET enable_object_cache=False;")
-  tab2 <- tbl_query(con, "'data/userdata1.parquet'", cache = FALSE)
+  tab2 <- tbl_function(con, "'data/userdata1.parquet'", cache = FALSE)
   expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false")
 })
 
@@ -108,7 +108,7 @@ test_that("CSV files can be registered with dplyr::tbl()", {
   expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150)
 })
 
-test_that("CSV files can be registered with tbl_file() and tbl_query()", {
+test_that("CSV files can be registered with tbl_file() and tbl_function()", {
   skip_if_not_installed("dbplyr")
 
   path <- file.path(tempdir(), "duckdbtest.csv")
@@ -122,7 +122,7 @@ test_that("CSV files can be registered with tbl_file() and tbl_query()", {
   expect_true(inherits(tab1, "tbl_duckdb_connection"))
   expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 150)
 
-  tab2 <- tbl_query(con, paste0("read_csv_auto('", path, "')"))
+  tab2 <- tbl_function(con, paste0("read_csv_auto('", path, "')"))
   expect_true(inherits(tab2, "tbl_duckdb_connection"))
   expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150)
 })
@@ -141,13 +141,13 @@ test_that("Other replacement scans or functions can be registered with dplyr::tb
   expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1)
 })
 
-test_that("Other replacement scans or functions can be registered with tbl_query()", {
+test_that("Other replacement scans or functions can be registered with tbl_function()", {
   skip_if_not_installed("dbplyr")
 
   con <- DBI::dbConnect(duckdb())
   on.exit(DBI::dbDisconnect(con, shutdown = TRUE))
 
-  obj <- tbl_query(con, "duckdb_keywords()")
+  obj <- tbl_function(con, "duckdb_keywords()")
   expect_true(inherits(obj, "tbl_duckdb_connection"))
   expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1)
 })