From 72787a7684f6c16bdc70eea6c72f14102f9c5e7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Fri, 16 Aug 2024 22:04:54 +0200 Subject: [PATCH 1/2] feat!: Rename `tbl_query()` to `tbl_function()`, remove `translate_duckdb()` --- NAMESPACE | 2 +- R/backend-dbplyr__duckdb_connection.R | 74 ++++++++++++-------- man/backend-duckdb.Rd | 46 ++++++++---- tests/testthat/test-tbl__duckdb_connection.R | 20 +++--- 4 files changed, 87 insertions(+), 55 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 52648629b..8eaec1966 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,8 +24,8 @@ export(duckdb_unregister_arrow) export(read_csv_duckdb) export(simulate_duckdb) export(tbl_file) +export(tbl_function) export(tbl_query) -export(translate_duckdb) exportClasses(duckdb_connection) exportClasses(duckdb_driver) exportClasses(duckdb_explain) diff --git a/R/backend-dbplyr__duckdb_connection.R b/R/backend-dbplyr__duckdb_connection.R index b946dbfe1..9512db8f0 100644 --- a/R/backend-dbplyr__duckdb_connection.R +++ b/R/backend-dbplyr__duckdb_connection.R @@ -11,35 +11,26 @@ #' library(dplyr, warn.conflicts = FALSE) #' con <- DBI::dbConnect(duckdb(), path = ":memory:") #' -#' dbiris <- copy_to(con, iris, overwrite = TRUE) +#' db <- copy_to(con, data.frame(a = 1:3, b = letters[2:4])) #' -#' dbiris %>% -#' select(Petal.Length, Petal.Width) %>% -#' filter(Petal.Length > 1.5) %>% -#' head(5) +#' db %>% +#' filter(a > 1) %>% +#' select(b) +#' +#' path <- tempfile(fileext = ".csv") +#' write.csv(data.frame(a = 1:3, b = letters[2:4]), path, row.names = FALSE) +#' +#' db_csv <- tbl_file(con, path) +#' db_csv %>% +#' summarize(sum_a = sum(a)) +#' +#' db_csv_fun <- tbl_function(con, paste0("read_csv_auto('", path, "')")) +#' db_csv_fun %>% +#' count() #' #' DBI::dbDisconnect(con, shutdown = TRUE) NULL -#' Connection object for simulation of the SQL generation without actual database. -#' dbplyr overrides database specific identifier and string quotes -#' @param ... 
Any parameters to be forwarded -#' @export -#' @rdname backend-duckdb -simulate_duckdb <- function(...) { - structure(list(), ..., class = c("duckdb_connection", "TestConnection", "DBIConnection")) -} - -#' Connection object for simulation of the SQL generation without actual database. -#' This version keeps the database specific identifier and string quotes, i.e. -#' allows to translate to DuckDB SQL dialect. -#' @param ... Any parameters to be forwarded -#' @export -#' @rdname backend-duckdb -translate_duckdb <- function(...) { - structure(list(), ..., class = c("duckdb_connection", "DBIConnection")) -} - # Declare which version of dbplyr API is being called. # @param con A \code{\link{dbConnect}} object, as returned by \code{dbConnect()} # @name dbplyr_edition @@ -437,7 +428,7 @@ tbl.duckdb_connection <- function(src, from, ..., cache = FALSE) { NextMethod("tbl") } -#' Create a lazy table from a Parquet or SQL file +#' Create a lazy table from a Parquet file or SQL query #' #' `tbl_file()` is an experimental variant of [dplyr::tbl()] to directly access files on disk. #' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the request, @@ -458,26 +449,51 @@ tbl_file <- function(src, path, ..., cache = FALSE) { if (grepl("'", path)) { stop("File '", path, "' contains a single quote, this is not supported", call. = FALSE) } - tbl_query(src, paste0("'", path, "'"), cache = cache) + tbl_function(src, paste0("'", path, "'"), cache = cache) } #' Create a lazy table from a query #' -#' `tbl_query()` is an experimental variant of [dplyr::tbl()] +#' @description +#' `tbl_function()` is an experimental variant of [dplyr::tbl()] #' to create a lazy table from a table-generating function, #' useful for reading nonstandard CSV files or other data sources. #' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the query. -#' Use `dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))` for custom SQL queries. 
#' See <https://duckdb.org/docs/data/overview> for details on data importing functions. #' +#' As an alternative, use `dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))` for custom SQL queries. +#' #' @param query SQL code, omitting the `FROM` clause #' @export #' @rdname backend-duckdb -tbl_query <- function(src, query, ..., cache = FALSE) { +tbl_function <- function(src, query, ..., cache = FALSE) { if (cache) DBI::dbExecute(src, "PRAGMA enable_object_cache") table <- dplyr::sql(paste0("FROM ", query)) dplyr::tbl(src, table) } +#' Deprecated +#' +#' `tbl_query()` is deprecated in favor of `tbl_function()`. +#' @export +#' @rdname backend-duckdb +tbl_query <- function(src, query, ...) { + .Deprecated("tbl_function") + tbl_function(src, query, ...) +} + +#' Connection object for simulation of the SQL generation without actual database. +#' dbplyr overrides database specific identifier and string quotes +#' +#' Use `simulate_duckdb()` with `lazy_frame()` +#' to see simulated SQL without opening a DuckDB connection. +#' @param ... Any parameters to be forwarded +#' @export +#' @rdname backend-duckdb +simulate_duckdb <- function(...) 
{ + structure(list(), ..., class = c("duckdb_connection", "TestConnection", "DBIConnection")) +} + + # Needed to suppress the R CHECK notes (due to the use of sql_expr) utils::globalVariables(c("REGEXP_MATCHES", "CAST", "%AS%", "INTEGER", "XOR", "%<<%", "%>>%", "LN", "LOG", "ROUND", "ROUND_EVEN", "EXTRACT", "%FROM%", "MONTH", "STRFTIME", "QUARTER", "YEAR", "DATE_TRUNC", "DATE", "DOY", "TO_SECONDS", "BIGINT", "TO_MINUTES", "TO_HOURS", "TO_DAYS", "TO_WEEKS", "TO_MONTHS", "TO_YEARS", "STRPOS", "NOT", "REGEXP_REPLACE", "TRIM", "LPAD", "RPAD", "%||%", "REPEAT", "LENGTH", "STRING_AGG", "GREATEST", "LIST_EXTRACT", "LOG10", "LOG2", "STRING_SPLIT_REGEX", "FLOOR", "FMOD", "FDIV")) diff --git a/man/backend-duckdb.Rd b/man/backend-duckdb.Rd index 904b85b95..8fdd06fe1 100644 --- a/man/backend-duckdb.Rd +++ b/man/backend-duckdb.Rd @@ -1,27 +1,27 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/backend-dbplyr__duckdb_connection.R \name{backend-duckdb} -\alias{simulate_duckdb} -\alias{translate_duckdb} \alias{tbl_file} +\alias{tbl_function} \alias{tbl_query} +\alias{simulate_duckdb} \title{DuckDB SQL backend for dbplyr} \usage{ -simulate_duckdb(...) +tbl_file(src, path, ..., cache = FALSE) -translate_duckdb(...) +tbl_function(src, query, ..., cache = FALSE) -tbl_file(src, path, ..., cache = FALSE) +tbl_query(src, query, ...) -tbl_query(src, query, ..., cache = FALSE) +simulate_duckdb(...) } \arguments{ -\item{...}{Any parameters to be forwarded} - \item{src}{A duckdb connection object} \item{path}{Path to existing Parquet, CSV or JSON file} +\item{...}{Any parameters to be forwarded} + \item{cache}{Enable object cache for Parquet files} \item{query}{SQL code, omitting the \code{FROM} clause} @@ -35,24 +35,40 @@ contains more mapped functions. It is safer than \code{dplyr::tbl()} because there is no risk of misinterpreting the request, and paths with special characters are supported. 
-\code{tbl_query()} is an experimental variant of \code{\link[dplyr:tbl]{dplyr::tbl()}} +\code{tbl_function()} is an experimental variant of \code{\link[dplyr:tbl]{dplyr::tbl()}} to create a lazy table from a table-generating function, useful for reading nonstandard CSV files or other data sources. It is safer than \code{dplyr::tbl()} because there is no risk of misinterpreting the query. -Use \code{dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))} for custom SQL queries. See \url{https://duckdb.org/docs/data/overview} for details on data importing functions. + +As an alternative, use \code{dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))} for custom SQL queries. + +\code{tbl_query()} is deprecated in favor of \code{tbl_function()}. + +Use \code{simulate_duckdb()} with \code{lazy_frame()} +to see simulated SQL without opening a DuckDB connection. } \examples{ \dontshow{if (duckdb:::TEST_RE2 && rlang::is_installed("dbplyr")) withAutoprint(\{ # examplesIf} library(dplyr, warn.conflicts = FALSE) con <- DBI::dbConnect(duckdb(), path = ":memory:") -dbiris <- copy_to(con, iris, overwrite = TRUE) +db <- copy_to(con, data.frame(a = 1:3, b = letters[2:4])) + +db \%>\% + filter(a > 1) \%>\% + select(b) + +path <- tempfile(fileext = ".csv") +write.csv(data.frame(a = 1:3, b = letters[2:4]), path, row.names = FALSE) + +db_csv <- tbl_file(con, path) +db_csv \%>\% + summarize(sum_a = sum(a)) -dbiris \%>\% - select(Petal.Length, Petal.Width) \%>\% - filter(Petal.Length > 1.5) \%>\% - head(5) +db_csv_fun <- tbl_function(con, paste0("read_csv_auto('", path, "')")) +db_csv_fun \%>\% + count() DBI::dbDisconnect(con, shutdown = TRUE) \dontshow{\}) # examplesIf} diff --git a/tests/testthat/test-tbl__duckdb_connection.R b/tests/testthat/test-tbl__duckdb_connection.R index 522d172ab..09c56ecb1 100644 --- a/tests/testthat/test-tbl__duckdb_connection.R +++ b/tests/testthat/test-tbl__duckdb_connection.R @@ -28,7 +28,7 @@ test_that("Parquet files can be registered with dplyr::tbl()", { expect_true(tab3 %>% 
dplyr::count() %>% dplyr::collect() == 1000) }) -test_that("Parquet files can be registered with tbl_file() and tbl_query()", { +test_that("Parquet files can be registered with tbl_file() and tbl_function()", { skip_if_not_installed("dbplyr") con <- DBI::dbConnect(duckdb()) @@ -38,15 +38,15 @@ test_that("Parquet files can be registered with tbl_file() and tbl_query()", { expect_true(inherits(tab0, "tbl_duckdb_connection")) expect_true(tab0 %>% dplyr::count() %>% dplyr::collect() == 1000) - tab1 <- tbl_query(con, "read_parquet(['data/userdata1.parquet'])") + tab1 <- tbl_function(con, "read_parquet(['data/userdata1.parquet'])") expect_true(inherits(tab1, "tbl_duckdb_connection")) expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 1000) - tab2 <- tbl_query(con, "'data/userdata1.parquet'") + tab2 <- tbl_function(con, "'data/userdata1.parquet'") expect_true(inherits(tab2, "tbl_duckdb_connection")) expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 1000) - tab3 <- tbl_query(con, "parquet_scan(['data/userdata1.parquet'])") + tab3 <- tbl_function(con, "parquet_scan(['data/userdata1.parquet'])") expect_true(inherits(tab3, "tbl_duckdb_connection")) expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000) }) @@ -69,7 +69,7 @@ test_that("Object cache can be enabled for parquet files with dplyr::tbl()", { expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false") }) -test_that("Object cache can be enabled for parquet files with tbl_file() and tbl_query()", { +test_that("Object cache can be enabled for parquet files with tbl_file() and tbl_function()", { skip_if_not_installed("dbplyr") # https://github.com/tidyverse/dbplyr/issues/1384 skip_if(packageVersion("dbplyr") >= "2.4.0") @@ -82,7 +82,7 @@ test_that("Object cache can be enabled for parquet files with tbl_file() and tbl expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "true") 
DBI::dbExecute(con, "SET enable_object_cache=False;") - tab2 <- tbl_query(con, "'data/userdata1.parquet'", cache = FALSE) + tab2 <- tbl_function(con, "'data/userdata1.parquet'", cache = FALSE) expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false") }) @@ -108,7 +108,7 @@ test_that("CSV files can be registered with dplyr::tbl()", { expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150) }) -test_that("CSV files can be registered with tbl_file() and tbl_query()", { +test_that("CSV files can be registered with tbl_file() and tbl_function()", { skip_if_not_installed("dbplyr") path <- file.path(tempdir(), "duckdbtest.csv") @@ -122,7 +122,7 @@ test_that("CSV files can be registered with tbl_file() and tbl_query()", { expect_true(inherits(tab1, "tbl_duckdb_connection")) expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 150) - tab2 <- tbl_query(con, paste0("read_csv_auto('", path, "')")) + tab2 <- tbl_function(con, paste0("read_csv_auto('", path, "')")) expect_true(inherits(tab2, "tbl_duckdb_connection")) expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150) }) @@ -141,13 +141,13 @@ test_that("Other replacement scans or functions can be registered with dplyr::tb expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1) }) -test_that("Other replacement scans or functions can be registered with tbl_query()", { +test_that("Other replacement scans or functions can be registered with tbl_function()", { skip_if_not_installed("dbplyr") con <- DBI::dbConnect(duckdb()) on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) - obj <- tbl_query(con, "duckdb_keywords()") + obj <- tbl_function(con, "duckdb_keywords()") expect_true(inherits(obj, "tbl_duckdb_connection")) expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1) }) From 79b64625d26edbde36267a7ec85ec72d66044aec Mon Sep 17 00:00:00 2001 From: krlmlr 
Date: Fri, 16 Aug 2024 20:23:28 +0000 Subject: [PATCH 2/2] chore: Auto-update from GitHub Actions Run: https://github.com/duckdb/duckdb-r/actions/runs/10425747078 --- .github/dep-suggests-matrix.json | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/dep-suggests-matrix.json b/.github/dep-suggests-matrix.json index fc09c64f5..e69de29bb 100644 --- a/.github/dep-suggests-matrix.json +++ b/.github/dep-suggests-matrix.json @@ -1 +0,0 @@ -{"package":["adbcdrivermanager","arrow","bit64","clock","DBItest","dbplyr","dplyr"]}