From 89544ffafa789eba982ef42197c45f1beedb476f Mon Sep 17 00:00:00 2001 From: Marc Becker <33069354+be-marc@users.noreply.github.com> Date: Fri, 20 Dec 2024 16:53:15 +0100 Subject: [PATCH] refactor: allow utf8 in column names by default (#1234) --- NEWS.md | 2 ++ R/DataBackendRename.R | 2 +- R/Task.R | 10 +++------- R/helper.R | 5 ----- R/zzz.R | 4 ---- inst/testthat/helper_autotest.R | 2 -- man/mlr3-package.Rd | 4 ---- tests/testthat/test_Task.R | 9 --------- 8 files changed, 6 insertions(+), 32 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6992b98cd..69e7292de 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # mlr3 (development version) +* Column names with UTF-8 characters are now allowed by default. +The option `mlr3.allow_utf8_names` is removed. * BREAKING CHANGE: `Learner$predict_types` is read-only now. * docs: Clear up behavior of `Learner$predict_type` after training. diff --git a/R/DataBackendRename.R b/R/DataBackendRename.R index 81617a283..d4838d5e1 100644 --- a/R/DataBackendRename.R +++ b/R/DataBackendRename.R @@ -9,7 +9,7 @@ DataBackendRename = R6Class("DataBackendRename", inherit = DataBackend, cloneabl assert_character(old, any.missing = FALSE, unique = TRUE) assert_subset(old, b$colnames) assert_character(new, any.missing = FALSE, len = length(old)) - assert_names(new, if (allow_utf8_names()) "unique" else "strict") + assert_names(new, "unique") ii = old != new old = old[ii] diff --git a/R/Task.R b/R/Task.R index a081f15ca..0f077c477 100644 --- a/R/Task.R +++ b/R/Task.R @@ -128,13 +128,9 @@ Task = R6Class("Task", cn = self$backend$colnames rn = self$backend$rownames - if (allow_utf8_names()) { - assert_names(cn, "unique", .var.name = "column names") - if (any(grepl("%", cn, fixed = TRUE))) { - stopf("Column names may not contain special character '%%'") - } - } else { - assert_names(cn, "strict", .var.name = "column names") + assert_names(cn, "unique", .var.name = "column names") + if (any(grepl("%", cn, fixed = TRUE))) { + stopf("Column names 
may not contain special character '%%'") } self$col_info = col_info(self$backend) diff --git a/R/helper.R b/R/helper.R index b27ecb192..39460eb08 100644 --- a/R/helper.R +++ b/R/helper.R @@ -4,11 +4,6 @@ translate_types = function(x) { factor(map_values(x, r_types, p_types), levels = p_types) } - -allow_utf8_names = function() { - isTRUE(getOption("mlr3.allow_utf8_names")) -} - get_featureless_learner = function(task_type) { if (!is.na(task_type)) { id = paste0(task_type, ".featureless") diff --git a/R/zzz.R b/R/zzz.R index b34c219ba..84f0077b3 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -50,10 +50,6 @@ #' * `"mlr3.debug"`: If set to `TRUE`, parallelization via \CRANpkg{future} is disabled to simplify #' debugging and provide more concise tracebacks. #' Note that results computed in debug mode use a different seeding mechanism and are **not reproducible**. -#' * `"mlr3.allow_utf8_names"`: If set to `TRUE`, checks on the feature names are relaxed, allowing -#' non-ascii characters in column names. This is an experimental and temporal option to -#' pave the way for text analysis, and will likely be removed in a future version of the package. -#' analysis. #' * `"mlr3.warn_version_mismatch"`: Set to `FALSE` to silence warnings raised during predict if a learner has been #' trained with a different version version of mlr3. 
#' diff --git a/inst/testthat/helper_autotest.R b/inst/testthat/helper_autotest.R index 377e7d152..f1f647b34 100644 --- a/inst/testthat/helper_autotest.R +++ b/inst/testthat/helper_autotest.R @@ -81,8 +81,6 @@ generate_generic_tasks = function(learner, proto) { # task with non-ascii feature names if (p > 0L) { - opts = options(mlr3.allow_utf8_names = TRUE) - on.exit(options(opts)) sel = proto$feature_types[list(learner$feature_types), "id", on = "type", with = FALSE, nomatch = NULL][[1L]] tasks$utf8_feature_names = proto$clone(deep = TRUE)$select(sel) old = sel[1L] diff --git a/man/mlr3-package.Rd b/man/mlr3-package.Rd index fb5185e56..9c4f5f689 100644 --- a/man/mlr3-package.Rd +++ b/man/mlr3-package.Rd @@ -63,10 +63,6 @@ parallelization with \CRANpkg{future}. Defaults to 1. \item \code{"mlr3.debug"}: If set to \code{TRUE}, parallelization via \CRANpkg{future} is disabled to simplify debugging and provide more concise tracebacks. Note that results computed in debug mode use a different seeding mechanism and are \strong{not reproducible}. -\item \code{"mlr3.allow_utf8_names"}: If set to \code{TRUE}, checks on the feature names are relaxed, allowing -non-ascii characters in column names. This is an experimental and temporal option to -pave the way for text analysis, and will likely be removed in a future version of the package. -analysis. \item \code{"mlr3.warn_version_mismatch"}: Set to \code{FALSE} to silence warnings raised during predict if a learner has been trained with a different version version of mlr3. 
} diff --git a/tests/testthat/test_Task.R b/tests/testthat/test_Task.R index b1005c81d..880b2726b 100644 --- a/tests/testthat/test_Task.R +++ b/tests/testthat/test_Task.R @@ -552,15 +552,6 @@ test_that("set_levels", { }) test_that("special chars in feature names (#697)", { - prev = options(mlr3.allow_utf8_names = FALSE) - on.exit(options(prev)) - - expect_error( - TaskRegr$new("test", data.table(`%^` = 1:3, t = 3:1), target = "t"), - "comply" - ) - options(mlr3.allow_utf8_names = TRUE) - expect_error( TaskRegr$new("test", data.table(`%asd` = 1:3, t = 3:1), target = "t") ,