From 89544ffafa789eba982ef42197c45f1beedb476f Mon Sep 17 00:00:00 2001
From: Marc Becker <33069354+be-marc@users.noreply.github.com>
Date: Fri, 20 Dec 2024 16:53:15 +0100
Subject: [PATCH] refactor: allow UTF-8 in column names by default (#1234)

---
 NEWS.md                         |  2 ++
 R/DataBackendRename.R           |  2 +-
 R/Task.R                        | 10 +++-------
 R/helper.R                      |  5 -----
 R/zzz.R                         |  4 ----
 inst/testthat/helper_autotest.R |  2 --
 man/mlr3-package.Rd             |  4 ----
 tests/testthat/test_Task.R      |  9 ---------
 8 files changed, 6 insertions(+), 32 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 6992b98cd..69e7292de 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,7 @@
 # mlr3 (development version)
 
+* Column names with UTF-8 characters are now allowed by default.
+  The option `mlr3.allow_utf8_names` has been removed.
 * BREAKING CHANGE: `Learner$predict_types` is read-only now.
 * docs: Clear up behavior of `Learner$predict_type` after training.
 
diff --git a/R/DataBackendRename.R b/R/DataBackendRename.R
index 81617a283..d4838d5e1 100644
--- a/R/DataBackendRename.R
+++ b/R/DataBackendRename.R
@@ -9,7 +9,7 @@ DataBackendRename = R6Class("DataBackendRename", inherit = DataBackend, cloneabl
       assert_character(old, any.missing = FALSE, unique = TRUE)
       assert_subset(old, b$colnames)
       assert_character(new, any.missing = FALSE, len = length(old))
-      assert_names(new, if (allow_utf8_names()) "unique" else "strict")
+      assert_names(new, "unique")
 
       ii = old != new
       old = old[ii]
diff --git a/R/Task.R b/R/Task.R
index a081f15ca..0f077c477 100644
--- a/R/Task.R
+++ b/R/Task.R
@@ -128,13 +128,9 @@ Task = R6Class("Task",
       cn = self$backend$colnames
       rn = self$backend$rownames
 
-      if (allow_utf8_names()) {
-        assert_names(cn, "unique", .var.name = "column names")
-        if (any(grepl("%", cn, fixed = TRUE))) {
-          stopf("Column names may not contain special character '%%'")
-        }
-      } else {
-        assert_names(cn, "strict", .var.name = "column names")
+      assert_names(cn, "unique", .var.name = "column names")
+      if (any(grepl("%", cn, fixed = TRUE))) {
+        stopf("Column names may not contain special character '%%'")
       }
 
       self$col_info = col_info(self$backend)
diff --git a/R/helper.R b/R/helper.R
index b27ecb192..39460eb08 100644
--- a/R/helper.R
+++ b/R/helper.R
@@ -4,11 +4,6 @@ translate_types = function(x) {
   factor(map_values(x, r_types, p_types), levels = p_types)
 }
 
-
-allow_utf8_names = function() {
-  isTRUE(getOption("mlr3.allow_utf8_names"))
-}
-
 get_featureless_learner = function(task_type) {
   if (!is.na(task_type)) {
     id = paste0(task_type, ".featureless")
diff --git a/R/zzz.R b/R/zzz.R
index b34c219ba..84f0077b3 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -50,10 +50,6 @@
 #' * `"mlr3.debug"`: If set to `TRUE`, parallelization via \CRANpkg{future} is disabled to simplify
 #'   debugging and provide more concise tracebacks.
 #'   Note that results computed in debug mode use a different seeding mechanism and are **not reproducible**.
-#' * `"mlr3.allow_utf8_names"`: If set to `TRUE`, checks on the feature names are relaxed, allowing
-#'   non-ascii characters in column names. This is an experimental and temporal option to
-#'   pave the way for text analysis, and will likely be removed in a future version of the package.
-#'   analysis.
 #' * `"mlr3.warn_version_mismatch"`: Set to `FALSE` to silence warnings raised during predict if a learner has been
 #'   trained with a different version version of mlr3.
 #'
diff --git a/inst/testthat/helper_autotest.R b/inst/testthat/helper_autotest.R
index 377e7d152..f1f647b34 100644
--- a/inst/testthat/helper_autotest.R
+++ b/inst/testthat/helper_autotest.R
@@ -81,8 +81,6 @@ generate_generic_tasks = function(learner, proto) {
 
   # task with non-ascii feature names
   if (p > 0L) {
-    opts = options(mlr3.allow_utf8_names = TRUE)
-    on.exit(options(opts))
     sel = proto$feature_types[list(learner$feature_types), "id", on = "type", with = FALSE, nomatch = NULL][[1L]]
     tasks$utf8_feature_names = proto$clone(deep = TRUE)$select(sel)
     old = sel[1L]
diff --git a/man/mlr3-package.Rd b/man/mlr3-package.Rd
index fb5185e56..9c4f5f689 100644
--- a/man/mlr3-package.Rd
+++ b/man/mlr3-package.Rd
@@ -63,10 +63,6 @@ parallelization with \CRANpkg{future}. Defaults to 1.
 \item \code{"mlr3.debug"}: If set to \code{TRUE}, parallelization via \CRANpkg{future} is disabled to simplify
 debugging and provide more concise tracebacks.
 Note that results computed in debug mode use a different seeding mechanism and are \strong{not reproducible}.
-\item \code{"mlr3.allow_utf8_names"}: If set to \code{TRUE}, checks on the feature names are relaxed, allowing
-non-ascii characters in column names. This is an experimental and temporal option to
-pave the way for text analysis, and will likely be removed in a future version of the package.
-analysis.
 \item \code{"mlr3.warn_version_mismatch"}: Set to \code{FALSE} to silence warnings raised during predict if a learner has been
 trained with a different version version of mlr3.
 }
diff --git a/tests/testthat/test_Task.R b/tests/testthat/test_Task.R
index b1005c81d..880b2726b 100644
--- a/tests/testthat/test_Task.R
+++ b/tests/testthat/test_Task.R
@@ -552,15 +552,6 @@ test_that("set_levels", {
 })
 
 test_that("special chars in feature names (#697)", {
-  prev = options(mlr3.allow_utf8_names = FALSE)
-  on.exit(options(prev))
-
-  expect_error(
-    TaskRegr$new("test", data.table(`%^` = 1:3, t = 3:1), target = "t"),
-    "comply"
-  )
-  options(mlr3.allow_utf8_names = TRUE)
-
   expect_error(
     TaskRegr$new("test", data.table(`%asd` = 1:3, t = 3:1), target = "t")
     ,