Prepare release (#673)

mlr-org · Aug 5, 2021 · 0df584c · 0df584c
1 parent 30b402d
commit 0df584c
Show file tree

Hide file tree

Showing 15 changed files with 96 additions and 87 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: mlr3
 Title: Machine Learning in R - Next Generation
-Version: 0.11.0-9000
+Version: 0.12.0
 Authors@R:
     c(person(given = "Michel",
              family = "Lang",

diff --git a/NEWS.md b/NEWS.md
@@ -1,21 +1,28 @@
-# mlr3 0.11.0-9000
+# mlr3 0.12.0
 
-* New method to label columns in tasks: `Task$label()`.
+* New method to assign labels to columns in tasks: `Task$label()`.
+  These will be used in visualizations in the future.
 * New method to add stratification variables: `Task$add_strata()`.
 * New helper function `partition()` to split a task into a training and test
   set.
 * New standardized getter `loglik()` for class `Learner`.
 * New measures `"aic"` and `"bic"` to compute the Akaike Information Criterion
   or the Bayesian Information Criterion, respectively.
-* New Resampling method: `ResamplingCustomCV`.
+* New Resampling method: `ResamplingCustomCV`. Creates a custom resampling split
+  based on the levels of a user-provided factor variable.
 * New argument `encapsulate` for `resample()` and `benchmark()` to conveniently
-  enable encapsulation and also set the fallback learner to the respective
+  enable encapsulation and also set the fallback learner to the
   featureless learner. This is simply for convenience, configuring each learner
   individually is still possible and allows a more fine-grained control (#634,
   #642).
+* New field `parallel_predict` for `Learner` to enable parallel predictions via
+  the future backend. This currently is only enabled while calling the
+  `$predict()` or `$predict_newdata()` methods and is disabled during `resample()`
+  and `benchmark()` where you have other means to parallelize.
 * Deprecated public (and already documented as internal) field `$data` in
   `ResampleResult` and `BenchmarkResult` to simplify the API and avoid
-  confusion. The converter `as.data.table()` can be used instead.
+  confusion. The converter `as.data.table()` can be used instead to access the
+  internal data.
 * Measures now have formal hyperparameters. A popular example where this is
   required is the F1 score, now implemented with customizable `beta`.
 * Changed default of argument `ordered` in `Task$data()` from `TRUE` to `FALSE`.

diff --git a/R/Learner.R b/R/Learner.R
@@ -395,7 +395,8 @@ Learner = R6Class("Learner",
     #' @template field_hash
     hash = function(rhs) {
       assert_ro_binding(rhs)
-      calculate_hash(class(self), self$id, self$param_set$values, private$.predict_type, self$fallback$hash)
+      calculate_hash(class(self), self$id, self$param_set$values, private$.predict_type,
+        self$fallback$hash, self$parallel_predict)
     },
 
     #' @field phash (`character(1)`)\cr

diff --git a/R/Task.R b/R/Task.R
@@ -583,29 +583,6 @@ Task = R6Class("Task",
       setnames(strata, sprintf("..stratum_%s", cols))
       self$cbind(strata)
       self$set_col_roles(names(strata), roles = "stratum")
-    },
-
-
-    #' @description
-    #' Assigns `labels` (prettier formated names) to columns `cols`.
-    #' Internally updates the column `label` of the table in field `col_info` by reference.
-    #'
-    #' @param cols (`character()`)\cr
-    #'   Column identifiers to label.
-    #' @param labels (`character()`)\cr
-    #'   New labels. Will be repeated to match the length of `cols`.
-    #'   Set to `NA` to remove a label.
-    #'
-    #' @return Modified `self`.
-    label = function(cols, labels) {
-      assert_character(cols, any.missing = FALSE, unique = TRUE)
-      assert_character(labels)
-      assert_subset(cols, self$col_info$id)
-      labels = rep_len(as.character(labels), length(cols))
-
-      self$col_info[list(cols), "label" := labels, on = "id"]
-
-      invisible(self)
     }
   ),
 
@@ -842,6 +819,38 @@ Task = R6Class("Task",
       }
       data = self$backend$data(private$.row_roles$use, c(self$backend$primary_key, weight_cols))
       setnames(data, c("row_id", "weight"))[]
+    },
+
+
+    #' @field labels (named `character()`)\cr
+    #'   Retrieve `labels` (prettier formatted names) from columns.
+    #'   Internally queries the column `label` of the table in field `col_info`.
+    #'   The names of the vector are the column ids, the values are the labels.
+    #'
+    #'   Assigning to this field updates the task by reference.
+    #'   You have to provide a character vector of labels, named with column ids.
+    #'   To remove a label, set it to `NA`.
+    #'   Alternatively, you can provide a [data.frame()] with the two columns
+    #'   `"id"` and `"label"`.
+    labels = function(rhs) {
+      active = union(self$target_names, self$feature_names) # only target and feature columns can be read or labeled here
+
+      if (missing(rhs)) { # read access: return the current labels
+        tab = self$col_info[list(active), c("id", "label"), on = "id", nomatch = NULL, with = FALSE] # nomatch = NULL drops ids without a row in col_info
+        return(set_names(tab[["label"]], tab[["id"]]))
+      }
+
+      if (is.data.frame(rhs)) { # convert to named character
+        assert_data_frame(rhs, ncols = 2L)
+        assert_names(names(rhs), permutation.of = c("id", "label"))
+        rhs = set_names(rhs[["label"]], rhs[["id"]])
+      }
+
+      assert_names(names(rhs), type = "unique")
+      assert_subset(names(rhs), active) # reject ids that are neither target nor feature
+      self$col_info[list(names(rhs)), "label" := rhs, on = "id"] # join on id and overwrite `label` in place
+
+      invisible(self)
   ),
 

diff --git a/R/partition.R b/R/partition.R
@@ -19,8 +19,14 @@
 #' # regression task
 #' task = tsk("boston_housing")
 #'
-#' # roughly equal size split with stratification
-#' str(partition(task, ratio = 0.5))
+#' # roughly equal size split while stratifying on the binned response
+#' split = partition(task, ratio = 0.5)
+#' data = data.frame(
+#'   y = c(task$truth(split$train), task$truth(split$test)),
+#'   split = rep(c("train", "test"), lengths(split))
+#' )
+#' boxplot(y ~ split, data = data)
+#'
 #'
 #' # classification task
 #' task = tsk("pima")

diff --git a/R/zzz.R b/R/zzz.R
@@ -14,7 +14,7 @@
 #' @section Learn mlr3:
 #' * Book on mlr3: \url{https://mlr3book.mlr-org.com}
 #' * Use cases and examples gallery: \url{https://mlr3gallery.mlr-org.com}
-#' * Cheat Sheets: \url{https://cheatsheets.mlr-org.com}
+#' * Cheat Sheets: \url{https://github.com/mlr-org/mlr3cheatsheets}
 #'
 #' @section mlr3 extensions:
 #' * Preprocessing and machine learning pipelines: \CRANpkg{mlr3pipelines}

diff --git a/README.md b/README.md
@@ -41,10 +41,11 @@ Status](https://www.r-pkg.org/badges/version-ago/mlr3)](https://cran.r-project.o
         status](https://mlr3extralearners.mlr-org.com/articles/learners/learner_status.html)
         to see their build status
 -   **Cheatsheets**
-    -   [Overview of cheatsheets](https://cheatsheets.mlr-org.com)
+    -   [Overview of cheatsheets](https://github.com/mlr-org/mlr3cheatsheets)
     -   [mlr3](https://cheatsheets.mlr-org.com/mlr3.pdf)
     -   [mlr3tuning](https://cheatsheets.mlr-org.com/mlr3tuning.pdf)
     -   [mlr3pipelines](https://cheatsheets.mlr-org.com/mlr3pipelines.pdf)
+    -   [mlr3fselect](https://cheatsheets.mlr-org.com/mlr3fselect.pdf)
 -   **Videos**:
     -   [useR2019 talk on
         mlr3](https://www.youtube.com/watch?v=wsP2hiFnDQs)
@@ -149,7 +150,7 @@ measure <- msr("classif.acc")
 prediction$score(measure)
 ```
 
-    ## classif.acc 
+    ## classif.acc
     ##   0.9130435
 
 ### Resample
@@ -178,7 +179,7 @@ rr$score(measure)
 rr$aggregate(measure)
 ```
 
-    ## classif.acc 
+    ## classif.acc
     ##    0.918688
 
 ## Extension Packages

diff --git a/man/Task.Rd b/man/Task.Rd
diff --git a/man/TaskClassif.Rd b/man/TaskClassif.Rd
diff --git a/man/TaskRegr.Rd b/man/TaskRegr.Rd
diff --git a/man/TaskSupervised.Rd b/man/TaskSupervised.Rd
diff --git a/man/TaskUnsupervised.Rd b/man/TaskUnsupervised.Rd
diff --git a/man/mlr3-package.Rd b/man/mlr3-package.Rd
diff --git a/man/partition.Rd b/man/partition.Rd
diff --git a/tests/testthat/test_Task.R b/tests/testthat/test_Task.R
@@ -405,26 +405,25 @@ test_that("$add_strata", {
 test_that("column labels", {
   task = tsk("iris")
   expect_character(task$col_info$label)
+  expect_true(allMissing(task$col_info$label))
+  expect_true(allMissing(task$labels))
 
-  labels = c("pl", "pw", "sl", "sw", "species")
-  task$col_info$label = c(NA, labels)
+  task$labels = c(Species = "sp")
+  expect_equal(task$labels[["Species"]], "sp")
+  expect_equal(count_missing(task$labels), 4L)
 
-  task$rbind(iris[1, , drop = FALSE])
-  expect_names(na.omit(task$col_info$label), permutation.of = labels)
+  fn = task$feature_names
+  task$labels = set_names(toupper(fn), fn)
+  expect_equal(unname(task$labels), c("sp", toupper(fn)))
 
-  task$cbind(data.frame(foo = 1:151))
-  task$col_info
-  expect_names(na.omit(task$col_info$label), permutation.of = labels)
+  expect_error({ task$labels = c(foo = "as") }, "names")
 
+  dt = data.table(id = c(task$target_names, task$feature_names))
+  dt$label = tolower(dt$id)
 
-  task = tsk("iris")
-  task$label("Petal.Length", "pl")
-  expect_equal(task$col_info["Petal.Length", label], "pl")
-
-  task$label(c("Sepal.Length", "Sepal.Width"), c("sl", "sw"))
-  expect_equal(task$col_info["Sepal.Length", label], "sl")
-  expect_equal(task$col_info["Sepal.Width", label], "sw")
-
-  task$label("Petal.Length", NA)
-  expect_equal(task$col_info["Petal.Length", label], NA_character_)
+  task$labels = dt
+  expect_equal(
+    unname(task$labels),
+    tolower(c(task$target_names, task$feature_names))
+  )
 })