From 97788de764ff12b32b29ef25b440b42494846689 Mon Sep 17 00:00:00 2001 From: Srikanth K S Date: Tue, 25 Jun 2024 19:33:04 +0530 Subject: [PATCH] fixed confidence issue, added ruleset class (#31) --- .Rbuildignore | 2 + .gitignore | 3 + DESCRIPTION | 4 +- NAMESPACE | 3 + R/package.R | 17 +-- R/rulelist.R | 81 ++++++----- R/ruleset.R | 109 +++++++++++++++ R/utils.R | 8 +- man/as_ruleset.Rd | 27 ++++ man/package_tidyrules.Rd | 17 +-- man/predict.rulelist.Rd | 10 +- man/predict.ruleset.Rd | 40 ++++++ man/print.rulelist.Rd | 4 +- man/print.ruleset.Rd | 31 +++++ man/ruleset.Rd | 9 ++ man/set_keys.Rd | 1 - man/set_validation_data.Rd | 1 - man/to_sql_case.Rd | 4 +- tests/testthat/test-rulelist.R | 4 +- vignettes/tidyrules_vignette.Rmd | 225 +------------------------------ 20 files changed, 306 insertions(+), 294 deletions(-) create mode 100644 R/ruleset.R create mode 100644 man/as_ruleset.Rd create mode 100644 man/predict.ruleset.Rd create mode 100644 man/print.ruleset.Rd create mode 100644 man/ruleset.Rd diff --git a/.Rbuildignore b/.Rbuildignore index 062ef56..e840dee 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,5 @@ +^renv$ +^renv\.lock$ ^\.travis\.yml$ ^.*\.Rproj$ ^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore index 5b6a065..e191e14 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ .Rhistory .RData .Ruserdata +.Rprofile +renv* +*.Rproj diff --git a/DESCRIPTION b/DESCRIPTION index 0944d41..7f38cc5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: tidyrules Type: Package Title: Utilities to Retrieve Rulelists from Model Fits, Filter, Prune, Reorder and Predict on unseen data -Version: 0.2.5 +Version: 0.2.6 Authors@R: c( person("Srikanth", "Komala Sheshachala", email = "sri.teach@gmail.com", role = c("aut", "cre")), person("Amith Kumar", "Ullur Raghavendra", email = "amith54@gmail.com", role = c("aut")) @@ -38,7 +38,7 @@ Suggests: knitr (>= 1.23), rmarkdown (>= 1.13), palmerpenguins (>= 0.1.1), -Description: Extract rules as a 
rulelist (a class based on dataframe) along with metrics per rule such as support, confidence, lift, RMSE, IQR. Rulelists can be augmented using validation data, manipulated using standard dataframe operations, rulelists can be used to predict on unseen data, prune them based on some metrics and reoder them to optimize them for a metric. Utilities include manually creating rulesets, exporting a rulelist to SQL syntax and so on. +Description: Provides a framework to work with decision rules. Rules can be extracted from supported models, augmented with (custom) metrics using validation data, manipulated using standard dataframe operations, reordered and pruned based on a metric, predict on unseen (test) data. Utilities include; Creating a rulelist manually, Exporting a rulelist as a SQL case statement and so on. The package offers two classes; rulelist and ruleset based on dataframe. URL: https://github.com/talegari/tidyrules BugReports: https://github.com/talegari/tidyrules/issues License: GPL-3 diff --git a/NAMESPACE b/NAMESPACE index ae56e71..b39b6d1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,8 +6,10 @@ S3method(calculate,rulelist) S3method(plot,prune_rulelist) S3method(plot,rulelist) S3method(predict,rulelist) +S3method(predict,ruleset) S3method(print,prune_rulelist) S3method(print,rulelist) +S3method(print,ruleset) S3method(prune,rulelist) S3method(reorder,rulelist) S3method(tidy,C5.0) @@ -15,6 +17,7 @@ S3method(tidy,constparty) S3method(tidy,cubist) S3method(tidy,rpart) export(as_rulelist) +export(as_ruleset) export(augment) export(calculate) export(convert_rule_flavor) diff --git a/R/package.R b/R/package.R index 0f2060c..2ba6858 100644 --- a/R/package.R +++ b/R/package.R @@ -6,14 +6,15 @@ #' @name package_tidyrules #' @title `tidyrules` #' @description `tidyrules` package provides a framework to work with decision -#' rules stored as a [rulelist] backed by a tidy dataframe. 
Rules can be -#' extracted from supported models using [tidy], augmented using validation data -#' by [augment][augment.rulelist], manipulated using standard dataframe -#' operations, (modified) rulelists can be used to [predict][predict.rulelist] -#' on unseen (test) data. Utilities include: Create a rulelist -#' manually ([as_rulelist][as_rulelist.data.frame]), Export a rulelist to SQL -#' ([to_sql_case]) and so on. -#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist] +#' rules. Rules can be extracted from supported models using [tidy], augmented +#' using validation data by [augment][augment.rulelist], manipulated using +#' standard dataframe operations, (modified) rulelists can be used to +#' [predict][predict.rulelist] on unseen (test) data. Utilities include: +#' Create a rulelist manually ([as_rulelist][as_rulelist.data.frame]), Export +#' a rulelist to SQL ([to_sql_case]) and so on. The package offers two +#' classes; [rulelist] and [ruleset] based on dataframe. +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], +#' [predict][predict.rulelist] #' @importFrom magrittr %>% #' @importFrom rlang %||% #' @importFrom data.table := diff --git a/R/rulelist.R b/R/rulelist.R index 1fb3f63..141f203 100644 --- a/R/rulelist.R +++ b/R/rulelist.R @@ -359,23 +359,27 @@ set_validation_data = function(x, validation_data, y_name, weight = 1){ #' @title Print method for [rulelist] class #' @description Prints [rulelist] attributes and first few rows. #' @param x A [rulelist] object +#' @param banner (flag, default: `TRUE`) Should the banner be displayed #' @param ... 
Passed to `tidytable::print` #' @return input [rulelist] (invisibly) #' @seealso [rulelist], [tidy], [augment][augment.rulelist], #' [predict][predict.rulelist], [calculate][calculate.rulelist], #' [prune][prune.rulelist], [reorder][reorder.rulelist] #' @export -print.rulelist = function(x, ...){ +print.rulelist = function(x, banner = TRUE, ...){ validate_rulelist(x) + rulelist = rlang::duplicate(x) - keys = attr(x, "keys") - estimation_type = attr(x, "estimation_type") - model_type = attr(x, "model_type") - validation_data = attr(x, "validation_data") + keys = attr(rulelist, "keys") + estimation_type = attr(rulelist, "estimation_type") + model_type = attr(rulelist, "model_type") + validation_data = attr(rulelist, "validation_data") - cli::cli_rule(left = "Rulelist") - cli::cli_text("") + if (banner) { + cli::cli_rule(left = "Rulelist") + cli::cli_text("") + } if (is.null(keys)) { cli::cli_alert_info("{.emph Keys}: {.strong NULL}") @@ -407,10 +411,13 @@ print.rulelist = function(x, ...){ cli::cli_text("") - class(x) = setdiff(class(x), "rulelist") - print(x, ...) - cli::cli_rule() - class(x) = c("rulelist", class(x)) + class(rulelist) = setdiff(class(rulelist), "rulelist") + # now 'rulelist' is a dataframe and not a 'rulelist' + print(rulelist, ...) + + if (banner) { + cli::cli_rule() + } return(invisible(x)) } @@ -706,20 +713,21 @@ predict_rulelist = function(rulelist, new_data){ #' @returns A dataframe. See **Details**. #' #' @details If a `row_nbr` is covered more than one `rule_nbr` per 'keys', then -#' `rule_nbr` appearing earlier (as in row order of the [rulelist]) takes -#' precedence. +#' `rule_nbr` appearing earlier (as in row order of the [rulelist]) takes +#' precedence. #' #' ## Output Format #' #' - When multiple is `FALSE`(default), output is a dataframe with three -#' or more columns: `row_number` (int), columns corresponding to 'keys', -#' `rule_nbr` (int). 
+#' or more columns: `row_number` (int), columns corresponding to 'keys', +#' `rule_nbr` (int). #' -#' - When multiple is `TRUE`(default), output is a tidytable/dataframe with three -#' or more columns: `row_number` (int), columns corresponding to 'keys', -#' `rule_nbr` (list column of integers). +#' - When multiple is `TRUE`, output is a dataframe with three +#' or more columns: `row_number` (int), columns corresponding to 'keys', +#' `rule_nbr` (list column of integers). #' -#' - If a row number and 'keys' combination is not covered by any rule, then `rule_nbr` column has missing value. +#' - If a row number and 'keys' combination is not covered by any rule, then +#' `rule_nbr` column has missing value. #' #' @examples #' model_c5 = C50::C5.0(species ~., @@ -740,7 +748,6 @@ predict_rulelist = function(rulelist, new_data){ #' [predict][predict.rulelist], [calculate][calculate.rulelist], #' [prune][prune.rulelist], [reorder][reorder.rulelist] #' @importFrom stats predict -#' @family Core Rulelist Utility #' @export #' predict.rulelist = function(object, new_data, multiple = FALSE, ...){ @@ -790,22 +797,23 @@ augment_class_no_keys = function(x, new_data, y_name, weight, ...){ mutate(prevalence = prevalence_0 / sum(prevalence_0)) %>% select(all_of(c(eval(y_name), "prevalence"))) + na_to_false = function(x) ifelse(is.na(x), FALSE, x) + aggregatees_df = new_data_with_rule_nbr %>% # bring 'prevalence' column left_join(prevalence_df,by = eval(y_name)) %>% summarise( support = sum(weight__, na.rm = TRUE), - confidence = weighted.mean(ifelse(is.na(eval(y_name) == RHS), FALSE, TRUE), - weight__, - na.rm = TRUE - ), - lift = weighted.mean(ifelse(is.na(eval(y_name) == RHS), FALSE, TRUE), - weight__, - na.rm = TRUE - ) / prevalence[1], + confidence = + ( as.character(.data[[y_name]]) == as.character(RHS) ) %>% + na_to_false() %>% + weighted.mean(weight__, na.rm = TRUE), + prevalence = prevalence[1], .by = rule_nbr ) %>% + mutate(lift = confidence / prevalence) %>% + 
select(-prevalence) %>% nest(.by = rule_nbr, .key = "augmented_stats") # output has all columns of 'tidy' along with 'augment_stats' @@ -858,23 +866,24 @@ augment_class_keys = function(x, new_data, y_name, weight, ...){ ) %>% select(all_of(c(keys, eval(y_name), "prevalence"))) + na_to_false = function(x) ifelse(is.na(x), FALSE, x) + # add aggregates at rule_nbr and 'keys' level aggregatees_df = new_data_with_rule_nbr %>% left_join(prevalence_df, by = c(keys, eval(y_name))) %>% summarise( support = sum(weight__, na.rm = TRUE), - confidence = weighted.mean(ifelse(is.na(eval(y_name) == RHS), FALSE, TRUE), - weight__, - na.rm = TRUE - ), - lift = weighted.mean(ifelse(is.na(eval(y_name) == RHS), FALSE, TRUE), - weight__, - na.rm = TRUE - ) / prevalence[1], + confidence = + ( as.character(.data[[y_name]]) == as.character(RHS) ) %>% + na_to_false() %>% + weighted.mean(weight__, na.rm = TRUE), + prevalence = prevalence[1], ..., .by = c(keys, "rule_nbr") ) %>% + mutate(lift = confidence / prevalence) %>% + select(-prevalence) %>% nest(.by = c("rule_nbr", keys), .key = "augmented_stats") # output has all columns of 'tidy' along with 'augment_stats' diff --git a/R/ruleset.R b/R/ruleset.R new file mode 100644 index 0000000..c5f2097 --- /dev/null +++ b/R/ruleset.R @@ -0,0 +1,109 @@ +#******************************************************************************* +# This is the part of the 'tidyrules' R package hosted at +# https://github.com/talegari/tidyrules with GPL-3 license. +#******************************************************************************* + +#' @name ruleset +#' @title Ruleset +#' @description ruleset class is a piggyback class that inherits [rulelist] +#' class for convenience of [print] and [predict] methods. 
+identity # just a placeholder for 'ruleset' documentation, not exported + +#' @name as_ruleset +#' @title Get a ruleset from a rulelist +#' @description Returns a ruleset object +#' @param rulelist A [rulelist] +#' @returns A [ruleset] +#' +#' @examples +#' model_class_party = partykit::ctree(species ~ ., +#' data = palmerpenguins::penguins +#' ) +#' as_ruleset(tidy(model_class_party)) +#' +#' @seealso [rulelist] +#' @export +as_ruleset = function(rulelist){ + + validate_rulelist(rulelist) + + x = rlang::duplicate(rulelist) + class(x) = c("ruleset", class(x)) + + return(x) +} + +#' @name print.ruleset +#' @title Print method for ruleset class +#' @description Prints the ruleset object +#' @param x A [ruleset] +#' @param banner (flag, default: `TRUE`) Should the banner be displayed +#' @param ... Passed to `print.rulelist` +#' @returns (invisibly) Returns the ruleset object +#' +#' @examples +#' model_class_party = partykit::ctree(species ~ ., +#' data = palmerpenguins::penguins +#' ) +#' as_ruleset(tidy(model_class_party)) +#' +#' @seealso [print.rulelist] +#' @export +print.ruleset = function(x, banner = TRUE, ...){ + + ruleset = rlang::duplicate(x) + + if (banner) { + cli::cli_rule(left = "Ruleset") + cli::cli_text("") + } + + class(ruleset) = setdiff(class(ruleset), "ruleset") + # now 'ruleset' is a rulelist + print(ruleset, banner = FALSE, ...) + + if (banner) { + cli::cli_rule() + } + + return(invisible(x)) +} + +#' @name predict.ruleset +#' @title `predict` method for a [ruleset] +#' @description Predicts multiple `rule_nbr`(s) applicable for a `row_nbr` (per +#' key) in new_data +#' +#' @param object A [ruleset] +#' @param new_data (dataframe) +#' @param ... unused +#' +#' @returns A dataframe with three or more columns: `row_number` (int), columns +#' corresponding to 'keys', `rule_nbr` (list column of integers). If a row +#' number and 'keys' combination is not covered by any rule, then `rule_nbr` +#' column has missing value. 
+#' +#' @examples +#' model_c5 = C50::C5.0(species ~., +#' data = palmerpenguins::penguins, +#' trials = 5, +#' rules = TRUE +#' ) +#' tidy_c5_ruleset = as_ruleset(tidy(model_c5)) +#' tidy_c5_ruleset +#' +#' predict(tidy_c5_ruleset, palmerpenguins::penguins) +#' +#' @seealso [predict.rulelist] +#' @importFrom stats predict +#' @export +predict.ruleset = function(object, new_data, ...){ + + x = rlang::duplicate(object) + class(x) = setdiff(class(x), "ruleset") + + # now 'ruleset' is a rulelist + res = predict(x, new_data, multiple = TRUE, ...) + + return(res) +} diff --git a/R/utils.R b/R/utils.R index d2787c3..f48474b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,7 +1,7 @@ -################################################################################ +#******************************************************************************* # This is the part of the 'tidyrules' R package hosted at # https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ +#******************************************************************************* #' @keywords internal #' @name positionSpaceOutsideSinglequotes @@ -334,10 +334,10 @@ convert_rule_flavor = function(rule, flavor){ #' @description Extract SQL case statement from a [rulelist] #' @param rulelist A [rulelist] object #' @param rhs_column_name (string, default: "RHS") Name of the column in the -#' rulelist to be used as RHS (WHEN THEN {rhs}) in the sql case +#' rulelist to be used as RHS (WHEN some_rule THEN rhs) in the sql case #' statement #' @param output_colname (string, default: "output") Name of the output column -#' created by the SQL statement (used in case ... AS {output_column}) +#' created by the SQL statement (used in case ... AS output_column) #' @return (string invisibly) SQL case statement #' @details As a side-effect, the SQL statement is cat to stdout. The output #' contains newline character. 
diff --git a/man/as_ruleset.Rd b/man/as_ruleset.Rd new file mode 100644 index 0000000..4c0c49e --- /dev/null +++ b/man/as_ruleset.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ruleset.R +\name{as_ruleset} +\alias{as_ruleset} +\title{Get a ruleset from a rulelist} +\usage{ +as_ruleset(rulelist) +} +\arguments{ +\item{rulelist}{A \link{rulelist}} +} +\value{ +A \link{ruleset} +} +\description{ +Returns a ruleset object +} +\examples{ +model_class_party = partykit::ctree(species ~ ., + data = palmerpenguins::penguins + ) +as_ruleset(tidy(model_class_party)) + +} +\seealso{ +\link{rulelist} +} diff --git a/man/package_tidyrules.Rd b/man/package_tidyrules.Rd index a874e35..f265093 100644 --- a/man/package_tidyrules.Rd +++ b/man/package_tidyrules.Rd @@ -8,16 +8,17 @@ \title{\code{tidyrules}} \description{ \code{tidyrules} package provides a framework to work with decision -rules stored as a \link{rulelist} backed by a tidy dataframe. Rules can be -extracted from supported models using \link{tidy}, augmented using validation data -by \link[=augment.rulelist]{augment}, manipulated using standard dataframe -operations, (modified) rulelists can be used to \link[=predict.rulelist]{predict} -on unseen (test) data. Utilities include: Create a rulelist -manually (\link[=as_rulelist.data.frame]{as_rulelist}), Export a rulelist to SQL -(\link{to_sql_case}) and so on. +rules. Rules can be extracted from supported models using \link{tidy}, augmented +using validation data by \link[=augment.rulelist]{augment}, manipulated using +standard dataframe operations, (modified) rulelists can be used to +\link[=predict.rulelist]{predict} on unseen (test) data. Utilities include: +Create a rulelist manually (\link[=as_rulelist.data.frame]{as_rulelist}), Export +a rulelist to SQL (\link{to_sql_case}) and so on. The package offers two +classes; \link{rulelist} and \link{ruleset} based on dataframe. 
} \seealso{ -\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict} +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, +\link[=predict.rulelist]{predict} } \author{ \strong{Maintainer}: Srikanth Komala Sheshachala \email{sri.teach@gmail.com} diff --git a/man/predict.rulelist.Rd b/man/predict.rulelist.Rd index 08a8eb1..4f341e9 100644 --- a/man/predict.rulelist.Rd +++ b/man/predict.rulelist.Rd @@ -32,10 +32,11 @@ precedence. \item When multiple is \code{FALSE}(default), output is a dataframe with three or more columns: \code{row_number} (int), columns corresponding to 'keys', \code{rule_nbr} (int). -\item When multiple is \code{TRUE}(default), output is a tidytable/dataframe with three +\item When multiple is \code{TRUE}, output is a dataframe with three or more columns: \code{row_number} (int), columns corresponding to 'keys', \code{rule_nbr} (list column of integers). -\item If a row number and 'keys' combination is not covered by any rule, then \code{rule_nbr} column has missing value. +\item If a row number and 'keys' combination is not covered by any rule, then +\code{rule_nbr} column has missing value. 
} } } @@ -59,9 +60,4 @@ output_2 # `rule_nbr` is a list-column of integer vectors \link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict}, \link[=calculate.rulelist]{calculate}, \link[=prune.rulelist]{prune}, \link[=reorder.rulelist]{reorder} - -Other Core Rulelist Utility: -\code{\link{set_keys}()}, -\code{\link{set_validation_data}()} } -\concept{Core Rulelist Utility} diff --git a/man/predict.ruleset.Rd b/man/predict.ruleset.Rd new file mode 100644 index 0000000..cd99111 --- /dev/null +++ b/man/predict.ruleset.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ruleset.R +\name{predict.ruleset} +\alias{predict.ruleset} +\title{\code{predict} method for a \link{ruleset}} +\usage{ +\method{predict}{ruleset}(object, new_data, ...) +} +\arguments{ +\item{object}{A \link{ruleset}} + +\item{new_data}{(dataframe)} + +\item{...}{unused} +} +\value{ +A dataframe with three or more columns: \code{row_number} (int), columns +corresponding to 'keys', \code{rule_nbr} (list column of integers). If a row +number and 'keys' combination is not covered by any rule, then \code{rule_nbr} +column has missing value. +} +\description{ +Predicts multiple \code{rule_nbr}(s) applicable for a \code{row_nbr} (per +key) in new_data +} +\examples{ +model_c5 = C50::C5.0(species ~., + data = palmerpenguins::penguins, + trials = 5, + rules = TRUE + ) +tidy_c5_ruleset = as_ruleset(tidy(model_c5)) +tidy_c5_ruleset + +predict(tidy_c5_ruleset, palmerpenguins::penguins) + +} +\seealso{ +\link{predict.rulelist} +} diff --git a/man/print.rulelist.Rd b/man/print.rulelist.Rd index b27443d..285257a 100644 --- a/man/print.rulelist.Rd +++ b/man/print.rulelist.Rd @@ -4,11 +4,13 @@ \alias{print.rulelist} \title{Print method for \link{rulelist} class} \usage{ -\method{print}{rulelist}(x, ...) +\method{print}{rulelist}(x, banner = TRUE, ...) 
} \arguments{ \item{x}{A \link{rulelist} object} +\item{banner}{(flag, default: \code{TRUE}) Should the banner be displayed} + \item{...}{Passed to \code{tidytable::print}} } \value{ diff --git a/man/print.ruleset.Rd b/man/print.ruleset.Rd new file mode 100644 index 0000000..8f0762f --- /dev/null +++ b/man/print.ruleset.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ruleset.R +\name{print.ruleset} +\alias{print.ruleset} +\title{Print method for ruleset class} +\usage{ +\method{print}{ruleset}(x, banner = TRUE, ...) +} +\arguments{ +\item{x}{A \link{rulelist}} + +\item{banner}{(flag, default: \code{TRUE}) Should the banner be displayed} + +\item{...}{Passed to \code{print.rulelist}} +} +\value{ +(invisibly) Returns the ruleset object +} +\description{ +Prints the ruleset object +} +\examples{ +model_class_party = partykit::ctree(species ~ ., + data = palmerpenguins::penguins + ) +as_ruleset(tidy(model_class_party)) + +} +\seealso{ +\link{print.rulelist} +} diff --git a/man/ruleset.Rd b/man/ruleset.Rd new file mode 100644 index 0000000..67265b4 --- /dev/null +++ b/man/ruleset.Rd @@ -0,0 +1,9 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ruleset.R +\name{ruleset} +\alias{ruleset} +\title{Ruleset} +\description{ +ruleset class is a piggyback class that inherits \link{rulelist} +class for convenience of \link{print} and \link{predict} methods. 
+} diff --git a/man/set_keys.Rd b/man/set_keys.Rd index f8c4cf1..3b45826 100644 --- a/man/set_keys.Rd +++ b/man/set_keys.Rd @@ -49,7 +49,6 @@ new_tidy_c5 \link[=prune.rulelist]{prune}, \link[=reorder.rulelist]{reorder} Other Core Rulelist Utility: -\code{\link{predict.rulelist}()}, \code{\link{set_validation_data}()} } \concept{Core Rulelist Utility} diff --git a/man/set_validation_data.Rd b/man/set_validation_data.Rd index 9192b34..2034795 100644 --- a/man/set_validation_data.Rd +++ b/man/set_validation_data.Rd @@ -52,7 +52,6 @@ tidy_c5 # not altered \link[=prune.rulelist]{prune}, \link[=reorder.rulelist]{reorder} Other Core Rulelist Utility: -\code{\link{predict.rulelist}()}, \code{\link{set_keys}()} } \concept{Core Rulelist Utility} diff --git a/man/to_sql_case.Rd b/man/to_sql_case.Rd index 58f756c..abad4f3 100644 --- a/man/to_sql_case.Rd +++ b/man/to_sql_case.Rd @@ -10,11 +10,11 @@ to_sql_case(rulelist, rhs_column_name = "RHS", output_colname = "output") \item{rulelist}{A \link{rulelist} object} \item{rhs_column_name}{(string, default: "RHS") Name of the column in the -rulelist to be used as RHS (WHEN \if{html}{\out{}} THEN {rhs}) in the sql case +rulelist to be used as RHS (WHEN some_rule THEN rhs) in the sql case statement} \item{output_colname}{(string, default: "output") Name of the output column -created by the SQL statement (used in case ... AS {output_column})} +created by the SQL statement (used in case ... AS output_column)} } \value{ (string invisibly) SQL case statement diff --git a/tests/testthat/test-rulelist.R b/tests/testthat/test-rulelist.R index ec02aa9..b5331de 100644 --- a/tests/testthat/test-rulelist.R +++ b/tests/testthat/test-rulelist.R @@ -1,7 +1,7 @@ -################################################################################ +#******************************************************************************* # This is the part of the 'tidyrules' R package hosted at # https://github.com/talegari/tidyrules with GPL-3 license. 
-################################################################################ +#******************************************************************************* context("test-rulelist") diff --git a/vignettes/tidyrules_vignette.Rmd b/vignettes/tidyrules_vignette.Rmd index e7b33f8..34bb45b 100644 --- a/vignettes/tidyrules_vignette.Rmd +++ b/vignettes/tidyrules_vignette.Rmd @@ -1,5 +1,5 @@ --- -title: "Vignette_tidyrules" +title: "Using tidyrules" author: "Srikanth KS, Amith Kumar UR" date: "`r Sys.Date()`" output: @@ -12,224 +12,5 @@ vignette: > %\VignetteIndexEntry{Using tidyrules} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} -abstract: The package [`tidyRules`](https://cran.r-project.org/package=tidyrules) is meant to extract **parsable** rules from model objects to a `tibble/data.frame` format. Package supports the following models - `C5.0`, `rpart` and `cubist`. The output rules are parsable by R, python (pandas query) and SQL (with WHERE clause). ---- -## Quick-start -```{r,warning=FALSE,echo=TRUE,message=FALSE} -library("tidyrules") -library("dplyr") -library("C50") -library("pander") - -# build model -c5_model <- C5.0(Species ~ ., data = iris, rules = TRUE) - -# extract rules in a tidy tibble -tidy_rules <- tidyRules(c5_model) - -# View tidy_rules -tidy_rules %>% - select(-c(rule_number,trial_number)) %>% - pandoc.table() -``` - -**Filter rules based on `RHS` or `support` or `confidence` or `lift` :** -```{r,warning=FALSE,echo=TRUE,message=FALSE} -# Example 1, filter rules based on support -tidy_rules %>% - filter(support >= 48) %>% - select(LHS, RHS) - - -# Example 2, filter rules based on RHS -tidy_rules %>% - filter(RHS == "virginica") %>% - select(LHS, support, confidence, lift) -``` - -**Use a `tidyrule` in a `filter()` function :** -```{r,warning=FALSE,echo=TRUE,message=FALSE} -iris %>% - filter(eval(parse(text = tidy_rules[3,"LHS"]))) %>% # filter using a C5 rule - count(Species) -``` - -## Extracting rules using 
[`tidyrules`](https://cran.r-project.org/package=tidyrules) -### Example: Classification using `C5.0` -In this example we use `attrition` data from `rsample` package. This -illustration shows how to extract rules from `C5.0` model and applying `filter()` -based on [**tidyrules**](https://cran.r-project.org/package=tidyrules). - -```{r,warning=FALSE,echo=TRUE,message=FALSE} -# loading packages -library("tidyrules") -library("C50") -library("dplyr") - -# attrition data load -data("attrition", package = "modeldata") -attrition <- as_tibble(attrition) - -glimpse(attrition) -``` - -As you could see, there are 31 variables and 1470 observations are present this -data-set. Here our aim is to predict Attrition using rest of the variables. Let -us build a `C5.0` model first. -```{r,warning=FALSE,echo=TRUE,message=FALSE} -# our C5 model -c5_att <- C5.0(Attrition ~ ., data = attrition, rules = TRUE) - -# sample rules from C5 -c5_att$output %>% - stringr::str_sub(start = 194L - , end = 578L) %>% - writeLines() -``` - -We get nice and human readable rules. Now problem with `C5.0` summary is, you -can only read and get a feel of how your predictions made based on rules. But -here comes the hard part, imagine if you want to explore further about your data -and you want to dig deeper, if you want to know rules which are throwing high -lift and confidence, or you may be interested in rules which covers major -sub-population. If in case your model is giving too many rules then that is the -hardest part to go through each and every rules and identifying best rules out -of the summary. - -What if we have all the rules in a tidy table format so that we could easily use -them on the data. Let's get it done using `tidyRules`. - -```{r,warning=FALSE,echo=TRUE,message=FALSE} -# Extract rules to a tidy tibble -tr_att <- tidyRules(c5_att) - -tr_att -``` -__`tidyRules` important columns to notice :__ - - * `LHS` : Rules. - * `RHS` : Predicted Class. 
- * `support` : Number of observation covered by the rule. - * `confidence` : Prediction accuracy for respective class. (laplace correction is implemented by default) - * `lift` : The result of dividing the rule's estimated accuracy by the relative frequency of the predicted class in the training set. - -Let's have a look at first five rules - -```{r} -tr_att %>% - head(5) %>% - select(LHS,RHS) %>% - pandoc.table(split.cells = 60) -``` - -Now, all the rules are in `tibble` (a _tidy_ form of `dataframe`) format. Let us -look at rules which favors only Attrition is equal to "No" and arrange by -support. - -```{r,warning=FALSE,echo=TRUE,message=FALSE} -rules_example_1 <- tr_att %>% - filter(RHS == "No") %>% - arrange(desc(support)) - -rules_example_1 -``` - -#### Use rules inside `filter()` function. -Let's use a rule within a `filter()`. Say, one need to pick a rule which has -largest `support` for predicted Attrition "Yes". - -```{r,warning=FALSE,echo=TRUE,message=FALSE} -# filter a rule with conditions -large_support_rule <- tr_att %>% - filter(RHS == "Yes") %>% - top_n(1, wt = support) %>% - pull(LHS) - -# parseable rule -parseable_rule <- parse(text = large_support_rule) - -# apply filter on data frame using parseable rule -attrition %>% - filter(eval(parseable_rule)) -``` - -#### Rules parsable by python and SQL -```{r,warning=FALSE,echo=TRUE,message=FALSE} -tr_att_python <- tidyRules(c5_att, language = "python") -tr_att_sql <- tidyRules(c5_att, language = "sql") - -head(tr_att_python$LHS) -head(tr_att_sql$LHS) -``` - -### Example: Classification using `rpart` -In this example we will be using `BreastCancer` data from `mlbench` package. 
-```{r,warning=FALSE,echo=TRUE,message=FALSE} -library("tidyrules") -library("dplyr") -library("rpart") -# BreastCancer -data(BreastCancer, package = "mlbench") -bc_train <- BreastCancer %>% - select(-Id) %>% - mutate_if(is.ordered, function(x) x <- factor(x,ordered = F)) - -rpart_bc <- rpart(Class ~ ., data = bc_train) -``` - -*__NOTE__ : Do not forget to convert all `ordered` features to `factor` type -before training the model.* - -One could visualize rpart decision tree using `prp` function from `rpart.plot` package. -```{r,warning=FALSE,echo=TRUE,message=FALSE} -library("rpart.plot") -prp(rpart_bc) -``` - -The above tree visual is really nice to get a hang of how decision tree is splitting at each node. But, if you want to pick a terminal node it is really boring and hard -since one has to enter the respective filter manually (imagine a situation if you have hundreds of features and a huge tree!!). To get-ride of this problem -one could use tidyrules to make life easier. - -Let's extract rules from `rpart` object and use those rules further more to extract terminal nodes. - -```{r,warning=FALSE,echo=TRUE,message=FALSE} -# tidyrule extract -rules_bc <- tidyRules(rpart_bc) - -rules_bc - -# filter the data using a rule -bc_train %>% - filter(eval(parse(text = rules_bc[5,"LHS"]))) %>% - as_tibble() -``` - -### Example: Regression using `Cubist` -In this example, rules extraction from a regression model (a `cubist` model) has -been illustrated below. We will be using `AmesHousing` dataset for the example. 
- -```{r,warning=FALSE,echo=TRUE,message=FALSE} -library("tidyrules") -library("dplyr") -library("Cubist") -# ames housing data set -ames <- AmesHousing::make_ames() -cubist_ames <- cubist(x = ames[, setdiff(colnames(ames), c("Sale_Price"))], - y = log10(ames[["Sale_Price"]]), - committees = 3 - ) - -# rule extract -rules_ames <- tidyRules(cubist_ames) - -rules_ames -``` - -Notice that, for `cubist` rules we have `mean`, `min`, `max` and `error` instead -of `confidence` and `lift`. Here `mean`, `min` and `max` are calculated based on predicted values with respect to a rule. - -## Useful links and References - -* C5.0: An Informal Tutorial -* Recursive partitioning for classification, regression and survival trees -* Data Mining with Cubist +abstract: "tidyrules package provides a framework to work with decision rules. Rules can be extracted from supported models, augmented with (custom) metrics using validation data, manipulated using standard dataframe operations, reordered and pruned based on a metric, predict on unseen (test) data. Utilities include; Creating a rulelist manually, Exporting a rulelist as a SQL case statement and so on. The package offers two classes; rulelist and ruleset based on dataframe." +--- \ No newline at end of file