preparing for release

holub008 · holub008 · commit 01252bc30370 · 2025-12-04T13:14:12.000-06:00
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -21,14 +21,15 @@ Ideally, changes are made according to the following process:
 * Ensure that xrf tests succeed (run via `devtools::test()`)
 * Submit a pull request (this can be done via github UI)
     * Maintainers will provide a code review. Every substantive comment must be addressed before the PR is accepted.
-        * Any follow-on commits to the fork will be reflected in the PR
 * Please bump version numbers (`major.minor.patch`) in `DESCRIPTION` according to the final change made
     * major number for any substantial API or backwards incompatible changes
-    * minor number for any standard change not touching API or compatiility
+    * minor number for any standard change not touching API or compatibility
     * patch number for any bug fixes
 
-### Code style suggestions
-No strict style at current, but please attempt to follow suit with the rest of the project. If in doubt, defer to [Wickham](http://r-pkgs.had.co.nz/r.html#style).
+### Code style
+
+We are informally using the tidy code style of the [air](https://posit-dev.github.io/air/formatter.html) formatter.
+Please [install](https://posit-dev.github.io/air/cli.html) `air` and run with `air format .` after making changes.
 
 ### Help with R package development
 If you're new to R package development but want to develop on xrf, both of the following are great resources:
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: xrf
 Title: eXtreme RuleFit
-Version: 0.2.2
+Version: 0.3.0
 Authors@R: 
     person("Karl", "Holub", , "karljholub@gmail.com", role = c("aut", "cre"))
 Description: An implementation of the RuleFit algorithm as described in
diff --git a/R/xrf.R b/R/xrf.R
@@ -36,7 +36,8 @@ condition_xgb_control <- function(
   # xgboost expects multinomial labels to be 0:num_class
   if (
     family == 'multinomial' &&
-      (is.factor(data[[response_var]]) || is.character(data[[response_var]]))
+      (is.factor(data[[response_var]]) ||
+        is.character(data[[response_var]]))
   ) {
     integer_response <- as.integer(as.factor(data[[response_var]]))
     data_mutated[[response_var]] <- integer_response - min(integer_response)
@@ -158,30 +159,31 @@ get_xgboost_objective <- function(family, call = rlang::caller_env()) {
 #############################################
 
 augment_rules <- function(row, rule_ids, less_than) {
-  bind_rows(
-    lapply(rule_ids, function(rule_id) {
-      list(
-        split_id = row$ID,
-        rule_id = rule_id,
-        feature = row$Feature,
-        split = row$Split,
-        less_than = less_than
-      )
-    })
-  )
+  bind_rows(lapply(rule_ids, function(rule_id) {
+    list(
+      split_id = row$ID,
+      rule_id = rule_id,
+      feature = row$Feature,
+      split = row$Split,
+      less_than = less_than
+    )
+  }))
 }
 
 # this is of course slow, but it shouldn't be a bottleneck due to ensembles generally small and tree depth < 6
 rule_traverse <- function(row, tree) {
   if (row$Feature == 'Leaf') {
-    return(data.frame(
-      split_id = row$ID,
-      rule_id = paste0('r', gsub('-', '_', row$ID)), # leaf nodes uniquely identify a rule
-      feature = NA,
-      split = NA,
-      less_than = NA,
-      stringsAsFactors = FALSE
-    ))
+    return(
+      data.frame(
+        split_id = row$ID,
+        # leaf nodes uniquely identify a rule
+        rule_id = paste0('r', gsub('-', '_', row$ID)),
+        feature = NA,
+        split = NA,
+        less_than = NA,
+        stringsAsFactors = FALSE
+      )
+    )
   } else {
     # the Yes/No obfuscates the simplicity of the algo - in order tree traversal
     left_child <- tree[tree$ID == row$Yes, ]
@@ -204,13 +206,15 @@ rule_traverse <- function(row, tree) {
       less_than = FALSE
     )
 
-    return(rbind(
-      left_rules_augmented,
-      right_rules_augmented,
-      left_rules,
-      right_rules,
-      stringsAsFactors = FALSE
-    ))
+    return(
+      rbind(
+        left_rules_augmented,
+        right_rules_augmented,
+        left_rules,
+        right_rules,
+        stringsAsFactors = FALSE
+      )
+    )
   }
 }
 
@@ -263,13 +267,14 @@ build_feature_metadata <- function(data) {
       !is.numeric(x)
     }) |>
     lapply(function(x) {
-      if (is.factor(x)) levels(x) else as.character(unique(x))
+      if (is.factor(x)) {
+        levels(x)
+      } else {
+        as.character(unique(x))
+      }
     })
 
-  list(
-    xlev = xlev,
-    feature_metadata = feature_metadata
-  )
+  list(xlev = xlev, feature_metadata = feature_metadata)
 }
 
 has_matching_level <- function(feature_name, level_remainder, xlev) {
@@ -542,7 +547,11 @@ xrf.formula <- function(
     prefit_xgb
   )
 
-  model_matrix_method <- if (sparse) sparse.model.matrix else model.matrix
+  model_matrix_method <- if (sparse) {
+    sparse.model.matrix
+  } else {
+    model.matrix
+  }
   design_matrix <- model_matrix_method(expanded_formula, data)
 
   nrounds <- xgb_control$nrounds
@@ -624,7 +633,8 @@ xrf.formula <- function(
     full_formula,
     full_data,
     family = family,
-    alpha = 1, # this specifies the LASSO
+    # this specifies the LASSO
+    alpha = 1,
     sparse = sparse,
     glm_control = glm_control
   )
@@ -665,7 +675,11 @@ model.matrix.xrf <- function(object, data, sparse = TRUE, ...) {
   trms <- terms(object$base_formula)
   trms <- delete.response(trms)
 
-  design_matrix_method <- if (sparse) sparse.model.matrix else model.matrix
+  design_matrix_method <- if (sparse) {
+    sparse.model.matrix
+  } else {
+    model.matrix
+  }
 
   raw_design_matrix <- design_matrix_method(trms, data)
   rules_features <- if (sparse) {
@@ -755,9 +769,7 @@ coef.xrf <- function(object, lambda = 'lambda.min', ...) {
   glm_df |>
     left_join(rule_conjunctions, by = c('term' = 'rule_id')) |>
     arrange_at(colnames(glm_df[1])) |>
-    mutate(
-      rule = conjunction
-    ) |>
+    mutate(rule = conjunction) |>
     select(-conjunction)
 }
 
diff --git a/README.md b/README.md
@@ -39,15 +39,13 @@ The general algorithm follows:
     * For a description of this algorithm, see [this document](https://github.com/holub008/snippets/blob/master/overlapped_hyperrectangles/overlapped_hyperrectangles.pdf)
 
 ### Comparison to alternatives
-Several implementations of RuleFit are available for R: [pre](https://CRAN.R-project.org/package=pre), [horserule](https://CRAN.R-project.org/package=horserule), and [rulefit](https://github.com/gravesee/rulefit). xrf improves on some aspects of these by:
+Several implementations of RuleFit are available for R: [pre](https://CRAN.R-project.org/package=pre), (once upon a time) [horserule](https://CRAN.R-project.org/package=horserule), and [rulefit](https://github.com/gravesee/rulefit). xrf improves on some aspects of these by:
 * Usually building more accurate models at fixed number of parameters
 * Usually building models faster
 * Building models that predict for new factor-levels
 * Providing a more concise and limited interface
 * Tested & actively maintained for fewer bugs
 
-On the last point, as of April 2019, the 'pre' and 'rulefit' packages fail to build a model on the census income example below due to bugs.
-
 ## Example
 
 Here we predict whether an individual's income is greater than $50,000 using census data.
@@ -232,4 +230,3 @@ How slick is that! We have:
 Effects are immediately available by doing a lookup in the exclusive rules. This is a great win for interpretability.
 
 As mentioned above, this example is contrived in that it uses `depth=1` trees (i.e. conjunctions of size 1). As depth increases, interpretability can suffer regardless de-overlapping if the final ruleset is non-sparse. However, for certain problems, particularly small depth or sparse effects, de-overlapping can be a boon for interpretability.
-