diff --git a/.gitignore b/.gitignore index e44fe857..23cadad1 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,6 @@ vignettes/varpro.html .claude .positai + +# Local dev tool state (brainstorm/superpowers) +.superpowers/ diff --git a/NAMESPACE b/NAMESPACE index e69c1d08..b4680c50 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand S3method(autoplot,gg_auct) +S3method(autoplot,gg_beta_uvarpro) S3method(autoplot,gg_beta_varpro) S3method(autoplot,gg_brier) S3method(autoplot,gg_error) @@ -13,6 +14,7 @@ S3method(autoplot,gg_partialpro) S3method(autoplot,gg_rfsrc) S3method(autoplot,gg_rhf) S3method(autoplot,gg_roc) +S3method(autoplot,gg_sdependent) S3method(autoplot,gg_survival) S3method(autoplot,gg_udependent) S3method(autoplot,gg_variable) @@ -21,6 +23,8 @@ S3method(autoplot,gg_vimp) S3method(calc_roc,randomForest) S3method(calc_roc,rfsrc) S3method(gg_auct,rhf) +S3method(gg_beta_uvarpro,default) +S3method(gg_beta_uvarpro,uvarpro) S3method(gg_beta_varpro,varpro) S3method(gg_brier,rfsrc) S3method(gg_error,randomForest) @@ -34,6 +38,8 @@ S3method(gg_rhf,rhf) S3method(gg_roc,default) S3method(gg_roc,randomForest) S3method(gg_roc,rfsrc) +S3method(gg_sdependent,default) +S3method(gg_sdependent,uvarpro) S3method(gg_survival,default) S3method(gg_survival,rfsrc) S3method(gg_variable,randomForest) @@ -41,6 +47,7 @@ S3method(gg_variable,rfsrc) S3method(gg_vimp,randomForest) S3method(gg_vimp,rfsrc) S3method(plot,gg_auct) +S3method(plot,gg_beta_uvarpro) S3method(plot,gg_beta_varpro) S3method(plot,gg_brier) S3method(plot,gg_error) @@ -53,12 +60,14 @@ S3method(plot,gg_partialpro) S3method(plot,gg_rfsrc) S3method(plot,gg_rhf) S3method(plot,gg_roc) +S3method(plot,gg_sdependent) S3method(plot,gg_survival) S3method(plot,gg_udependent) S3method(plot,gg_variable) S3method(plot,gg_varpro) S3method(plot,gg_vimp) S3method(print,gg_auct) +S3method(print,gg_beta_uvarpro) S3method(print,gg_beta_varpro) S3method(print,gg_brier) S3method(print,gg_error) @@ -71,6 +80,7 @@ S3method(print,gg_partialpro) S3method(print,gg_rfsrc) S3method(print,gg_rhf) S3method(print,gg_roc) +S3method(print,gg_sdependent) S3method(print,gg_survival) S3method(print,gg_udependent) S3method(print,gg_variable) @@ -81,6 +91,7 @@ S3method(print,summary.gg_beta_varpro) S3method(print,summary.gg_ivarpro) S3method(print,summary.gg_udependent) S3method(summary,gg_auct) +S3method(summary,gg_beta_uvarpro) S3method(summary,gg_beta_varpro) S3method(summary,gg_brier) S3method(summary,gg_error) @@ -93,6 +104,7 @@ S3method(summary,gg_partialpro) S3method(summary,gg_rfsrc) S3method(summary,gg_rhf) S3method(summary,gg_roc) +S3method(summary,gg_sdependent) S3method(summary,gg_survival) S3method(summary,gg_udependent) S3method(summary,gg_variable) @@ -101,6 +113,7 @@ S3method(summary,gg_vimp) export(calc_auc) export(calc_roc) export(gg_auct) +export(gg_beta_uvarpro) export(gg_beta_varpro) export(gg_brier) export(gg_error) @@ -113,6 +126,7 @@ export(gg_partialpro) export(gg_rfsrc) export(gg_rhf) export(gg_roc) +export(gg_sdependent) export(gg_survival) export(gg_udependent) export(gg_variable) @@ -139,6 +153,7 @@ importFrom(ggplot2,geom_jitter) importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) +importFrom(ggplot2,geom_segment) importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) importFrom(ggplot2,labs) diff --git a/NEWS.md b/NEWS.md index 9f91730d..79eda9f7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,11 @@ ggRandomForests v4.0.0 (development) ==================================== * Development version 3.1.2.9000, opened after the v3.1.2 CRAN release (forward-merged the v3.1.1 and v3.1.2 CRAN fixes onto the dev line). +* Begin the v4.0.0 development line: a Random Hazard Forests (RHF) + visualization layer wrapping the 'randomForestRHF' package (added to + Suggests). RHF support is gated — every gg_rhf* entry point checks + `requireNamespace("randomForestRHF")`. No change for users who do not + install it. * `gg_auct()` / `plot.gg_auct()`: tidy wrapper and plot for time-varying AUC from `randomForestRHF::auct.rhf()` (RHF Phase 2). Returns a long frame `time / auc / se / lower / upper / marker` with an `iauc` @@ -12,11 +17,31 @@ ggRandomForests v4.0.0 (development) AUC(t) with a bootstrap CI ribbon when available and a 0.5 reference line. `gg_auct.rhf(object, marker, auct_fit = NULL)` computes `auct.rhf()` internally or reuses a cached fit. -* Begin the v4.0.0 development line: a Random Hazard Forests (RHF) - visualization layer wrapping the 'randomForestRHF' package (added to - Suggests). RHF support is gated — every gg_rhf* entry point checks - `requireNamespace("randomForestRHF")`. No change for users who do not - install it. +* `gg_beta_uvarpro()` / `plot.gg_beta_uvarpro()`: tidy wrapper and bar chart + for `varPro::get.beta.entropy()` -- the unsupervised analogue of + `gg_beta_varpro()`. From a `uvarpro()` fit it aggregates the per-region + lasso coefficients into `beta_mean = colMeans(|beta|)` per variable + (most-important first), flags variables above a selection cutoff, and + accepts a precomputed `beta_fit` matrix. `print`/`summary`/`autoplot` + companions follow the `gg_*` conventions. +* `gg_sdependent()` / `plot.gg_sdependent()`: tidy wrapper and ranked + lollipop for `varPro::sdependent()` signal-variable detection. Returns one + row per candidate variable (`imp_score`, graph `degree`, `signal` flag) + ranked by `imp_score`. Complements `gg_udependent()` (the dependency + graph) with the "which variables are signal" ranking; shares the + `beta_fit` entropy matrix. Follows the `get.beta.entropy` + `sdependent` + workflow from the `varPro::uvarpro()` help (iowa-housing example). +* Fix (#118): `gg_varpro()` no longer fails with the cryptic + "arguments imply differing number of rows:
, 0" when + `varPro::importance()` returns a degenerate importance table (0 rows, or + `p` variables with no usable `z` column) -- observed intermittently on + survival fits where the release-rule step selects no variables. It now + stops with a clear, specific message explaining the empty importance and + suggesting a larger `ntree`. The guard is scoped to the degenerate case + only; well-formed fits (survival included) are unaffected -- this is not + a blanket survival-family block (cf. the reverted #116). +* Fixes the intro vignette's placeholder `\VignetteIndexEntry` + ("Vignette's Title" -> "Exploring Random Forests with ggRandomForests"). ggRandomForests v3.1.2 ====================== diff --git a/R/gg_beta_uvarpro.R b/R/gg_beta_uvarpro.R new file mode 100644 index 00000000..7090a3ff --- /dev/null +++ b/R/gg_beta_uvarpro.R @@ -0,0 +1,235 @@ +##============================================================================= +#' Per-variable lasso-beta importance from an unsupervised varPro fit +#' +#' Tidy wrapper around [varPro::get.beta.entropy()] for a `uvarpro` object. +#' Where [gg_beta_varpro()] refines the *supervised* release-rule contrast, +#' `gg_beta_uvarpro()` does the unsupervised analogue: `uvarpro()` builds +#' entropy regions with no response, and `get.beta.entropy()` fits a +#' cross-validated lasso within each region to ask how strongly every other +#' variable explains the released variable. Averaging the absolute lasso +#' coefficients per variable gives one number per variable: an unsupervised, +#' lasso-flavoured importance. +#' +#' @details +#' `get.beta.entropy(o)` returns a (released-variable x variable) numeric +#' matrix of absolute lasso coefficients. The column mean (`na.rm = TRUE`) is +#' the per-variable importance reported here, matching the canonical +#' `sort(colMeans(beta, na.rm = TRUE), decreasing = TRUE)` idiom in the +#' `varPro::uvarpro()` help ("iowa housing - illustrates lasso importance"). +#' +#' Because `get.beta.entropy()` is expensive (a cross-validated `glmnet` per +#' region), the `beta_fit` argument accepts a pre-computed matrix so you can +#' iterate on the cutoff without re-fitting. The pairing mirrors the +#' `beta_fit` argument of [gg_beta_varpro()]. +#' +#' @param object A `uvarpro` object from [varPro::uvarpro()]. +#' @param ... Forwarded to [varPro::get.beta.entropy()] when +#' `beta_fit = NULL` (e.g. `pre.filter`, `second.stage`, `use.cv`). +#' Ignored, with a warning, when `beta_fit` is supplied. +#' @param cutoff Selection threshold on `beta_mean`. `NULL` (default) uses +#' `mean(beta_mean)`; a scalar sets it explicitly. Variables at or above the +#' cutoff are flagged `selected`. +#' @param beta_fit Optional pre-computed [varPro::get.beta.entropy()] matrix +#' for `object`. When supplied, must be a numeric matrix with column names +#' (the variables); `...` is then ignored. +#' +#' @return A `gg_beta_uvarpro` object (a `data.frame`), one row per variable, +#' most-important first, with columns: +#' \describe{ +#' \item{`variable`}{factor; levels reversed so the most-important +#' variable lands at the top after `coord_flip()` (the `gg_vimp` +#' convention).} +#' \item{`beta_mean`}{`mean(|lasso beta|)` over the released regions +#' (`colMeans(beta, na.rm = TRUE)`).} +#' \item{`n_released`}{number of regions contributing a non-`NA` +#' coefficient for the variable.} +#' \item{`selected`}{logical; `beta_mean >= cutoff`.} +#' } +#' The `provenance` attribute records `source`, `family` (`"unsupv"`), +#' `cutoff`, `n_var`, `n_released_regions`, and `precomputed`. +#' +#' @seealso [gg_beta_varpro()] (supervised analogue), [gg_udependent()], +#' [varPro::get.beta.entropy()], [varPro::uvarpro()]. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("varPro", quietly = TRUE)) { +#' set.seed(1) +#' o <- varPro::uvarpro(mtcars, ntree = 50) +#' gg <- gg_beta_uvarpro(o) +#' plot(gg) +#' } +#' } +#' +#' @export +gg_beta_uvarpro <- function(object, ..., cutoff = NULL, beta_fit = NULL) { + UseMethod("gg_beta_uvarpro", object) +} + +#' @export +gg_beta_uvarpro.default <- function(object, ..., cutoff = NULL, + beta_fit = NULL) { + stop("gg_beta_uvarpro: expected a 'uvarpro' object from varPro::uvarpro(); ", + "got an object of class ", paste(class(object), collapse = "/"), ".", + call. = FALSE) +} + +#' @export +gg_beta_uvarpro.uvarpro <- function(object, ..., cutoff = NULL, + beta_fit = NULL) { + if (!inherits(object, "uvarpro")) { + stop("gg_beta_uvarpro: expected a 'uvarpro' object from varPro::uvarpro().", + call. = FALSE) + } + .assert_scalar_numeric_or_null(cutoff, "cutoff", "gg_beta_uvarpro") + + # Resolve the beta matrix (cache path) + if (is.null(beta_fit)) { + b <- varPro::get.beta.entropy(object, ...) + } else { + .validate_beta_uvarpro(beta_fit) + if (length(list(...)) > 0L) { + warning("gg_beta_uvarpro: arguments in '...' ignored because beta_fit is supplied.", + call. = FALSE) + } + b <- beta_fit + } + + # Empty fast-path: no regions / no variables survived + if (.is_empty_beta_matrix(b)) { + return(.gg_beta_uvarpro_empty(object, beta_fit, cutoff)) + } + + beta_mean_v <- colMeans(b, na.rm = TRUE) + n_released_v <- colSums(!is.na(b)) + + # Most-important first; reverse the factor levels so coord_flip() puts the + # top variable at the top (matches gg_vimp / gg_beta_varpro). + ord_names <- names(sort(beta_mean_v, decreasing = TRUE)) + + resolved_cutoff <- if (is.null(cutoff)) { + mean(beta_mean_v, na.rm = TRUE) + } else { + as.numeric(cutoff) + } + + out <- data.frame( + variable = factor(ord_names, levels = rev(ord_names)), + beta_mean = unname(beta_mean_v[ord_names]), + n_released = as.integer(unname(n_released_v[ord_names])), + stringsAsFactors = FALSE + ) + out$selected <- out$beta_mean >= resolved_cutoff + rownames(out) <- NULL + + class(out) <- c("gg_beta_uvarpro", "data.frame") + attr(out, "provenance") <- list( + source = "varPro::get.beta.entropy", + family = "unsupv", + ntree = if (!is.null(object$ntree)) as.integer(object$ntree) else NA_integer_, + cutoff = stats::setNames(resolved_cutoff, "unsupv"), + cutoff_default = is.null(cutoff), + n_var = ncol(b), + n_released_regions = nrow(b), + precomputed = !is.null(beta_fit), + xvar.names = colnames(b) + ) + out +} + +#' @noRd +.validate_beta_uvarpro <- function(beta_fit, caller = "gg_beta_uvarpro") { + if (!is.matrix(beta_fit) || !is.numeric(beta_fit)) { + stop(caller, ": beta_fit does not look like a ", + "varPro::get.beta.entropy() result. Expected a numeric matrix.", + call. = FALSE) + } + if (ncol(beta_fit) > 0L && is.null(colnames(beta_fit))) { + stop(caller, ": beta_fit must have column names (the variables). ", + "varPro::get.beta.entropy() returns a named matrix.", + call. = FALSE) + } + invisible(NULL) +} + +#' @noRd +.is_empty_beta_matrix <- function(m) { + is.null(m) || !is.matrix(m) || nrow(m) == 0L || ncol(m) == 0L +} + +#' @noRd +.assert_scalar_numeric_or_null <- function(x, arg, caller) { + if (!is.null(x) && + (!is.numeric(x) || length(x) != 1L || is.na(x))) { + stop(caller, ": `", arg, "` must be a single non-NA numeric value (or NULL).", + call. = FALSE) + } + invisible(NULL) +} + +#' @rdname print.gg +#' @export +print.gg_beta_uvarpro <- function(x, ...) { + prov <- attr(x, "provenance") + precomputed <- isTRUE(if (!is.null(prov)) prov$precomputed else FALSE) + n_regions <- if (!is.null(prov)) prov$n_released_regions %||% NA_integer_ else NA_integer_ + n_sel <- sum(x$selected, na.rm = TRUE) + cutoff <- if (!is.null(prov)) prov$cutoff %||% NA_real_ else NA_real_ + cutoff_val <- if (length(cutoff) >= 1L) cutoff[[1]] else NA_real_ + cutoff_default <- isTRUE(if (!is.null(prov)) prov$cutoff_default else FALSE) + cat(.gg_header(x, "gg_beta_uvarpro"), + sprintf(" | cutoff: %.4g%s", cutoff_val, + if (cutoff_default) " (default)" else ""), + sprintf(" | precomputed: %s", precomputed), + "\n", + sprintf(" %d of %d variables selected over %s released region(s)\n", + n_sel, nrow(x), + if (is.na(n_regions)) "NA" else format(n_regions)), + sep = "") + invisible(x) +} + +#' @rdname summary.gg +#' @export +summary.gg_beta_uvarpro <- function(object, ...) { + v <- sort(stats::setNames(object$beta_mean, as.character(object$variable)), + decreasing = TRUE) + top <- utils::head(v, 5L) + body <- c( + sprintf("variables: %d (selected: %d)", + nrow(object), sum(object$selected, na.rm = TRUE)), + "top variables by mean |lasso beta|:", + sprintf(" %-14s %.4g", names(top), unname(top)) + ) + .summary_skel(object, "gg_beta_uvarpro", body) +} + +#' @importFrom ggplot2 autoplot +#' @export +autoplot.gg_beta_uvarpro <- function(object, ...) { + plot.gg_beta_uvarpro(object, ...) +} + +#' @noRd +.gg_beta_uvarpro_empty <- function(object, beta_fit, cutoff) { + out <- data.frame( + variable = factor(character(0)), + beta_mean = numeric(0), + n_released = integer(0), + selected = logical(0), + stringsAsFactors = FALSE + ) + class(out) <- c("gg_beta_uvarpro", "data.frame") + attr(out, "provenance") <- list( + source = "varPro::get.beta.entropy", + family = "unsupv", + ntree = if (!is.null(object$ntree)) as.integer(object$ntree) else NA_integer_, + cutoff = stats::setNames(if (is.null(cutoff)) NA_real_ else as.numeric(cutoff), "unsupv"), + cutoff_default = is.null(cutoff), + n_var = 0L, + n_released_regions = 0L, + precomputed = !is.null(beta_fit), + xvar.names = character(0) + ) + out +} diff --git a/R/gg_sdependent.R b/R/gg_sdependent.R new file mode 100644 index 00000000..575fd9ec --- /dev/null +++ b/R/gg_sdependent.R @@ -0,0 +1,185 @@ +##============================================================================= +#' Signal-variable detection from an unsupervised varPro fit +#' +#' Tidy wrapper around [varPro::sdependent()] for a `uvarpro` object. Where +#' [gg_udependent()] draws the cross-variable dependency *graph*, +#' `gg_sdependent()` surfaces `sdependent()`'s *signal-variable detection*: a +#' ranked table of the per-variable signal score and graph degree, with the +#' variables flagged as "signal" (those whose dependency structure clears the +#' detection threshold). +#' +#' @details +#' `sdependent()` runs on the `varPro::get.beta.entropy()` lasso-coefficient +#' matrix and returns, with `plot = FALSE`, a list of `imp.score` (per-variable +#' signal score), `degree` (node degree in the dependency graph), and +#' `signal.vars` (the detected signal set). This wrapper tidies that into one +#' row per candidate variable, ranked by `imp.score`. Because the entropy +#' matrix is the expensive part, `beta_fit` accepts a precomputed +#' [varPro::get.beta.entropy()] matrix (shared with [gg_beta_uvarpro()] and +#' [gg_udependent()]). +#' +#' @param object A `uvarpro` object from [varPro::uvarpro()]. +#' @param ... Forwarded to [varPro::get.beta.entropy()] when `beta_fit = NULL`; +#' ignored, with a warning, when `beta_fit` is supplied. +#' @param threshold,q.signal,directed,min.degree Passed to +#' [varPro::sdependent()] (defaults match [gg_udependent()]). +#' @param beta_fit Optional precomputed [varPro::get.beta.entropy()] matrix. +#' +#' @return A `gg_sdependent` object (a `data.frame`), one row per candidate +#' variable, most-signal first, with columns: +#' \describe{ +#' \item{`variable`}{factor; levels reversed so the top variable lands at +#' the top after `coord_flip()`.} +#' \item{`imp_score`}{`sdependent()` per-variable signal score.} +#' \item{`degree`}{node degree in the dependency graph.} +#' \item{`signal`}{logical; variable is in `sdependent()$signal.vars`.} +#' } +#' The `provenance` attribute records `source`, `family` (`"unsupv"`), +#' `threshold`, `q.signal`, `directed`, `n_signal`, and `n_var`. +#' +#' @seealso [gg_udependent()] (the dependency graph), [gg_beta_uvarpro()] +#' (lasso importance), [varPro::sdependent()], [varPro::uvarpro()]. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("varPro", quietly = TRUE)) { +#' set.seed(1) +#' o <- varPro::uvarpro(mtcars, ntree = 50) +#' gg <- gg_sdependent(o) +#' plot(gg) +#' } +#' } +#' +#' @export +gg_sdependent <- function(object, ..., threshold = 0.25, q.signal = 0.75, + directed = TRUE, min.degree = NULL, beta_fit = NULL) { + UseMethod("gg_sdependent", object) +} + +#' @export +gg_sdependent.default <- function(object, ...) { + stop("gg_sdependent: expected a 'uvarpro' object from varPro::uvarpro(); ", + "got an object of class ", paste(class(object), collapse = "/"), ".", + call. = FALSE) +} + +#' @export +gg_sdependent.uvarpro <- function(object, ..., threshold = 0.25, + q.signal = 0.75, directed = TRUE, + min.degree = NULL, beta_fit = NULL) { + # Shared uvarpro-object + scalar threshold/directed validation (the same + # centralized check gg_udependent() uses). + .validate_udep_inputs(object, threshold, directed) + + if (is.null(beta_fit)) { + imp_mat <- varPro::get.beta.entropy(object, ...) + } else { + .validate_beta_uvarpro(beta_fit, caller = "gg_sdependent") + if (length(list(...)) > 0L) { + warning("gg_sdependent: arguments in '...' ignored because beta_fit is supplied.", + call. = FALSE) + } + imp_mat <- beta_fit + } + + prov <- list( + source = "varPro::sdependent", + family = "unsupv", + ntree = if (!is.null(object$ntree)) as.integer(object$ntree) else NA_integer_, + threshold = threshold, + q.signal = q.signal, + directed = isTRUE(directed), + precomputed = !is.null(beta_fit) + ) + + if (.is_empty_beta_matrix(imp_mat)) { + return(.gg_sdependent_empty(prov)) + } + .gg_sdependent_build(imp_mat, threshold, q.signal, directed, min.degree, prov) +} + +#' @noRd +.gg_sdependent_build <- function(imp_mat, threshold, q.signal, directed, + min.degree, prov) { + s <- varPro::sdependent(imp_mat, threshold = threshold, q.signal = q.signal, + directed = directed, min.degree = min.degree, + plot = FALSE) + + # sdependent() may return a character message when no graph is found. + if (is.character(s) || is.null(s$imp.score) || length(s$imp.score) == 0L) { + return(.gg_sdependent_empty(prov)) + } + + imp_score <- s$imp.score + vars <- names(imp_score) %||% as.character(seq_along(imp_score)) + degree <- if (!is.null(s$degree)) s$degree[vars] else rep(NA_real_, length(vars)) + signal_set <- s$signal.vars %||% character(0) + + ord <- order(imp_score, decreasing = TRUE) + ord_vars <- vars[ord] + + out <- data.frame( + variable = factor(ord_vars, levels = rev(ord_vars)), + imp_score = unname(imp_score[ord]), + degree = unname(as.numeric(degree)[ord]), + signal = ord_vars %in% signal_set, + stringsAsFactors = FALSE + ) + rownames(out) <- NULL + + class(out) <- c("gg_sdependent", "data.frame") + prov$n_var <- nrow(out) + prov$n_signal <- sum(out$signal) + attr(out, "provenance") <- prov + out +} + +#' @noRd +.gg_sdependent_empty <- function(prov) { + out <- data.frame( + variable = factor(character(0)), + imp_score = numeric(0), + degree = numeric(0), + signal = logical(0), + stringsAsFactors = FALSE + ) + class(out) <- c("gg_sdependent", "data.frame") + prov$n_var <- 0L + prov$n_signal <- 0L + attr(out, "provenance") <- prov + out +} + +#' @rdname print.gg +#' @export +print.gg_sdependent <- function(x, ...) { + prov <- attr(x, "provenance") + n_signal <- if (!is.null(prov)) prov$n_signal %||% sum(x$signal) else sum(x$signal) + thr <- if (!is.null(prov)) prov$threshold %||% NA_real_ else NA_real_ + cat(.gg_header(x, "gg_sdependent"), + sprintf(" | threshold: %.3g", thr), + sprintf(" | precomputed: %s", + isTRUE(if (!is.null(prov)) prov$precomputed else FALSE)), + "\n", + sprintf(" %d of %d variables flagged as signal\n", n_signal, nrow(x)), + sep = "") + invisible(x) +} + +#' @rdname summary.gg +#' @export +summary.gg_sdependent <- function(object, ...) { + sig <- as.character(object$variable[object$signal]) + body <- c( + sprintf("variables: %d (signal: %d)", nrow(object), sum(object$signal)), + sprintf("signal variables: %s", + if (length(sig)) paste(sig, collapse = ", ") else "(none)") + ) + .summary_skel(object, "gg_sdependent", body) +} + +#' @importFrom ggplot2 autoplot +#' @export +autoplot.gg_sdependent <- function(object, ...) { + plot.gg_sdependent(object, ...) +} diff --git a/R/gg_varpro.R b/R/gg_varpro.R index 170c82ee..4ab4c646 100644 --- a/R/gg_varpro.R +++ b/R/gg_varpro.R @@ -278,13 +278,33 @@ gg_varpro <- function(object, cond_mat <- NULL } + vars_raw <- rownames(imp_df_raw) + z_vec <- imp_df_raw$z + + ## Guard (issue #118): varPro::importance() occasionally returns a + ## degenerate importance table -- 0 rows, or a missing/empty `z` column + ## while p variables are named -- for some fits (observed on survival fits + ## where the release-rule step selects no variables). Detect that here and + ## fail with a clear, specific message instead of the cryptic + ## "arguments imply differing number of rows: p, 0" raised when the columns + ## below are recycled. Scoped to the degenerate case only: working fits + ## (survival included) have length(z_vec) == length(vars_raw) > 0 and are + ## unaffected -- this is NOT a blanket survival-family block (cf. #116). + if (length(vars_raw) == 0L || length(z_vec) != length(vars_raw)) { + stop("gg_varpro(): varPro::importance() returned no usable importance ", + "for this ", family, " fit -- the release-rule step selected no ", + "variables, so the importance table is empty and there is nothing ", + "to plot. This is most often seen on survival fits; refitting with ", + "more trees (a larger 'ntree') usually resolves it.", + call. = FALSE) + } + ## Replace NaN z with 0 for sorting/selection - z_vec <- imp_df_raw$z z_vec[!is.finite(z_vec)] <- 0 ## $imp: one row per variable imp_df <- data.frame( - variable = rownames(imp_df_raw), + variable = vars_raw, z = as.numeric(z_vec), selected = as.numeric(z_vec) > cutoff, stringsAsFactors = FALSE diff --git a/R/plot.gg_beta_uvarpro.R b/R/plot.gg_beta_uvarpro.R new file mode 100644 index 00000000..cd0caf9e --- /dev/null +++ b/R/plot.gg_beta_uvarpro.R @@ -0,0 +1,87 @@ +##============================================================================= +#' Plot a `gg_beta_uvarpro` object +#' +#' Horizontal bar chart of the mean absolute lasso coefficient +#' \eqn{\mathrm{mean}(|\hat{\beta}|)}{mean(|beta hat|)} per variable from an +#' unsupervised varPro fit, sorted descending so the eye lands on the top +#' variable first. Bars are filled blue above the selection cutoff, grey +#' otherwise, with a dashed red line at the cutoff. +#' +#' @section Reading the chart: +#' Each bar is the average magnitude of a per-region lasso coefficient for +#' that variable, computed by [varPro::get.beta.entropy()] over the +#' unsupervised entropy regions of a [varPro::uvarpro()] fit. There is no +#' response: the score measures how strongly a variable is reconstructed by +#' the others within released regions, i.e. an unsupervised +#' importance / redundancy signal rather than a predictive one. As with +#' [gg_beta_varpro()], the numeric scale carries the predictors' units, so +#' bar lengths are comparable within a data set but not blindly across +#' variables on very different scales. +#' +#' @param x A `gg_beta_uvarpro` object from [gg_beta_uvarpro()]. +#' @param ... Not currently used. +#' +#' @return A `ggplot` object. +#' +#' @seealso [gg_beta_uvarpro()], [gg_beta_varpro()], [gg_udependent()]. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("varPro", quietly = TRUE)) { +#' set.seed(1) +#' o <- varPro::uvarpro(mtcars, ntree = 50) +#' plot(gg_beta_uvarpro(o)) +#' } +#' } +#' +#' @name plot.gg_beta_uvarpro +#' @importFrom ggplot2 ggplot aes geom_col geom_hline coord_flip +#' @importFrom ggplot2 scale_fill_manual labs theme_minimal +#' @export +plot.gg_beta_uvarpro <- function(x, ...) { + if (nrow(x) == 0L) { + stop("plot.gg_beta_uvarpro: nothing to plot (gg_beta_uvarpro has 0 rows).", + call. = FALSE) + } + prov <- attr(x, "provenance") + cutoff <- if (!is.null(prov) && !is.null(prov$cutoff)) { + unname(prov$cutoff[[1]]) + } else { + mean(x$beta_mean) + } + n_var <- if (!is.null(prov)) prov$n_var %||% nrow(x) else nrow(x) + n_regions <- if (!is.null(prov)) prov$n_released_regions %||% NA_integer_ else NA_integer_ + + caption_txt <- sprintf( + "Mean |lasso beta| over %s released region(s), %s variable(s). Cutoff: %.4g.", + if (is.na(n_regions)) "NA" else format(n_regions), + format(n_var), cutoff + ) + + ggplot2::ggplot( + x, + ggplot2::aes( + x = .data[["variable"]], + y = .data[["beta_mean"]], + fill = factor(.data[["selected"]]) + ) + ) + + ggplot2::geom_col() + + ggplot2::coord_flip() + + ggplot2::scale_fill_manual( + values = c("TRUE" = "#4e8fcd", "FALSE" = "#888888"), + guide = "none" + ) + + ggplot2::geom_hline( + yintercept = cutoff, + linetype = "dashed", + color = "#e74c3c", + linewidth = 0.7 + ) + + ggplot2::labs( + x = NULL, + y = "Mean |lasso beta| (unsupervised, per-region)", + caption = caption_txt + ) + + ggplot2::theme_minimal() +} diff --git a/R/plot.gg_sdependent.R b/R/plot.gg_sdependent.R new file mode 100644 index 00000000..88843311 --- /dev/null +++ b/R/plot.gg_sdependent.R @@ -0,0 +1,76 @@ +##============================================================================= +#' Plot a `gg_sdependent` object +#' +#' Ranked lollipop of the per-variable `sdependent()` signal score, sorted +#' descending so the strongest signal lands at the top. Points are coloured +#' blue for variables flagged as signal (`signal == TRUE`) and grey otherwise. +#' +#' @section Reading the chart: +#' Each lollipop is a variable's signal score from [varPro::sdependent()], +#' computed on the unsupervised entropy-region lasso structure of a +#' [varPro::uvarpro()] fit. Blue variables cleared the detection threshold and +#' are reported in `sdependent()$signal.vars`; grey ones did not. Pair with +#' [gg_udependent()] (the dependency graph) to see *how* the signal variables +#' connect, and with [gg_beta_uvarpro()] for the lasso-importance ranking. +#' +#' @param x A `gg_sdependent` object from [gg_sdependent()]. +#' @param ... Not currently used. +#' +#' @return A `ggplot` object. +#' +#' @seealso [gg_sdependent()], [gg_udependent()], [gg_beta_uvarpro()]. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("varPro", quietly = TRUE)) { +#' set.seed(1) +#' o <- varPro::uvarpro(mtcars, ntree = 50) +#' plot(gg_sdependent(o)) +#' } +#' } +#' +#' @name plot.gg_sdependent +#' @importFrom ggplot2 ggplot aes geom_segment geom_point coord_flip +#' @importFrom ggplot2 scale_color_manual labs theme_minimal +#' @export +plot.gg_sdependent <- function(x, ...) { + if (nrow(x) == 0L) { + stop("plot.gg_sdependent: nothing to plot (gg_sdependent has 0 rows).", + call. = FALSE) + } + prov <- attr(x, "provenance") + thr <- if (!is.null(prov)) prov$threshold %||% NA_real_ else NA_real_ + n_signal <- if (!is.null(prov)) prov$n_signal %||% sum(x$signal) else sum(x$signal) + + caption_txt <- sprintf( + "varPro::sdependent() signal score. %d of %d flagged as signal (threshold %.3g).", + n_signal, nrow(x), thr + ) + + ggplot2::ggplot( + x, + ggplot2::aes( + x = .data[["variable"]], + y = .data[["imp_score"]], + color = factor(.data[["signal"]]) + ) + ) + + ggplot2::geom_segment( + ggplot2::aes( + xend = .data[["variable"]], + yend = 0 + ) + ) + + ggplot2::geom_point(size = 3) + + ggplot2::coord_flip() + + ggplot2::scale_color_manual( + values = c("TRUE" = "#4e8fcd", "FALSE" = "#888888"), + guide = "none" + ) + + ggplot2::labs( + x = NULL, + y = "Signal score (sdependent)", + caption = caption_txt + ) + + ggplot2::theme_minimal() +} diff --git a/_pkgdown.yml b/_pkgdown.yml index c46cfbb5..2272e340 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -69,8 +69,12 @@ reference: - plot.gg_varpro - gg_beta_varpro - plot.gg_beta_varpro + - gg_beta_uvarpro + - plot.gg_beta_uvarpro - gg_udependent - plot.gg_udependent + - gg_sdependent + - plot.gg_sdependent - gg_ivarpro - plot.gg_ivarpro diff --git a/man/gg_beta_uvarpro.Rd b/man/gg_beta_uvarpro.Rd new file mode 100644 index 00000000..6d66a6bf --- /dev/null +++ b/man/gg_beta_uvarpro.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gg_beta_uvarpro.R +\name{gg_beta_uvarpro} +\alias{gg_beta_uvarpro} +\title{Per-variable lasso-beta importance from an unsupervised varPro fit} +\usage{ +gg_beta_uvarpro(object, ..., cutoff = NULL, beta_fit = NULL) +} +\arguments{ +\item{object}{A \code{uvarpro} object from \code{\link[varPro:uvarpro]{varPro::uvarpro()}}.} + +\item{...}{Forwarded to \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} when +\code{beta_fit = NULL} (e.g. \code{pre.filter}, \code{second.stage}, \code{use.cv}). +Ignored, with a warning, when \code{beta_fit} is supplied.} + +\item{cutoff}{Selection threshold on \code{beta_mean}. \code{NULL} (default) uses +\code{mean(beta_mean)}; a scalar sets it explicitly. Variables at or above the +cutoff are flagged \code{selected}.} + +\item{beta_fit}{Optional pre-computed \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} matrix +for \code{object}. When supplied, must be a numeric matrix with column names +(the variables); \code{...} is then ignored.} +} +\value{ +A \code{gg_beta_uvarpro} object (a \code{data.frame}), one row per variable, +most-important first, with columns: +\describe{ +\item{\code{variable}}{factor; levels reversed so the most-important +variable lands at the top after \code{coord_flip()} (the \code{gg_vimp} +convention).} +\item{\code{beta_mean}}{\verb{mean(|lasso beta|)} over the released regions +(\code{colMeans(beta, na.rm = TRUE)}).} +\item{\code{n_released}}{number of regions contributing a non-\code{NA} +coefficient for the variable.} +\item{\code{selected}}{logical; \code{beta_mean >= cutoff}.} +} +The \code{provenance} attribute records \code{source}, \code{family} (\code{"unsupv"}), +\code{cutoff}, \code{n_var}, \code{n_released_regions}, and \code{precomputed}. +} +\description{ +Tidy wrapper around \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} for a \code{uvarpro} object. +Where \code{\link[=gg_beta_varpro]{gg_beta_varpro()}} refines the \emph{supervised} release-rule contrast, +\code{gg_beta_uvarpro()} does the unsupervised analogue: \code{uvarpro()} builds +entropy regions with no response, and \code{get.beta.entropy()} fits a +cross-validated lasso within each region to ask how strongly every other +variable explains the released variable. Averaging the absolute lasso +coefficients per variable gives one number per variable: an unsupervised, +lasso-flavoured importance. +} +\details{ +\code{get.beta.entropy(o)} returns a (released-variable x variable) numeric +matrix of absolute lasso coefficients. The column mean (\code{na.rm = TRUE}) is +the per-variable importance reported here, matching the canonical +\code{sort(colMeans(beta, na.rm = TRUE), decreasing = TRUE)} idiom in the +\code{varPro::uvarpro()} help ("iowa housing - illustrates lasso importance"). + +Because \code{get.beta.entropy()} is expensive (a cross-validated \code{glmnet} per +region), the \code{beta_fit} argument accepts a pre-computed matrix so you can +iterate on the cutoff without re-fitting. The pairing mirrors the +\code{beta_fit} argument of \code{\link[=gg_beta_varpro]{gg_beta_varpro()}}. +} +\examples{ +\donttest{ +if (requireNamespace("varPro", quietly = TRUE)) { + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 50) + gg <- gg_beta_uvarpro(o) + plot(gg) +} +} + +} +\seealso{ +\code{\link[=gg_beta_varpro]{gg_beta_varpro()}} (supervised analogue), \code{\link[=gg_udependent]{gg_udependent()}}, +\code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}}, \code{\link[varPro:uvarpro]{varPro::uvarpro()}}. +} diff --git a/man/gg_sdependent.Rd b/man/gg_sdependent.Rd new file mode 100644 index 00000000..7b88468a --- /dev/null +++ b/man/gg_sdependent.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gg_sdependent.R +\name{gg_sdependent} +\alias{gg_sdependent} +\title{Signal-variable detection from an unsupervised varPro fit} +\usage{ +gg_sdependent( + object, + ..., + threshold = 0.25, + q.signal = 0.75, + directed = TRUE, + min.degree = NULL, + beta_fit = NULL +) +} +\arguments{ +\item{object}{A \code{uvarpro} object from \code{\link[varPro:uvarpro]{varPro::uvarpro()}}.} + +\item{...}{Forwarded to \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} when \code{beta_fit = NULL}; +ignored, with a warning, when \code{beta_fit} is supplied.} + +\item{threshold, q.signal, directed, min.degree}{Passed to +\code{\link[varPro:sdependent]{varPro::sdependent()}} (defaults match \code{\link[=gg_udependent]{gg_udependent()}}).} + +\item{beta_fit}{Optional precomputed \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} matrix.} +} +\value{ +A \code{gg_sdependent} object (a \code{data.frame}), one row per candidate +variable, most-signal first, with columns: +\describe{ +\item{\code{variable}}{factor; levels reversed so the top variable lands at +the top after \code{coord_flip()}.} +\item{\code{imp_score}}{\code{sdependent()} per-variable signal score.} +\item{\code{degree}}{node degree in the dependency graph.} +\item{\code{signal}}{logical; variable is in \code{sdependent()$signal.vars}.} +} +The \code{provenance} attribute records \code{source}, \code{family} (\code{"unsupv"}), +\code{threshold}, \code{q.signal}, \code{directed}, \code{n_signal}, and \code{n_var}. +} +\description{ +Tidy wrapper around \code{\link[varPro:sdependent]{varPro::sdependent()}} for a \code{uvarpro} object. Where +\code{\link[=gg_udependent]{gg_udependent()}} draws the cross-variable dependency \emph{graph}, +\code{gg_sdependent()} surfaces \code{sdependent()}'s \emph{signal-variable detection}: a +ranked table of the per-variable signal score and graph degree, with the +variables flagged as "signal" (those whose dependency structure clears the +detection threshold). +} +\details{ +\code{sdependent()} runs on the \code{varPro::get.beta.entropy()} lasso-coefficient +matrix and returns, with \code{plot = FALSE}, a list of \code{imp.score} (per-variable +signal score), \code{degree} (node degree in the dependency graph), and +\code{signal.vars} (the detected signal set). This wrapper tidies that into one +row per candidate variable, ranked by \code{imp.score}. Because the entropy +matrix is the expensive part, \code{beta_fit} accepts a precomputed +\code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} matrix (shared with \code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}} and +\code{\link[=gg_udependent]{gg_udependent()}}). +} +\examples{ +\donttest{ +if (requireNamespace("varPro", quietly = TRUE)) { + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 50) + gg <- gg_sdependent(o) + plot(gg) +} +} + +} +\seealso{ +\code{\link[=gg_udependent]{gg_udependent()}} (the dependency graph), \code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}} +(lasso importance), \code{\link[varPro:sdependent]{varPro::sdependent()}}, \code{\link[varPro:uvarpro]{varPro::uvarpro()}}. +} diff --git a/man/plot.gg_beta_uvarpro.Rd b/man/plot.gg_beta_uvarpro.Rd new file mode 100644 index 00000000..0cc9abb9 --- /dev/null +++ b/man/plot.gg_beta_uvarpro.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.gg_beta_uvarpro.R +\name{plot.gg_beta_uvarpro} +\alias{plot.gg_beta_uvarpro} +\title{Plot a \code{gg_beta_uvarpro} object} +\usage{ +\method{plot}{gg_beta_uvarpro}(x, ...) +} +\arguments{ +\item{x}{A \code{gg_beta_uvarpro} object from \code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}}.} + +\item{...}{Not currently used.} +} +\value{ +A \code{ggplot} object. +} +\description{ +Horizontal bar chart of the mean absolute lasso coefficient +\eqn{\mathrm{mean}(|\hat{\beta}|)}{mean(|beta hat|)} per variable from an +unsupervised varPro fit, sorted descending so the eye lands on the top +variable first. Bars are filled blue above the selection cutoff, grey +otherwise, with a dashed red line at the cutoff. +} +\section{Reading the chart}{ + +Each bar is the average magnitude of a per-region lasso coefficient for +that variable, computed by \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} over the +unsupervised entropy regions of a \code{\link[varPro:uvarpro]{varPro::uvarpro()}} fit. There is no +response: the score measures how strongly a variable is reconstructed by +the others within released regions, i.e. an unsupervised +importance / redundancy signal rather than a predictive one. As with +\code{\link[=gg_beta_varpro]{gg_beta_varpro()}}, the numeric scale carries the predictors' units, so +bar lengths are comparable within a data set but not blindly across +variables on very different scales. +} + +\examples{ +\donttest{ +if (requireNamespace("varPro", quietly = TRUE)) { + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 50) + plot(gg_beta_uvarpro(o)) +} +} + +} +\seealso{ +\code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}}, \code{\link[=gg_beta_varpro]{gg_beta_varpro()}}, \code{\link[=gg_udependent]{gg_udependent()}}. +} diff --git a/man/plot.gg_sdependent.Rd b/man/plot.gg_sdependent.Rd new file mode 100644 index 00000000..3164d938 --- /dev/null +++ b/man/plot.gg_sdependent.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.gg_sdependent.R +\name{plot.gg_sdependent} +\alias{plot.gg_sdependent} +\title{Plot a \code{gg_sdependent} object} +\usage{ +\method{plot}{gg_sdependent}(x, ...) +} +\arguments{ +\item{x}{A \code{gg_sdependent} object from \code{\link[=gg_sdependent]{gg_sdependent()}}.} + +\item{...}{Not currently used.} +} +\value{ +A \code{ggplot} object. +} +\description{ +Ranked lollipop of the per-variable \code{sdependent()} signal score, sorted +descending so the strongest signal lands at the top. Points are coloured +blue for variables flagged as signal (\code{signal == TRUE}) and grey otherwise. +} +\section{Reading the chart}{ + +Each lollipop is a variable's signal score from \code{\link[varPro:sdependent]{varPro::sdependent()}}, +computed on the unsupervised entropy-region lasso structure of a +\code{\link[varPro:uvarpro]{varPro::uvarpro()}} fit. Blue variables cleared the detection threshold and +are reported in \code{sdependent()$signal.vars}; grey ones did not. Pair with +\code{\link[=gg_udependent]{gg_udependent()}} (the dependency graph) to see \emph{how} the signal variables +connect, and with \code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}} for the lasso-importance ranking. +} + +\examples{ +\donttest{ +if (requireNamespace("varPro", quietly = TRUE)) { + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 50) + plot(gg_sdependent(o)) +} +} + +} +\seealso{ +\code{\link[=gg_sdependent]{gg_sdependent()}}, \code{\link[=gg_udependent]{gg_udependent()}}, \code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}}. +} diff --git a/man/print.gg.Rd b/man/print.gg.Rd index 8ead2e09..7587f282 100644 --- a/man/print.gg.Rd +++ b/man/print.gg.Rd @@ -1,6 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/print_methods.R -\name{print.gg} +% Please edit documentation in R/gg_beta_uvarpro.R, R/gg_sdependent.R, +% R/print_methods.R +\name{print.gg_beta_uvarpro} +\alias{print.gg_beta_uvarpro} +\alias{print.gg_sdependent} \alias{print.gg} \alias{print.gg_error} \alias{print.gg_vimp} @@ -23,6 +26,10 @@ \alias{print.gg_ivarpro} \title{Print methods for gg_* data objects} \usage{ +\method{print}{gg_beta_uvarpro}(x, ...) + +\method{print}{gg_sdependent}(x, ...) + \method{print}{gg_error}(x, ...) \method{print}{gg_vimp}(x, ...) diff --git a/man/summary.gg.Rd b/man/summary.gg.Rd index dab619a8..d9bf59ea 100644 --- a/man/summary.gg.Rd +++ b/man/summary.gg.Rd @@ -1,6 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summary_methods.R -\name{summary.gg} +% Please edit documentation in R/gg_beta_uvarpro.R, R/gg_sdependent.R, +% R/summary_methods.R +\name{summary.gg_beta_uvarpro} +\alias{summary.gg_beta_uvarpro} +\alias{summary.gg_sdependent} \alias{summary.gg} \alias{print.summary.gg} \alias{summary.gg_error} @@ -23,6 +26,10 @@ \alias{summary.gg_auct} \title{Summary methods for gg_* data objects} \usage{ +\method{summary}{gg_beta_uvarpro}(object, ...) + +\method{summary}{gg_sdependent}(object, ...) + \method{print}{summary.gg}(x, ...) \method{summary}{gg_error}(object, ...) @@ -62,11 +69,11 @@ \method{summary}{gg_auct}(object, ...) } \arguments{ -\item{x}{A \code{summary.gg} object (for \code{print.summary.gg}).} +\item{object}{A \code{gg_*} data object.} \item{...}{Not currently used.} -\item{object}{A \code{gg_*} data object.} +\item{x}{A \code{summary.gg} object (for \code{print.summary.gg}).} } \value{ A \code{summary.gg} object: a list with \code{header} and diff --git a/tests/testthat/test_gg_beta_uvarpro.R b/tests/testthat/test_gg_beta_uvarpro.R new file mode 100644 index 00000000..bbdc30b9 --- /dev/null +++ b/tests/testthat/test_gg_beta_uvarpro.R @@ -0,0 +1,160 @@ +# Tests for gg_beta_uvarpro() — the unsupervised (uvarpro) lasso-beta wrapper. +# +# The deterministic logic is tested through the `beta_fit` (precomputed) +# path with a hand-built matrix + a stub uvarpro object, so these run on CRAN +# with no live varPro grow (which would reach the randomForestSRC rule-grow +# path). One live integration test is skip_on_cran(). + +# A get.beta.entropy()-shaped matrix: rows = released variables, cols = all +# variables, cells = |lasso beta|. colMeans(na.rm=TRUE) is the importance. +.mock_beta_entropy <- function() { + m <- matrix( + c(NA, 0.2, 0.8, 0.1, + 0.5, NA, 0.6, 0.3, + 0.4, 0.4, NA, 0.2), + nrow = 3, byrow = TRUE, + dimnames = list(c("a", "b", "c"), c("a", "b", "c", "d")) + ) + m +} +.stub_uvarpro <- function(ntree = 50L) { + structure(list(ntree = ntree), class = "uvarpro") +} + +test_that("gg_beta_uvarpro returns the expected tidy shape", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + expect_s3_class(out, "gg_beta_uvarpro") + expect_s3_class(out, "data.frame") + expect_named(out, c("variable", "beta_mean", "n_released", "selected")) + expect_s3_class(out$variable, "factor") + expect_type(out$beta_mean, "double") + expect_type(out$n_released, "integer") + expect_type(out$selected, "logical") + expect_equal(nrow(out), 4L) +}) + +test_that("beta_mean equals colMeans(|beta|) per variable", { + m <- .mock_beta_entropy() + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = m) + expected <- colMeans(m, na.rm = TRUE) + # align by variable name (out is sorted descending) + got <- stats::setNames(out$beta_mean, as.character(out$variable)) + expect_equal(got[names(expected)], expected[names(expected)]) +}) + +test_that("rows are most-important first; factor levels reversed for coord_flip", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + # row order strictly non-increasing in beta_mean + expect_false(is.unsorted(rev(out$beta_mean))) + # the most-important variable is the LAST factor level (top after flip) + top <- as.character(out$variable[which.max(out$beta_mean)]) + expect_equal(tail(levels(out$variable), 1), top) +}) + +test_that("n_released counts non-NA contributing regions", { + m <- .mock_beta_entropy() + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = m) + got <- stats::setNames(out$n_released, as.character(out$variable)) + expect_equal(unname(got["a"]), sum(!is.na(m[, "a"]))) + expect_equal(unname(got["b"]), sum(!is.na(m[, "b"]))) +}) + +test_that("cutoff = 0 selects everything, Inf selects nothing; default is mean", { + m <- .mock_beta_entropy() + expect_true(all(gg_beta_uvarpro(.stub_uvarpro(), beta_fit = m, cutoff = 0)$selected)) + expect_false(any(gg_beta_uvarpro(.stub_uvarpro(), beta_fit = m, cutoff = Inf)$selected)) + + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = m) + prov <- attr(out, "provenance") + expect_equal(unname(prov$cutoff[["unsupv"]]), mean(colMeans(m, na.rm = TRUE))) + expect_true(prov$cutoff_default) + expect_false(prov$precomputed == FALSE) # beta_fit supplied -> precomputed TRUE +}) + +test_that("provenance records the unsupervised source and dimensions", { + m <- .mock_beta_entropy() + prov <- attr(gg_beta_uvarpro(.stub_uvarpro(77L), beta_fit = m), "provenance") + expect_equal(prov$source, "varPro::get.beta.entropy") + expect_equal(prov$family, "unsupv") + expect_equal(prov$ntree, 77L) + expect_equal(prov$n_var, ncol(m)) + expect_equal(prov$n_released_regions, nrow(m)) +}) + +test_that("gg_beta_uvarpro warns when ... is supplied alongside beta_fit", { + expect_warning( + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy(), + pre.filter = FALSE), + "ignored because beta_fit is supplied" + ) + expect_s3_class(out, "gg_beta_uvarpro") +}) + +test_that("malformed beta_fit is rejected with a clear message", { + expect_error( + gg_beta_uvarpro(.stub_uvarpro(), beta_fit = list()), + "does not look like a varPro::get.beta.entropy" + ) + unnamed <- matrix(1:6, nrow = 2) + expect_error( + gg_beta_uvarpro(.stub_uvarpro(), beta_fit = unnamed), + "must have column names" + ) +}) + +test_that("empty beta matrix yields an empty gg_beta_uvarpro frame", { + empty <- matrix(numeric(0), nrow = 0, ncol = 0) + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = empty) + expect_s3_class(out, "gg_beta_uvarpro") + expect_equal(nrow(out), 0L) + expect_named(out, c("variable", "beta_mean", "n_released", "selected")) + expect_error(plot(out), "nothing to plot") +}) + +test_that("plot returns a ggplot", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + p <- plot(out) + expect_s3_class(p, "ggplot") +}) + +test_that("print returns invisibly and shows a header line", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + expect_output(print(out), "gg_beta_uvarpro") + expect_output(print(out), "variables selected") + expect_identical(withVisible(print(out))$visible, FALSE) +}) + +test_that("summary returns a summary.gg object with the top variables", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + s <- summary(out) + expect_s3_class(s, "summary.gg_beta_uvarpro") + expect_s3_class(s, "summary.gg") + expect_true(any(grepl("top variables", s$body))) +}) + +test_that("autoplot dispatches to plot and returns a ggplot", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + expect_s3_class(ggplot2::autoplot(out), "ggplot") +}) + +test_that("non-uvarpro input is rejected", { + expect_error( + gg_beta_uvarpro(structure(list(), class = "varpro")), + "expected a 'uvarpro' object" + ) +}) + +# ---- live integration (skipped on CRAN: uvarpro grows a forest) ----------- +test_that("gg_beta_uvarpro agrees with a live get.beta.entropy() fit", { + skip_on_cran() + skip_if_not_installed("varPro") + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 30) + b <- varPro::get.beta.entropy(o) + cached <- gg_beta_uvarpro(o, beta_fit = b) + expect_s3_class(cached, "gg_beta_uvarpro") + expect_equal(nrow(cached), ncol(b)) + got <- stats::setNames(cached$beta_mean, as.character(cached$variable)) + exp <- colMeans(b, na.rm = TRUE) + expect_equal(got[names(exp)], exp[names(exp)]) +}) diff --git a/tests/testthat/test_gg_sdependent.R b/tests/testthat/test_gg_sdependent.R new file mode 100644 index 00000000..495d7d23 --- /dev/null +++ b/tests/testthat/test_gg_sdependent.R @@ -0,0 +1,110 @@ +# Tests for gg_sdependent() — the sdependent() signal-detection wrapper. +# +# gg_sdependent() calls varPro::sdependent() on a get.beta.entropy() matrix. +# sdependent() does NOT grow a forest (no UBSAN path), so the matrix-driven +# tests run on CRAN with skip_if_not_installed("varPro"); only the live +# uvarpro() grow is skip_on_cran(). + +# A get.beta.entropy()-shaped matrix: square, named, NA diagonal. +.mock_entropy_sq <- function() { + v <- c("a", "b", "c", "d") + matrix(c(NA, .6, .1, .7, + .5, NA, .2, .8, + .1, .3, NA, .2, + .6, .7, .1, NA), + nrow = 4, byrow = TRUE, dimnames = list(v, v)) +} +.stub_uvarpro <- function(ntree = 50L) { + structure(list(ntree = ntree), class = "uvarpro") +} + +test_that("gg_sdependent returns the expected tidy shape", { + skip_if_not_installed("varPro") + out <- gg_sdependent(.stub_uvarpro(), beta_fit = .mock_entropy_sq()) + expect_s3_class(out, "gg_sdependent") + expect_named(out, c("variable", "imp_score", "degree", "signal")) + expect_s3_class(out$variable, "factor") + expect_type(out$imp_score, "double") + expect_type(out$signal, "logical") +}) + +test_that("rows are ranked by imp_score; top is last factor level", { + skip_if_not_installed("varPro") + out <- gg_sdependent(.stub_uvarpro(), beta_fit = .mock_entropy_sq()) + expect_false(is.unsorted(rev(out$imp_score))) + top <- as.character(out$variable[which.max(out$imp_score)]) + expect_equal(tail(levels(out$variable), 1), top) +}) + +test_that("signal flag matches sdependent()$signal.vars", { + skip_if_not_installed("varPro") + m <- .mock_entropy_sq() + s <- varPro::sdependent(m, plot = FALSE) + out <- gg_sdependent(.stub_uvarpro(), beta_fit = m) + flagged <- as.character(out$variable[out$signal]) + expect_setequal(flagged, s$signal.vars) +}) + +test_that("provenance records the sdependent source and parameters", { + skip_if_not_installed("varPro") + prov <- attr(gg_sdependent(.stub_uvarpro(60L), beta_fit = .mock_entropy_sq(), + threshold = 0.3), + "provenance") + expect_equal(prov$source, "varPro::sdependent") + expect_equal(prov$family, "unsupv") + expect_equal(prov$ntree, 60L) + expect_equal(prov$threshold, 0.3) + expect_true(prov$precomputed) +}) + +test_that("... is ignored (with a warning) when beta_fit is supplied", { + skip_if_not_installed("varPro") + expect_warning( + gg_sdependent(.stub_uvarpro(), beta_fit = .mock_entropy_sq(), + pre.filter = FALSE), + "ignored because beta_fit is supplied" + ) +}) + +test_that("malformed beta_fit and non-uvarpro input are rejected", { + expect_error( + gg_sdependent(.stub_uvarpro(), beta_fit = list()), + "does not look like a varPro::get.beta.entropy" + ) + expect_error( + gg_sdependent(structure(list(), class = "varpro")), + "expected a 'uvarpro' object" + ) +}) + +test_that("empty beta matrix yields an empty gg_sdependent frame", { + out <- gg_sdependent(.stub_uvarpro(), beta_fit = matrix(numeric(0), 0, 0)) + expect_s3_class(out, "gg_sdependent") + expect_equal(nrow(out), 0L) + expect_named(out, c("variable", "imp_score", "degree", "signal")) + expect_error(plot(out), "nothing to plot") +}) + +test_that("print / summary / autoplot companions work", { + skip_if_not_installed("varPro") + out <- gg_sdependent(.stub_uvarpro(), beta_fit = .mock_entropy_sq()) + expect_output(print(out), "gg_sdependent") + expect_output(print(out), "flagged as signal") + s <- summary(out) + expect_s3_class(s, "summary.gg_sdependent") + expect_true(any(grepl("signal variables", s$body))) + expect_s3_class(ggplot2::autoplot(out), "ggplot") +}) + +# ---- live integration (skipped on CRAN: uvarpro grows a forest) ----------- +test_that("gg_sdependent agrees with a live uvarpro + sdependent fit", { + skip_on_cran() + skip_if_not_installed("varPro") + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 30) + out <- gg_sdependent(o) + expect_s3_class(out, "gg_sdependent") + b <- varPro::get.beta.entropy(o) + s <- varPro::sdependent(b, plot = FALSE) + expect_setequal(as.character(out$variable[out$signal]), s$signal.vars) +}) diff --git a/tests/testthat/test_gg_varpro_empty_importance.R b/tests/testthat/test_gg_varpro_empty_importance.R new file mode 100644 index 00000000..d11cf484 --- /dev/null +++ b/tests/testthat/test_gg_varpro_empty_importance.R @@ -0,0 +1,48 @@ +# Issue #118: gg_varpro() on some survival fits failed with the cryptic +# "arguments imply differing number of rows:
, 0" when varPro::importance() +# returns a degenerate importance table (0 rows, or p rows with no usable `z` +# column). The guard in .build_varpro_imp_dfs() must turn that into a clear, +# specific message — scoped to the degenerate case, NOT a blanket survival +# block (cf. the reverted #116). +# +# These exercise the internal directly with a constructed importance table, so +# they run on CRAN with no varPro grow. + +.dummy_tree <- function() matrix(0, 1, 1, dimnames = list("t1", "x")) + +test_that("0-row importance table fails with the #118 message, not a cryptic one", { + imp0 <- data.frame(mean = numeric(0), std = numeric(0), z = numeric(0)) + err <- tryCatch( + ggRandomForests:::.build_varpro_imp_dfs( + imp0, .dummy_tree(), "surv", 0.79, NULL, FALSE, TRUE, FALSE), + error = function(e) conditionMessage(e)) + expect_match(err, "no usable importance") + expect_match(err, "surv") + expect_false(grepl("differing number of rows", err)) +}) + +test_that("importance with p rows but no z column fails clearly (the real #118 shape)", { + # p named variables, columns present in working fits EXCEPT `z` + imp_noz <- data.frame(mean = c(1, 2, 3), std = c(.1, .2, .3), + row.names = c("age", "bili", "albumin")) + err <- tryCatch( + ggRandomForests:::.build_varpro_imp_dfs( + imp_noz, .dummy_tree(), "surv", 0.79, NULL, FALSE, TRUE, FALSE), + error = function(e) conditionMessage(e)) + expect_match(err, "no usable importance") + expect_false(grepl("differing number of rows", err)) +}) + +test_that("a well-formed importance table is NOT blocked (working path preserved)", { + # 3 variables with a real z column + a matching per-tree matrix + imp_ok <- data.frame(mean = c(1, 2, 3), std = c(1, 1, 1), z = c(0.5, 1.5, 2.5), + row.names = c("age", "bili", "albumin")) + tree <- matrix(rnorm(9), nrow = 3, + dimnames = list(c("t1", "t2", "t3"), + c("age", "bili", "albumin"))) + dfs <- ggRandomForests:::.build_varpro_imp_dfs( + imp_ok, tree, "surv", 0.79, NULL, FALSE, FALSE, FALSE) + expect_named(dfs, c("imp", "imp_tree", "stats", "conditional")) + expect_equal(nrow(dfs$imp), 3L) + expect_setequal(as.character(dfs$imp$variable), c("age", "bili", "albumin")) +}) diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 00000000..8f6f2bd7 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,7 @@ +/.quarto/ +**/*.quarto_ipynb + +# Quarto/Pandoc render outputs (regenerated on build; .Rbuildignore'd) +*_files/ +*_cache/ +*.html diff --git a/vignettes/ggRandomForests.qmd b/vignettes/ggRandomForests.qmd index cfac0415..b51121c9 100644 --- a/vignettes/ggRandomForests.qmd +++ b/vignettes/ggRandomForests.qmd @@ -12,7 +12,7 @@ editor: markdown: wrap: 80 vignette: > - %\VignetteIndexEntry{Vignette's Title} + %\VignetteIndexEntry{Exploring Random Forests with ggRandomForests} %\VignetteEngine{quarto::html} %\VignetteEncoding{UTF-8} ---