From 051fcf45d6a27de89eb96fb0dd55818a79ec3149 Mon Sep 17 00:00:00 2001 From: John Ehrlinger Date: Fri, 12 Jun 2026 09:55:14 -0400 Subject: [PATCH 1/3] feat(varpro): add gg_beta_uvarpro() for unsupervised lasso importance Scaffold the unsupervised analogue of gg_beta_varpro(): a tidy wrapper + plot method around varPro::get.beta.entropy() for uvarpro() fits. From the (released-variable x variable) lasso-coefficient matrix it computes beta_mean = colMeans(|beta|) per variable (most-important first, factor levels reversed for the coord_flip top-at-top convention) and flags variables above a selection cutoff (default mean(beta_mean)). Optional beta_fit accepts a precomputed get.beta.entropy() matrix so the expensive CV-lasso runs once. Follows the get.beta.entropy + sdependent "lasso importance" workflow from the varPro::uvarpro() help (iowa-housing example), per Lu/Ishwaran. Tests: deterministic logic via the precomputed beta_fit path with a mock matrix (CRAN-safe, no live grow) + one skip_on_cran() live integration test. check_man() clean. Co-Authored-By: Claude Opus 4.8 --- NAMESPACE | 4 + NEWS.md | 11 ++ R/gg_beta_uvarpro.R | 176 ++++++++++++++++++++++++++ R/plot.gg_beta_uvarpro.R | 87 +++++++++++++ man/gg_beta_uvarpro.Rd | 76 +++++++++++ man/plot.gg_beta_uvarpro.Rd | 49 +++++++ tests/testthat/test_gg_beta_uvarpro.R | 140 ++++++++++++++++++++ 7 files changed, 543 insertions(+) create mode 100644 R/gg_beta_uvarpro.R create mode 100644 R/plot.gg_beta_uvarpro.R create mode 100644 man/gg_beta_uvarpro.Rd create mode 100644 man/plot.gg_beta_uvarpro.Rd create mode 100644 tests/testthat/test_gg_beta_uvarpro.R diff --git a/NAMESPACE b/NAMESPACE index e69c1d08..0b0b469c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,8 @@ S3method(autoplot,gg_vimp) S3method(calc_roc,randomForest) S3method(calc_roc,rfsrc) S3method(gg_auct,rhf) +S3method(gg_beta_uvarpro,default) +S3method(gg_beta_uvarpro,uvarpro) S3method(gg_beta_varpro,varpro) S3method(gg_brier,rfsrc) S3method(gg_error,randomForest) @@ -41,6 +43,7 @@ S3method(gg_variable,rfsrc) S3method(gg_vimp,randomForest) S3method(gg_vimp,rfsrc) S3method(plot,gg_auct) +S3method(plot,gg_beta_uvarpro) S3method(plot,gg_beta_varpro) S3method(plot,gg_brier) S3method(plot,gg_error) @@ -101,6 +104,7 @@ S3method(summary,gg_vimp) export(calc_auc) export(calc_roc) export(gg_auct) +export(gg_beta_uvarpro) export(gg_beta_varpro) export(gg_brier) export(gg_error) diff --git a/NEWS.md b/NEWS.md index a3d1ae38..b21ada46 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,17 @@ Version: 3.1.0.9000 ggRandomForests v4.0.0 (development) ==================================== * Development version 3.1.0.9000, opened after the v3.1.0 CRAN release. +* `gg_beta_uvarpro()` / `plot.gg_beta_uvarpro()`: tidy wrapper and default + horizontal bar chart for `varPro::get.beta.entropy()` — the unsupervised + analogue of `gg_beta_varpro()`. From a `uvarpro()` fit it aggregates the + per-region lasso coefficients into `beta_mean = colMeans(|beta|)` (one row + per variable, most-important first) and flags variables above a selection + cutoff (default `mean(beta_mean)`). An optional `beta_fit` argument accepts + a precomputed `get.beta.entropy()` matrix so the expensive cross-validated + lasso runs once. Provenance records `source`, `family = "unsupv"`, + `cutoff`, `n_var`, and `n_released_regions`. Follows the `get.beta.entropy` + + `sdependent` "lasso importance" workflow from the `varPro::uvarpro()` + help (iowa-housing example). * `gg_auct()` / `plot.gg_auct()`: tidy wrapper and plot for time-varying AUC from `randomForestRHF::auct.rhf()` (RHF Phase 2). Returns a long frame `time / auc / se / lower / upper / marker` with an `iauc` diff --git a/R/gg_beta_uvarpro.R b/R/gg_beta_uvarpro.R new file mode 100644 index 00000000..9e448243 --- /dev/null +++ b/R/gg_beta_uvarpro.R @@ -0,0 +1,176 @@ +##============================================================================= +#' Per-variable lasso-beta importance from an unsupervised varPro fit +#' +#' Tidy wrapper around [varPro::get.beta.entropy()] for a `uvarpro` object. +#' Where [gg_beta_varpro()] refines the *supervised* release-rule contrast, +#' `gg_beta_uvarpro()` does the unsupervised analogue: `uvarpro()` builds +#' entropy regions with no response, and `get.beta.entropy()` fits a +#' cross-validated lasso within each region to ask how strongly every other +#' variable explains the released variable. Averaging the absolute lasso +#' coefficients per variable gives one number per variable: an unsupervised, +#' lasso-flavoured importance. +#' +#' @details +#' `get.beta.entropy(o)` returns a (released-variable x variable) numeric +#' matrix of absolute lasso coefficients. The column mean (`na.rm = TRUE`) is +#' the per-variable importance reported here, matching the canonical +#' `sort(colMeans(beta, na.rm = TRUE), decreasing = TRUE)` idiom in the +#' `varPro::uvarpro()` help ("iowa housing - illustrates lasso importance"). +#' +#' Because `get.beta.entropy()` is expensive (a cross-validated `glmnet` per +#' region), the `beta_fit` argument accepts a pre-computed matrix so you can +#' iterate on the cutoff without re-fitting. The pairing mirrors the +#' `beta_fit` argument of [gg_beta_varpro()]. +#' +#' @param object A `uvarpro` object from [varPro::uvarpro()]. +#' @param ... Forwarded to [varPro::get.beta.entropy()] when +#' `beta_fit = NULL` (e.g. `pre.filter`, `second.stage`, `use.cv`). +#' Ignored, with a warning, when `beta_fit` is supplied. +#' @param cutoff Selection threshold on `beta_mean`. `NULL` (default) uses +#' `mean(beta_mean)`; a scalar sets it explicitly. Variables at or above the +#' cutoff are flagged `selected`. +#' @param beta_fit Optional pre-computed [varPro::get.beta.entropy()] matrix +#' for `object`. When supplied, must be a numeric matrix with column names +#' (the variables); `...` is then ignored. +#' +#' @return A `gg_beta_uvarpro` object (a `data.frame`), one row per variable, +#' most-important first, with columns: +#' \describe{ +#' \item{`variable`}{factor; levels reversed so the most-important +#' variable lands at the top after `coord_flip()` (the `gg_vimp` +#' convention).} +#' \item{`beta_mean`}{`mean(|lasso beta|)` over the released regions +#' (`colMeans(beta, na.rm = TRUE)`).} +#' \item{`n_released`}{number of regions contributing a non-`NA` +#' coefficient for the variable.} +#' \item{`selected`}{logical; `beta_mean >= cutoff`.} +#' } +#' The `provenance` attribute records `source`, `family` (`"unsupv"`), +#' `cutoff`, `n_var`, `n_released_regions`, and `precomputed`. +#' +#' @seealso [gg_beta_varpro()] (supervised analogue), [gg_udependent()], +#' [varPro::get.beta.entropy()], [varPro::uvarpro()]. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("varPro", quietly = TRUE)) { +#' set.seed(1) +#' o <- varPro::uvarpro(mtcars, ntree = 50) +#' gg <- gg_beta_uvarpro(o) +#' plot(gg) +#' } +#' } +#' +#' @export +gg_beta_uvarpro <- function(object, ..., cutoff = NULL, beta_fit = NULL) { + UseMethod("gg_beta_uvarpro", object) +} + +#' @export +gg_beta_uvarpro.default <- function(object, ..., cutoff = NULL, + beta_fit = NULL) { + stop("gg_beta_uvarpro: expected a 'uvarpro' object from varPro::uvarpro(); ", + "got an object of class ", paste(class(object), collapse = "/"), ".", + call. = FALSE) +} + +#' @export +gg_beta_uvarpro.uvarpro <- function(object, ..., cutoff = NULL, + beta_fit = NULL) { + if (!inherits(object, "uvarpro")) { + stop("gg_beta_uvarpro: expected a 'uvarpro' object from varPro::uvarpro().", + call. = FALSE) + } + + # Resolve the beta matrix (cache path) + if (is.null(beta_fit)) { + b <- varPro::get.beta.entropy(object, ...) + } else { + .validate_beta_uvarpro(beta_fit) + if (length(list(...)) > 0L) { + warning("gg_beta_uvarpro: arguments in '...' ignored because beta_fit is supplied.", + call. = FALSE) + } + b <- beta_fit + } + + # Empty fast-path: no regions / no variables survived + if (is.null(b) || !is.matrix(b) || nrow(b) == 0L || ncol(b) == 0L) { + return(.gg_beta_uvarpro_empty(object, beta_fit, cutoff)) + } + + beta_mean_v <- colMeans(b, na.rm = TRUE) + n_released_v <- colSums(!is.na(b)) + + # Most-important first; reverse the factor levels so coord_flip() puts the + # top variable at the top (matches gg_vimp / gg_beta_varpro). + ord_names <- names(sort(beta_mean_v, decreasing = TRUE)) + + resolved_cutoff <- if (is.null(cutoff)) { + mean(beta_mean_v, na.rm = TRUE) + } else { + as.numeric(cutoff) + } + + out <- data.frame( + variable = factor(ord_names, levels = rev(ord_names)), + beta_mean = unname(beta_mean_v[ord_names]), + n_released = as.integer(unname(n_released_v[ord_names])), + stringsAsFactors = FALSE + ) + out$selected <- out$beta_mean >= resolved_cutoff + rownames(out) <- NULL + + class(out) <- c("gg_beta_uvarpro", "data.frame") + attr(out, "provenance") <- list( + source = "varPro::get.beta.entropy", + family = "unsupv", + ntree = if (!is.null(object$ntree)) as.integer(object$ntree) else NA_integer_, + cutoff = stats::setNames(resolved_cutoff, "unsupv"), + cutoff_default = is.null(cutoff), + n_var = ncol(b), + n_released_regions = nrow(b), + precomputed = !is.null(beta_fit), + xvar.names = colnames(b) + ) + out +} + +#' @noRd +.validate_beta_uvarpro <- function(beta_fit) { + if (!is.matrix(beta_fit) || !is.numeric(beta_fit)) { + stop("gg_beta_uvarpro: beta_fit does not look like a ", + "varPro::get.beta.entropy() result. Expected a numeric matrix.", + call. = FALSE) + } + if (ncol(beta_fit) > 0L && is.null(colnames(beta_fit))) { + stop("gg_beta_uvarpro: beta_fit must have column names (the variables). ", + "varPro::get.beta.entropy() returns a named matrix.", + call. = FALSE) + } + invisible(NULL) +} + +#' @noRd +.gg_beta_uvarpro_empty <- function(object, beta_fit, cutoff) { + out <- data.frame( + variable = factor(character(0)), + beta_mean = numeric(0), + n_released = integer(0), + selected = logical(0), + stringsAsFactors = FALSE + ) + class(out) <- c("gg_beta_uvarpro", "data.frame") + attr(out, "provenance") <- list( + source = "varPro::get.beta.entropy", + family = "unsupv", + ntree = if (!is.null(object$ntree)) as.integer(object$ntree) else NA_integer_, + cutoff = stats::setNames(if (is.null(cutoff)) NA_real_ else as.numeric(cutoff), "unsupv"), + cutoff_default = is.null(cutoff), + n_var = 0L, + n_released_regions = 0L, + precomputed = !is.null(beta_fit), + xvar.names = character(0) + ) + out +} diff --git a/R/plot.gg_beta_uvarpro.R b/R/plot.gg_beta_uvarpro.R new file mode 100644 index 00000000..cd0caf9e --- /dev/null +++ b/R/plot.gg_beta_uvarpro.R @@ -0,0 +1,87 @@ +##============================================================================= +#' Plot a `gg_beta_uvarpro` object +#' +#' Horizontal bar chart of the mean absolute lasso coefficient +#' \eqn{\mathrm{mean}(|\hat{\beta}|)}{mean(|beta hat|)} per variable from an +#' unsupervised varPro fit, sorted descending so the eye lands on the top +#' variable first. Bars are filled blue above the selection cutoff, grey +#' otherwise, with a dashed red line at the cutoff. +#' +#' @section Reading the chart: +#' Each bar is the average magnitude of a per-region lasso coefficient for +#' that variable, computed by [varPro::get.beta.entropy()] over the +#' unsupervised entropy regions of a [varPro::uvarpro()] fit. There is no +#' response: the score measures how strongly a variable is reconstructed by +#' the others within released regions, i.e. an unsupervised +#' importance / redundancy signal rather than a predictive one. As with +#' [gg_beta_varpro()], the numeric scale carries the predictors' units, so +#' bar lengths are comparable within a data set but not blindly across +#' variables on very different scales. +#' +#' @param x A `gg_beta_uvarpro` object from [gg_beta_uvarpro()]. +#' @param ... Not currently used. +#' +#' @return A `ggplot` object. +#' +#' @seealso [gg_beta_uvarpro()], [gg_beta_varpro()], [gg_udependent()]. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("varPro", quietly = TRUE)) { +#' set.seed(1) +#' o <- varPro::uvarpro(mtcars, ntree = 50) +#' plot(gg_beta_uvarpro(o)) +#' } +#' } +#' +#' @name plot.gg_beta_uvarpro +#' @importFrom ggplot2 ggplot aes geom_col geom_hline coord_flip +#' @importFrom ggplot2 scale_fill_manual labs theme_minimal +#' @export +plot.gg_beta_uvarpro <- function(x, ...) { + if (nrow(x) == 0L) { + stop("plot.gg_beta_uvarpro: nothing to plot (gg_beta_uvarpro has 0 rows).", + call. = FALSE) + } + prov <- attr(x, "provenance") + cutoff <- if (!is.null(prov) && !is.null(prov$cutoff)) { + unname(prov$cutoff[[1]]) + } else { + mean(x$beta_mean) + } + n_var <- if (!is.null(prov)) prov$n_var %||% nrow(x) else nrow(x) + n_regions <- if (!is.null(prov)) prov$n_released_regions %||% NA_integer_ else NA_integer_ + + caption_txt <- sprintf( + "Mean |lasso beta| over %s released region(s), %s variable(s). Cutoff: %.4g.", + if (is.na(n_regions)) "NA" else format(n_regions), + format(n_var), cutoff + ) + + ggplot2::ggplot( + x, + ggplot2::aes( + x = .data[["variable"]], + y = .data[["beta_mean"]], + fill = factor(.data[["selected"]]) + ) + ) + + ggplot2::geom_col() + + ggplot2::coord_flip() + + ggplot2::scale_fill_manual( + values = c("TRUE" = "#4e8fcd", "FALSE" = "#888888"), + guide = "none" + ) + + ggplot2::geom_hline( + yintercept = cutoff, + linetype = "dashed", + color = "#e74c3c", + linewidth = 0.7 + ) + + ggplot2::labs( + x = NULL, + y = "Mean |lasso beta| (unsupervised, per-region)", + caption = caption_txt + ) + + ggplot2::theme_minimal() +} diff --git a/man/gg_beta_uvarpro.Rd b/man/gg_beta_uvarpro.Rd new file mode 100644 index 00000000..6d66a6bf --- /dev/null +++ b/man/gg_beta_uvarpro.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gg_beta_uvarpro.R +\name{gg_beta_uvarpro} +\alias{gg_beta_uvarpro} +\title{Per-variable lasso-beta importance from an unsupervised varPro fit} +\usage{ +gg_beta_uvarpro(object, ..., cutoff = NULL, beta_fit = NULL) +} +\arguments{ +\item{object}{A \code{uvarpro} object from \code{\link[varPro:uvarpro]{varPro::uvarpro()}}.} + +\item{...}{Forwarded to \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} when +\code{beta_fit = NULL} (e.g. \code{pre.filter}, \code{second.stage}, \code{use.cv}). +Ignored, with a warning, when \code{beta_fit} is supplied.} + +\item{cutoff}{Selection threshold on \code{beta_mean}. \code{NULL} (default) uses +\code{mean(beta_mean)}; a scalar sets it explicitly. Variables at or above the +cutoff are flagged \code{selected}.} + +\item{beta_fit}{Optional pre-computed \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} matrix +for \code{object}. When supplied, must be a numeric matrix with column names +(the variables); \code{...} is then ignored.} +} +\value{ +A \code{gg_beta_uvarpro} object (a \code{data.frame}), one row per variable, +most-important first, with columns: +\describe{ +\item{\code{variable}}{factor; levels reversed so the most-important +variable lands at the top after \code{coord_flip()} (the \code{gg_vimp} +convention).} +\item{\code{beta_mean}}{\verb{mean(|lasso beta|)} over the released regions +(\code{colMeans(beta, na.rm = TRUE)}).} +\item{\code{n_released}}{number of regions contributing a non-\code{NA} +coefficient for the variable.} +\item{\code{selected}}{logical; \code{beta_mean >= cutoff}.} +} +The \code{provenance} attribute records \code{source}, \code{family} (\code{"unsupv"}), +\code{cutoff}, \code{n_var}, \code{n_released_regions}, and \code{precomputed}. +} +\description{ +Tidy wrapper around \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} for a \code{uvarpro} object. +Where \code{\link[=gg_beta_varpro]{gg_beta_varpro()}} refines the \emph{supervised} release-rule contrast, +\code{gg_beta_uvarpro()} does the unsupervised analogue: \code{uvarpro()} builds +entropy regions with no response, and \code{get.beta.entropy()} fits a +cross-validated lasso within each region to ask how strongly every other +variable explains the released variable. Averaging the absolute lasso +coefficients per variable gives one number per variable: an unsupervised, +lasso-flavoured importance. +} +\details{ +\code{get.beta.entropy(o)} returns a (released-variable x variable) numeric +matrix of absolute lasso coefficients. The column mean (\code{na.rm = TRUE}) is +the per-variable importance reported here, matching the canonical +\code{sort(colMeans(beta, na.rm = TRUE), decreasing = TRUE)} idiom in the +\code{varPro::uvarpro()} help ("iowa housing - illustrates lasso importance"). + +Because \code{get.beta.entropy()} is expensive (a cross-validated \code{glmnet} per +region), the \code{beta_fit} argument accepts a pre-computed matrix so you can +iterate on the cutoff without re-fitting. The pairing mirrors the +\code{beta_fit} argument of \code{\link[=gg_beta_varpro]{gg_beta_varpro()}}. +} +\examples{ +\donttest{ +if (requireNamespace("varPro", quietly = TRUE)) { + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 50) + gg <- gg_beta_uvarpro(o) + plot(gg) +} +} + +} +\seealso{ +\code{\link[=gg_beta_varpro]{gg_beta_varpro()}} (supervised analogue), \code{\link[=gg_udependent]{gg_udependent()}}, +\code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}}, \code{\link[varPro:uvarpro]{varPro::uvarpro()}}. +} diff --git a/man/plot.gg_beta_uvarpro.Rd b/man/plot.gg_beta_uvarpro.Rd new file mode 100644 index 00000000..0cc9abb9 --- /dev/null +++ b/man/plot.gg_beta_uvarpro.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.gg_beta_uvarpro.R +\name{plot.gg_beta_uvarpro} +\alias{plot.gg_beta_uvarpro} +\title{Plot a \code{gg_beta_uvarpro} object} +\usage{ +\method{plot}{gg_beta_uvarpro}(x, ...) +} +\arguments{ +\item{x}{A \code{gg_beta_uvarpro} object from \code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}}.} + +\item{...}{Not currently used.} +} +\value{ +A \code{ggplot} object. +} +\description{ +Horizontal bar chart of the mean absolute lasso coefficient +\eqn{\mathrm{mean}(|\hat{\beta}|)}{mean(|beta hat|)} per variable from an +unsupervised varPro fit, sorted descending so the eye lands on the top +variable first. Bars are filled blue above the selection cutoff, grey +otherwise, with a dashed red line at the cutoff. +} +\section{Reading the chart}{ + +Each bar is the average magnitude of a per-region lasso coefficient for +that variable, computed by \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} over the +unsupervised entropy regions of a \code{\link[varPro:uvarpro]{varPro::uvarpro()}} fit. There is no +response: the score measures how strongly a variable is reconstructed by +the others within released regions, i.e. an unsupervised +importance / redundancy signal rather than a predictive one. As with +\code{\link[=gg_beta_varpro]{gg_beta_varpro()}}, the numeric scale carries the predictors' units, so +bar lengths are comparable within a data set but not blindly across +variables on very different scales. +} + +\examples{ +\donttest{ +if (requireNamespace("varPro", quietly = TRUE)) { + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 50) + plot(gg_beta_uvarpro(o)) +} +} + +} +\seealso{ +\code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}}, \code{\link[=gg_beta_varpro]{gg_beta_varpro()}}, \code{\link[=gg_udependent]{gg_udependent()}}. +} diff --git a/tests/testthat/test_gg_beta_uvarpro.R b/tests/testthat/test_gg_beta_uvarpro.R new file mode 100644 index 00000000..b5b7f914 --- /dev/null +++ b/tests/testthat/test_gg_beta_uvarpro.R @@ -0,0 +1,140 @@ +# Tests for gg_beta_uvarpro() — the unsupervised (uvarpro) lasso-beta wrapper. +# +# The deterministic logic is tested through the `beta_fit` (precomputed) +# path with a hand-built matrix + a stub uvarpro object, so these run on CRAN +# with no live varPro grow (which would reach the randomForestSRC rule-grow +# path). One live integration test is skip_on_cran(). + +# A get.beta.entropy()-shaped matrix: rows = released variables, cols = all +# variables, cells = |lasso beta|. colMeans(na.rm=TRUE) is the importance. +.mock_beta_entropy <- function() { + m <- matrix( + c(NA, 0.2, 0.8, 0.1, + 0.5, NA, 0.6, 0.3, + 0.4, 0.4, NA, 0.2), + nrow = 3, byrow = TRUE, + dimnames = list(c("a", "b", "c"), c("a", "b", "c", "d")) + ) + m +} +.stub_uvarpro <- function(ntree = 50L) { + structure(list(ntree = ntree), class = "uvarpro") +} + +test_that("gg_beta_uvarpro returns the expected tidy shape", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + expect_s3_class(out, "gg_beta_uvarpro") + expect_s3_class(out, "data.frame") + expect_named(out, c("variable", "beta_mean", "n_released", "selected")) + expect_s3_class(out$variable, "factor") + expect_type(out$beta_mean, "double") + expect_type(out$n_released, "integer") + expect_type(out$selected, "logical") + expect_equal(nrow(out), 4L) +}) + +test_that("beta_mean equals colMeans(|beta|) per variable", { + m <- .mock_beta_entropy() + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = m) + expected <- colMeans(m, na.rm = TRUE) + # align by variable name (out is sorted descending) + got <- stats::setNames(out$beta_mean, as.character(out$variable)) + expect_equal(got[names(expected)], expected[names(expected)]) +}) + +test_that("rows are most-important first; factor levels reversed for coord_flip", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + # row order strictly non-increasing in beta_mean + expect_false(is.unsorted(rev(out$beta_mean))) + # the most-important variable is the LAST factor level (top after flip) + top <- as.character(out$variable[which.max(out$beta_mean)]) + expect_equal(tail(levels(out$variable), 1), top) +}) + +test_that("n_released counts non-NA contributing regions", { + m <- .mock_beta_entropy() + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = m) + got <- stats::setNames(out$n_released, as.character(out$variable)) + expect_equal(unname(got["a"]), sum(!is.na(m[, "a"]))) + expect_equal(unname(got["b"]), sum(!is.na(m[, "b"]))) +}) + +test_that("cutoff = 0 selects everything, Inf selects nothing; default is mean", { + m <- .mock_beta_entropy() + expect_true(all(gg_beta_uvarpro(.stub_uvarpro(), beta_fit = m, cutoff = 0)$selected)) + expect_false(any(gg_beta_uvarpro(.stub_uvarpro(), beta_fit = m, cutoff = Inf)$selected)) + + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = m) + prov <- attr(out, "provenance") + expect_equal(unname(prov$cutoff[["unsupv"]]), mean(colMeans(m, na.rm = TRUE))) + expect_true(prov$cutoff_default) + expect_false(prov$precomputed == FALSE) # beta_fit supplied -> precomputed TRUE +}) + +test_that("provenance records the unsupervised source and dimensions", { + m <- .mock_beta_entropy() + prov <- attr(gg_beta_uvarpro(.stub_uvarpro(77L), beta_fit = m), "provenance") + expect_equal(prov$source, "varPro::get.beta.entropy") + expect_equal(prov$family, "unsupv") + expect_equal(prov$ntree, 77L) + expect_equal(prov$n_var, ncol(m)) + expect_equal(prov$n_released_regions, nrow(m)) +}) + +test_that("gg_beta_uvarpro warns when ... is supplied alongside beta_fit", { + expect_warning( + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy(), + pre.filter = FALSE), + "ignored because beta_fit is supplied" + ) + expect_s3_class(out, "gg_beta_uvarpro") +}) + +test_that("malformed beta_fit is rejected with a clear message", { + expect_error( + gg_beta_uvarpro(.stub_uvarpro(), beta_fit = list()), + "does not look like a varPro::get.beta.entropy" + ) + unnamed <- matrix(1:6, nrow = 2) + expect_error( + gg_beta_uvarpro(.stub_uvarpro(), beta_fit = unnamed), + "must have column names" + ) +}) + +test_that("empty beta matrix yields an empty gg_beta_uvarpro frame", { + empty <- matrix(numeric(0), nrow = 0, ncol = 0) + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = empty) + expect_s3_class(out, "gg_beta_uvarpro") + expect_equal(nrow(out), 0L) + expect_named(out, c("variable", "beta_mean", "n_released", "selected")) + expect_error(plot(out), "nothing to plot") +}) + +test_that("plot returns a ggplot", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + p <- plot(out) + expect_s3_class(p, "ggplot") +}) + +test_that("non-uvarpro input is rejected", { + expect_error( + gg_beta_uvarpro(structure(list(), class = "varpro")), + "expected a 'uvarpro' object" + ) +}) + +# ---- live integration (skipped on CRAN: uvarpro grows a forest) ----------- +test_that("gg_beta_uvarpro agrees with a live get.beta.entropy() fit", { + skip_on_cran() + skip_if_not_installed("varPro") + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 30) + b <- varPro::get.beta.entropy(o) + cached <- gg_beta_uvarpro(o, beta_fit = b) + expect_s3_class(cached, "gg_beta_uvarpro") + expect_equal(nrow(cached), ncol(b)) + got <- stats::setNames(cached$beta_mean, as.character(cached$variable)) + exp <- colMeans(b, na.rm = TRUE) + expect_equal(got[names(exp)], exp[names(exp)]) +}) From a6f0cb05489110d9a831a0114c8df36e3032699b Mon Sep 17 00:00:00 2001 From: John Ehrlinger Date: Fri, 12 Jun 2026 10:00:28 -0400 Subject: [PATCH 2/3] feat(varpro): print/summary/autoplot companions for gg_beta_uvarpro Add the standard gg_* S3 companions, matching gg_beta_varpro: - print.gg_beta_uvarpro: one-line header + selected/region summary - summary.gg_beta_uvarpro: summary.gg with top variables by mean |beta| - autoplot.gg_beta_uvarpro: dispatches to plot.gg_beta_uvarpro Methods live in gg_beta_uvarpro.R; @rdname routes them onto the shared print.gg / summary.gg pages. Tests + check_man clean. Co-Authored-By: Claude Opus 4.8 --- NAMESPACE | 3 ++ R/gg_beta_uvarpro.R | 43 +++++++++++++++++++++++++++ man/print.gg.Rd | 7 +++-- man/summary.gg.Rd | 11 ++++--- tests/testthat/test_gg_beta_uvarpro.R | 20 +++++++++++++ 5 files changed, 78 insertions(+), 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 0b0b469c..7e8bad20 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand S3method(autoplot,gg_auct) +S3method(autoplot,gg_beta_uvarpro) S3method(autoplot,gg_beta_varpro) S3method(autoplot,gg_brier) S3method(autoplot,gg_error) @@ -62,6 +63,7 @@ S3method(plot,gg_variable) S3method(plot,gg_varpro) S3method(plot,gg_vimp) S3method(print,gg_auct) +S3method(print,gg_beta_uvarpro) S3method(print,gg_beta_varpro) S3method(print,gg_brier) S3method(print,gg_error) @@ -84,6 +86,7 @@ S3method(print,summary.gg_beta_varpro) S3method(print,summary.gg_ivarpro) S3method(print,summary.gg_udependent) S3method(summary,gg_auct) +S3method(summary,gg_beta_uvarpro) S3method(summary,gg_beta_varpro) S3method(summary,gg_brier) S3method(summary,gg_error) diff --git a/R/gg_beta_uvarpro.R b/R/gg_beta_uvarpro.R index 9e448243..ea89f195 100644 --- a/R/gg_beta_uvarpro.R +++ b/R/gg_beta_uvarpro.R @@ -151,6 +151,49 @@ gg_beta_uvarpro.uvarpro <- function(object, ..., cutoff = NULL, invisible(NULL) } +#' @rdname print.gg +#' @export +print.gg_beta_uvarpro <- function(x, ...) { + prov <- attr(x, "provenance") + precomputed <- isTRUE(if (!is.null(prov)) prov$precomputed else FALSE) + n_regions <- if (!is.null(prov)) prov$n_released_regions %||% NA_integer_ else NA_integer_ + n_sel <- sum(x$selected, na.rm = TRUE) + cutoff <- if (!is.null(prov)) prov$cutoff %||% NA_real_ else NA_real_ + cutoff_val <- if (length(cutoff) >= 1L) cutoff[[1]] else NA_real_ + cutoff_default <- isTRUE(if (!is.null(prov)) prov$cutoff_default else FALSE) + cat(.gg_header(x, "gg_beta_uvarpro"), + sprintf(" | cutoff: %.4g%s", cutoff_val, + if (cutoff_default) " (default)" else ""), + sprintf(" | precomputed: %s", precomputed), + "\n", + sprintf(" %d of %d variables selected over %s released region(s)\n", + n_sel, nrow(x), + if (is.na(n_regions)) "NA" else format(n_regions)), + sep = "") + invisible(x) +} + +#' @rdname summary.gg +#' @export +summary.gg_beta_uvarpro <- function(object, ...) { + v <- sort(stats::setNames(object$beta_mean, as.character(object$variable)), + decreasing = TRUE) + top <- utils::head(v, 5L) + body <- c( + sprintf("variables: %d (selected: %d)", + nrow(object), sum(object$selected, na.rm = TRUE)), + "top variables by mean |lasso beta|:", + sprintf(" %-14s %.4g", names(top), unname(top)) + ) + .summary_skel(object, "gg_beta_uvarpro", body) +} + +#' @importFrom ggplot2 autoplot +#' @export +autoplot.gg_beta_uvarpro <- function(object, ...) { + plot.gg_beta_uvarpro(object, ...) +} + #' @noRd .gg_beta_uvarpro_empty <- function(object, beta_fit, cutoff) { out <- data.frame( diff --git a/man/print.gg.Rd b/man/print.gg.Rd index 8ead2e09..1af3adba 100644 --- a/man/print.gg.Rd +++ b/man/print.gg.Rd @@ -1,6 +1,7 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/print_methods.R -\name{print.gg} +% Please edit documentation in R/gg_beta_uvarpro.R, R/print_methods.R +\name{print.gg_beta_uvarpro} +\alias{print.gg_beta_uvarpro} \alias{print.gg} \alias{print.gg_error} \alias{print.gg_vimp} @@ -23,6 +24,8 @@ \alias{print.gg_ivarpro} \title{Print methods for gg_* data objects} \usage{ +\method{print}{gg_beta_uvarpro}(x, ...) + \method{print}{gg_error}(x, ...) \method{print}{gg_vimp}(x, ...) diff --git a/man/summary.gg.Rd b/man/summary.gg.Rd index dab619a8..d695478e 100644 --- a/man/summary.gg.Rd +++ b/man/summary.gg.Rd @@ -1,6 +1,7 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summary_methods.R -\name{summary.gg} +% Please edit documentation in R/gg_beta_uvarpro.R, R/summary_methods.R +\name{summary.gg_beta_uvarpro} +\alias{summary.gg_beta_uvarpro} \alias{summary.gg} \alias{print.summary.gg} \alias{summary.gg_error} @@ -23,6 +24,8 @@ \alias{summary.gg_auct} \title{Summary methods for gg_* data objects} \usage{ +\method{summary}{gg_beta_uvarpro}(object, ...) + \method{print}{summary.gg}(x, ...) \method{summary}{gg_error}(object, ...) @@ -62,11 +65,11 @@ \method{summary}{gg_auct}(object, ...) } \arguments{ -\item{x}{A \code{summary.gg} object (for \code{print.summary.gg}).} +\item{object}{A \code{gg_*} data object.} \item{...}{Not currently used.} -\item{object}{A \code{gg_*} data object.} +\item{x}{A \code{summary.gg} object (for \code{print.summary.gg}).} } \value{ A \code{summary.gg} object: a list with \code{header} and diff --git a/tests/testthat/test_gg_beta_uvarpro.R b/tests/testthat/test_gg_beta_uvarpro.R index b5b7f914..bbdc30b9 100644 --- a/tests/testthat/test_gg_beta_uvarpro.R +++ b/tests/testthat/test_gg_beta_uvarpro.R @@ -117,6 +117,26 @@ test_that("plot returns a ggplot", { expect_s3_class(p, "ggplot") }) +test_that("print returns invisibly and shows a header line", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + expect_output(print(out), "gg_beta_uvarpro") + expect_output(print(out), "variables selected") + expect_identical(withVisible(print(out))$visible, FALSE) +}) + +test_that("summary returns a summary.gg object with the top variables", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + s <- summary(out) + expect_s3_class(s, "summary.gg_beta_uvarpro") + expect_s3_class(s, "summary.gg") + expect_true(any(grepl("top variables", s$body))) +}) + +test_that("autoplot dispatches to plot and returns a ggplot", { + out <- gg_beta_uvarpro(.stub_uvarpro(), beta_fit = .mock_beta_entropy()) + expect_s3_class(ggplot2::autoplot(out), "ggplot") +}) + test_that("non-uvarpro input is rejected", { expect_error( gg_beta_uvarpro(structure(list(), class = "varpro")), From be6e150c0e5c907efa9f620243cc284be09e3192 Mon Sep 17 00:00:00 2001 From: John Ehrlinger Date: Fri, 12 Jun 2026 10:04:43 -0400 Subject: [PATCH 3/3] feat(varpro): add gg_sdependent() signal-detection view Wrap varPro::sdependent() (signal-variable detection on the get.beta.entropy() matrix) as a tidy gg_sdependent frame: one row per candidate variable with imp_score, graph degree, and a signal flag (membership in sdependent()$signal.vars), ranked by imp_score. Default plot is a ranked lollipop coloured by the signal flag. print/summary/ autoplot companions follow the gg_* conventions; beta_fit accepts a precomputed entropy matrix shared with gg_beta_uvarpro()/gg_udependent(). Complements gg_udependent() (the dependency graph) with the "which variables are signal" ranking. Matrix-driven tests run CRAN-safe (sdependent does not grow a forest); the live uvarpro() grow is skip_on_cran(). check_man() clean. Co-Authored-By: Claude Opus 4.8 --- NAMESPACE | 8 ++ NEWS.md | 8 ++ R/gg_sdependent.R | 183 ++++++++++++++++++++++++++++ R/plot.gg_sdependent.R | 76 ++++++++++++ man/gg_sdependent.Rd | 73 +++++++++++ man/plot.gg_sdependent.Rd | 44 +++++++ man/print.gg.Rd | 6 +- man/summary.gg.Rd | 6 +- tests/testthat/test_gg_sdependent.R | 110 +++++++++++++++++ 9 files changed, 512 insertions(+), 2 deletions(-) create mode 100644 R/gg_sdependent.R create mode 100644 R/plot.gg_sdependent.R create mode 100644 man/gg_sdependent.Rd create mode 100644 man/plot.gg_sdependent.Rd create mode 100644 tests/testthat/test_gg_sdependent.R diff --git a/NAMESPACE b/NAMESPACE index 7e8bad20..b4680c50 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,7 @@ S3method(autoplot,gg_partialpro) S3method(autoplot,gg_rfsrc) S3method(autoplot,gg_rhf) S3method(autoplot,gg_roc) +S3method(autoplot,gg_sdependent) S3method(autoplot,gg_survival) S3method(autoplot,gg_udependent) S3method(autoplot,gg_variable) @@ -37,6 +38,8 @@ S3method(gg_rhf,rhf) S3method(gg_roc,default) S3method(gg_roc,randomForest) S3method(gg_roc,rfsrc) +S3method(gg_sdependent,default) +S3method(gg_sdependent,uvarpro) S3method(gg_survival,default) S3method(gg_survival,rfsrc) S3method(gg_variable,randomForest) @@ -57,6 +60,7 @@ S3method(plot,gg_partialpro) S3method(plot,gg_rfsrc) S3method(plot,gg_rhf) S3method(plot,gg_roc) +S3method(plot,gg_sdependent) S3method(plot,gg_survival) S3method(plot,gg_udependent) S3method(plot,gg_variable) @@ -76,6 +80,7 @@ S3method(print,gg_partialpro) S3method(print,gg_rfsrc) S3method(print,gg_rhf) S3method(print,gg_roc) +S3method(print,gg_sdependent) S3method(print,gg_survival) S3method(print,gg_udependent) S3method(print,gg_variable) @@ -99,6 +104,7 @@ S3method(summary,gg_partialpro) S3method(summary,gg_rfsrc) S3method(summary,gg_rhf) S3method(summary,gg_roc) +S3method(summary,gg_sdependent) S3method(summary,gg_survival) S3method(summary,gg_udependent) S3method(summary,gg_variable) @@ -120,6 +126,7 @@ export(gg_partialpro) export(gg_rfsrc) export(gg_rhf) export(gg_roc) +export(gg_sdependent) export(gg_survival) export(gg_udependent) export(gg_variable) @@ -146,6 +153,7 @@ importFrom(ggplot2,geom_jitter) importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) +importFrom(ggplot2,geom_segment) importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) importFrom(ggplot2,labs) diff --git a/NEWS.md b/NEWS.md index b21ada46..6ac9af79 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,14 @@ ggRandomForests v4.0.0 (development) `cutoff`, `n_var`, and `n_released_regions`. Follows the `get.beta.entropy` + `sdependent` "lasso importance" workflow from the `varPro::uvarpro()` help (iowa-housing example). +* `gg_sdependent()` / `plot.gg_sdependent()`: tidy wrapper and ranked + lollipop for `varPro::sdependent()`'s signal-variable detection. From a + `uvarpro()` fit (or a precomputed `beta_fit` entropy matrix) it returns one + row per candidate variable — `imp_score`, graph `degree`, and a `signal` + flag (membership in `sdependent()$signal.vars`) — ranked by `imp_score`. + Complements `gg_udependent()` (the dependency graph) with the "which + variables are signal" ranking. `print` / `summary` / `autoplot` companions + included. * `gg_auct()` / `plot.gg_auct()`: tidy wrapper and plot for time-varying AUC from `randomForestRHF::auct.rhf()` (RHF Phase 2). Returns a long frame `time / auc / se / lower / upper / marker` with an `iauc` diff --git a/R/gg_sdependent.R b/R/gg_sdependent.R new file mode 100644 index 00000000..dbc3cc9e --- /dev/null +++ b/R/gg_sdependent.R @@ -0,0 +1,183 @@ +##============================================================================= +#' Signal-variable detection from an unsupervised varPro fit +#' +#' Tidy wrapper around [varPro::sdependent()] for a `uvarpro` object. Where +#' [gg_udependent()] draws the cross-variable dependency *graph*, +#' `gg_sdependent()` surfaces `sdependent()`'s *signal-variable detection*: a +#' ranked table of the per-variable signal score and graph degree, with the +#' variables flagged as "signal" (those whose dependency structure clears the +#' detection threshold). +#' +#' @details +#' `sdependent()` runs on the `varPro::get.beta.entropy()` lasso-coefficient +#' matrix and returns, with `plot = FALSE`, a list of `imp.score` (per-variable +#' signal score), `degree` (node degree in the dependency graph), and +#' `signal.vars` (the detected signal set). This wrapper tidies that into one +#' row per candidate variable, ranked by `imp.score`. Because the entropy +#' matrix is the expensive part, `beta_fit` accepts a precomputed +#' [varPro::get.beta.entropy()] matrix (shared with [gg_beta_uvarpro()] and +#' [gg_udependent()]). +#' +#' @param object A `uvarpro` object from [varPro::uvarpro()]. +#' @param ... Forwarded to [varPro::get.beta.entropy()] when `beta_fit = NULL`; +#' ignored, with a warning, when `beta_fit` is supplied. +#' @param threshold,q.signal,directed,min.degree Passed to +#' [varPro::sdependent()] (defaults match [gg_udependent()]). +#' @param beta_fit Optional precomputed [varPro::get.beta.entropy()] matrix. +#' +#' @return A `gg_sdependent` object (a `data.frame`), one row per candidate +#' variable, most-signal first, with columns: +#' \describe{ +#' \item{`variable`}{factor; levels reversed so the top variable lands at +#' the top after `coord_flip()`.} +#' \item{`imp_score`}{`sdependent()` per-variable signal score.} +#' \item{`degree`}{node degree in the dependency graph.} +#' \item{`signal`}{logical; variable is in `sdependent()$signal.vars`.} +#' } +#' The `provenance` attribute records `source`, `family` (`"unsupv"`), +#' `threshold`, `q.signal`, `directed`, `n_signal`, and `n_var`. +#' +#' @seealso [gg_udependent()] (the dependency graph), [gg_beta_uvarpro()] +#' (lasso importance), [varPro::sdependent()], [varPro::uvarpro()]. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("varPro", quietly = TRUE)) { +#' set.seed(1) +#' o <- varPro::uvarpro(mtcars, ntree = 50) +#' gg <- gg_sdependent(o) +#' plot(gg) +#' } +#' } +#' +#' @export +gg_sdependent <- function(object, ..., threshold = 0.25, q.signal = 0.75, + directed = TRUE, min.degree = NULL, beta_fit = NULL) { + UseMethod("gg_sdependent", object) +} + +#' @export +gg_sdependent.default <- function(object, ...) { + stop("gg_sdependent: expected a 'uvarpro' object from varPro::uvarpro(); ", + "got an object of class ", paste(class(object), collapse = "/"), ".", + call. = FALSE) +} + +#' @export +gg_sdependent.uvarpro <- function(object, ..., threshold = 0.25, + q.signal = 0.75, directed = TRUE, + min.degree = NULL, beta_fit = NULL) { + if (!inherits(object, "uvarpro")) { + stop("gg_sdependent: expected a 'uvarpro' object from varPro::uvarpro().", + call. = FALSE) + } + + if (is.null(beta_fit)) { + imp_mat <- varPro::get.beta.entropy(object, ...) + } else { + .validate_beta_uvarpro(beta_fit) + if (length(list(...)) > 0L) { + warning("gg_sdependent: arguments in '...' ignored because beta_fit is supplied.", + call. = FALSE) + } + imp_mat <- beta_fit + } + + prov <- list( + source = "varPro::sdependent", + family = "unsupv", + ntree = if (!is.null(object$ntree)) as.integer(object$ntree) else NA_integer_, + threshold = threshold, + q.signal = q.signal, + directed = isTRUE(directed), + precomputed = !is.null(beta_fit) + ) + + if (is.null(imp_mat) || !is.matrix(imp_mat) || nrow(imp_mat) == 0L || + ncol(imp_mat) == 0L) { + return(.gg_sdependent_empty(prov)) + } + + s <- varPro::sdependent(imp_mat, threshold = threshold, q.signal = q.signal, + directed = directed, min.degree = min.degree, + plot = FALSE) + + # sdependent() may return a character message when no graph is found. + if (is.character(s) || is.null(s$imp.score) || length(s$imp.score) == 0L) { + return(.gg_sdependent_empty(prov)) + } + + imp_score <- s$imp.score + vars <- names(imp_score) + if (is.null(vars)) vars <- as.character(seq_along(imp_score)) + degree <- if (!is.null(s$degree)) s$degree[vars] else rep(NA_real_, length(vars)) + signal_set <- s$signal.vars %||% character(0) + + ord <- order(imp_score, decreasing = TRUE) + ord_vars <- vars[ord] + + out <- data.frame( + variable = factor(ord_vars, levels = rev(ord_vars)), + imp_score = unname(imp_score[ord]), + degree = unname(as.numeric(degree)[ord]), + signal = ord_vars %in% signal_set, + stringsAsFactors = FALSE + ) + rownames(out) <- NULL + + class(out) <- c("gg_sdependent", "data.frame") + prov$n_var <- nrow(out) + prov$n_signal <- sum(out$signal) + attr(out, "provenance") <- prov + out +} + +#' @noRd +.gg_sdependent_empty <- function(prov) { + out <- data.frame( + variable = factor(character(0)), + imp_score = numeric(0), + degree = numeric(0), + signal = logical(0), + stringsAsFactors = FALSE + ) + class(out) <- c("gg_sdependent", "data.frame") + prov$n_var <- 0L + prov$n_signal <- 0L + attr(out, "provenance") <- prov + out +} + +#' @rdname print.gg +#' @export +print.gg_sdependent <- function(x, ...) { + prov <- attr(x, "provenance") + n_signal <- if (!is.null(prov)) prov$n_signal %||% sum(x$signal) else sum(x$signal) + thr <- if (!is.null(prov)) prov$threshold %||% NA_real_ else NA_real_ + cat(.gg_header(x, "gg_sdependent"), + sprintf(" | threshold: %.3g", thr), + sprintf(" | precomputed: %s", + isTRUE(if (!is.null(prov)) prov$precomputed else FALSE)), + "\n", + sprintf(" %d of %d variables flagged as signal\n", n_signal, nrow(x)), + sep = "") + invisible(x) +} + +#' @rdname summary.gg +#' @export +summary.gg_sdependent <- function(object, ...) { + sig <- as.character(object$variable[object$signal]) + body <- c( + sprintf("variables: %d (signal: %d)", nrow(object), sum(object$signal)), + sprintf("signal variables: %s", + if (length(sig)) paste(sig, collapse = ", ") else "(none)") + ) + .summary_skel(object, "gg_sdependent", body) +} + +#' @importFrom ggplot2 autoplot +#' @export +autoplot.gg_sdependent <- function(object, ...) { + plot.gg_sdependent(object, ...) +} diff --git a/R/plot.gg_sdependent.R b/R/plot.gg_sdependent.R new file mode 100644 index 00000000..88843311 --- /dev/null +++ b/R/plot.gg_sdependent.R @@ -0,0 +1,76 @@ +##============================================================================= +#' Plot a `gg_sdependent` object +#' +#' Ranked lollipop of the per-variable `sdependent()` signal score, sorted +#' descending so the strongest signal lands at the top. Points are coloured +#' blue for variables flagged as signal (`signal == TRUE`) and grey otherwise. +#' +#' @section Reading the chart: +#' Each lollipop is a variable's signal score from [varPro::sdependent()], +#' computed on the unsupervised entropy-region lasso structure of a +#' [varPro::uvarpro()] fit. Blue variables cleared the detection threshold and +#' are reported in `sdependent()$signal.vars`; grey ones did not. Pair with +#' [gg_udependent()] (the dependency graph) to see *how* the signal variables +#' connect, and with [gg_beta_uvarpro()] for the lasso-importance ranking. +#' +#' @param x A `gg_sdependent` object from [gg_sdependent()]. +#' @param ... Not currently used. +#' +#' @return A `ggplot` object. +#' +#' @seealso [gg_sdependent()], [gg_udependent()], [gg_beta_uvarpro()]. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("varPro", quietly = TRUE)) { +#' set.seed(1) +#' o <- varPro::uvarpro(mtcars, ntree = 50) +#' plot(gg_sdependent(o)) +#' } +#' } +#' +#' @name plot.gg_sdependent +#' @importFrom ggplot2 ggplot aes geom_segment geom_point coord_flip +#' @importFrom ggplot2 scale_color_manual labs theme_minimal +#' @export +plot.gg_sdependent <- function(x, ...) { + if (nrow(x) == 0L) { + stop("plot.gg_sdependent: nothing to plot (gg_sdependent has 0 rows).", + call. = FALSE) + } + prov <- attr(x, "provenance") + thr <- if (!is.null(prov)) prov$threshold %||% NA_real_ else NA_real_ + n_signal <- if (!is.null(prov)) prov$n_signal %||% sum(x$signal) else sum(x$signal) + + caption_txt <- sprintf( + "varPro::sdependent() signal score. %d of %d flagged as signal (threshold %.3g).", + n_signal, nrow(x), thr + ) + + ggplot2::ggplot( + x, + ggplot2::aes( + x = .data[["variable"]], + y = .data[["imp_score"]], + color = factor(.data[["signal"]]) + ) + ) + + ggplot2::geom_segment( + ggplot2::aes( + xend = .data[["variable"]], + yend = 0 + ) + ) + + ggplot2::geom_point(size = 3) + + ggplot2::coord_flip() + + ggplot2::scale_color_manual( + values = c("TRUE" = "#4e8fcd", "FALSE" = "#888888"), + guide = "none" + ) + + ggplot2::labs( + x = NULL, + y = "Signal score (sdependent)", + caption = caption_txt + ) + + ggplot2::theme_minimal() +} diff --git a/man/gg_sdependent.Rd b/man/gg_sdependent.Rd new file mode 100644 index 00000000..7b88468a --- /dev/null +++ b/man/gg_sdependent.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gg_sdependent.R +\name{gg_sdependent} +\alias{gg_sdependent} +\title{Signal-variable detection from an unsupervised varPro fit} +\usage{ +gg_sdependent( + object, + ..., + threshold = 0.25, + q.signal = 0.75, + directed = TRUE, + min.degree = NULL, + beta_fit = NULL +) +} +\arguments{ +\item{object}{A \code{uvarpro} object from \code{\link[varPro:uvarpro]{varPro::uvarpro()}}.} + +\item{...}{Forwarded to \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} when \code{beta_fit = NULL}; +ignored, with a warning, when \code{beta_fit} is supplied.} + +\item{threshold, q.signal, directed, min.degree}{Passed to +\code{\link[varPro:sdependent]{varPro::sdependent()}} (defaults match \code{\link[=gg_udependent]{gg_udependent()}}).} + +\item{beta_fit}{Optional precomputed \code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} matrix.} +} +\value{ +A \code{gg_sdependent} object (a \code{data.frame}), one row per candidate +variable, most-signal first, with columns: +\describe{ +\item{\code{variable}}{factor; levels reversed so the top variable lands at +the top after \code{coord_flip()}.} +\item{\code{imp_score}}{\code{sdependent()} per-variable signal score.} +\item{\code{degree}}{node degree in the dependency graph.} +\item{\code{signal}}{logical; variable is in \code{sdependent()$signal.vars}.} +} +The \code{provenance} attribute records \code{source}, \code{family} (\code{"unsupv"}), +\code{threshold}, \code{q.signal}, \code{directed}, \code{n_signal}, and \code{n_var}. +} +\description{ +Tidy wrapper around \code{\link[varPro:sdependent]{varPro::sdependent()}} for a \code{uvarpro} object. Where +\code{\link[=gg_udependent]{gg_udependent()}} draws the cross-variable dependency \emph{graph}, +\code{gg_sdependent()} surfaces \code{sdependent()}'s \emph{signal-variable detection}: a +ranked table of the per-variable signal score and graph degree, with the +variables flagged as "signal" (those whose dependency structure clears the +detection threshold). +} +\details{ +\code{sdependent()} runs on the \code{varPro::get.beta.entropy()} lasso-coefficient +matrix and returns, with \code{plot = FALSE}, a list of \code{imp.score} (per-variable +signal score), \code{degree} (node degree in the dependency graph), and +\code{signal.vars} (the detected signal set). This wrapper tidies that into one +row per candidate variable, ranked by \code{imp.score}. Because the entropy +matrix is the expensive part, \code{beta_fit} accepts a precomputed +\code{\link[varPro:get.beta.entropy]{varPro::get.beta.entropy()}} matrix (shared with \code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}} and +\code{\link[=gg_udependent]{gg_udependent()}}). +} +\examples{ +\donttest{ +if (requireNamespace("varPro", quietly = TRUE)) { + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 50) + gg <- gg_sdependent(o) + plot(gg) +} +} + +} +\seealso{ +\code{\link[=gg_udependent]{gg_udependent()}} (the dependency graph), \code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}} +(lasso importance), \code{\link[varPro:sdependent]{varPro::sdependent()}}, \code{\link[varPro:uvarpro]{varPro::uvarpro()}}. +} diff --git a/man/plot.gg_sdependent.Rd b/man/plot.gg_sdependent.Rd new file mode 100644 index 00000000..3164d938 --- /dev/null +++ b/man/plot.gg_sdependent.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.gg_sdependent.R +\name{plot.gg_sdependent} +\alias{plot.gg_sdependent} +\title{Plot a \code{gg_sdependent} object} +\usage{ +\method{plot}{gg_sdependent}(x, ...) +} +\arguments{ +\item{x}{A \code{gg_sdependent} object from \code{\link[=gg_sdependent]{gg_sdependent()}}.} + +\item{...}{Not currently used.} +} +\value{ +A \code{ggplot} object. +} +\description{ +Ranked lollipop of the per-variable \code{sdependent()} signal score, sorted +descending so the strongest signal lands at the top. Points are coloured +blue for variables flagged as signal (\code{signal == TRUE}) and grey otherwise. +} +\section{Reading the chart}{ + +Each lollipop is a variable's signal score from \code{\link[varPro:sdependent]{varPro::sdependent()}}, +computed on the unsupervised entropy-region lasso structure of a +\code{\link[varPro:uvarpro]{varPro::uvarpro()}} fit. Blue variables cleared the detection threshold and +are reported in \code{sdependent()$signal.vars}; grey ones did not. Pair with +\code{\link[=gg_udependent]{gg_udependent()}} (the dependency graph) to see \emph{how} the signal variables +connect, and with \code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}} for the lasso-importance ranking. +} + +\examples{ +\donttest{ +if (requireNamespace("varPro", quietly = TRUE)) { + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 50) + plot(gg_sdependent(o)) +} +} + +} +\seealso{ +\code{\link[=gg_sdependent]{gg_sdependent()}}, \code{\link[=gg_udependent]{gg_udependent()}}, \code{\link[=gg_beta_uvarpro]{gg_beta_uvarpro()}}. +} diff --git a/man/print.gg.Rd b/man/print.gg.Rd index 1af3adba..7587f282 100644 --- a/man/print.gg.Rd +++ b/man/print.gg.Rd @@ -1,7 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gg_beta_uvarpro.R, R/print_methods.R +% Please edit documentation in R/gg_beta_uvarpro.R, R/gg_sdependent.R, +% R/print_methods.R \name{print.gg_beta_uvarpro} \alias{print.gg_beta_uvarpro} +\alias{print.gg_sdependent} \alias{print.gg} \alias{print.gg_error} \alias{print.gg_vimp} @@ -26,6 +28,8 @@ \usage{ \method{print}{gg_beta_uvarpro}(x, ...) +\method{print}{gg_sdependent}(x, ...) + \method{print}{gg_error}(x, ...) \method{print}{gg_vimp}(x, ...) diff --git a/man/summary.gg.Rd b/man/summary.gg.Rd index d695478e..d9bf59ea 100644 --- a/man/summary.gg.Rd +++ b/man/summary.gg.Rd @@ -1,7 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gg_beta_uvarpro.R, R/summary_methods.R +% Please edit documentation in R/gg_beta_uvarpro.R, R/gg_sdependent.R, +% R/summary_methods.R \name{summary.gg_beta_uvarpro} \alias{summary.gg_beta_uvarpro} +\alias{summary.gg_sdependent} \alias{summary.gg} \alias{print.summary.gg} \alias{summary.gg_error} @@ -26,6 +28,8 @@ \usage{ \method{summary}{gg_beta_uvarpro}(object, ...) +\method{summary}{gg_sdependent}(object, ...) + \method{print}{summary.gg}(x, ...) \method{summary}{gg_error}(object, ...) diff --git a/tests/testthat/test_gg_sdependent.R b/tests/testthat/test_gg_sdependent.R new file mode 100644 index 00000000..495d7d23 --- /dev/null +++ b/tests/testthat/test_gg_sdependent.R @@ -0,0 +1,110 @@ +# Tests for gg_sdependent() — the sdependent() signal-detection wrapper. +# +# gg_sdependent() calls varPro::sdependent() on a get.beta.entropy() matrix. +# sdependent() does NOT grow a forest (no UBSAN path), so the matrix-driven +# tests run on CRAN with skip_if_not_installed("varPro"); only the live +# uvarpro() grow is skip_on_cran(). + +# A get.beta.entropy()-shaped matrix: square, named, NA diagonal. +.mock_entropy_sq <- function() { + v <- c("a", "b", "c", "d") + matrix(c(NA, .6, .1, .7, + .5, NA, .2, .8, + .1, .3, NA, .2, + .6, .7, .1, NA), + nrow = 4, byrow = TRUE, dimnames = list(v, v)) +} +.stub_uvarpro <- function(ntree = 50L) { + structure(list(ntree = ntree), class = "uvarpro") +} + +test_that("gg_sdependent returns the expected tidy shape", { + skip_if_not_installed("varPro") + out <- gg_sdependent(.stub_uvarpro(), beta_fit = .mock_entropy_sq()) + expect_s3_class(out, "gg_sdependent") + expect_named(out, c("variable", "imp_score", "degree", "signal")) + expect_s3_class(out$variable, "factor") + expect_type(out$imp_score, "double") + expect_type(out$signal, "logical") +}) + +test_that("rows are ranked by imp_score; top is last factor level", { + skip_if_not_installed("varPro") + out <- gg_sdependent(.stub_uvarpro(), beta_fit = .mock_entropy_sq()) + expect_false(is.unsorted(rev(out$imp_score))) + top <- as.character(out$variable[which.max(out$imp_score)]) + expect_equal(tail(levels(out$variable), 1), top) +}) + +test_that("signal flag matches sdependent()$signal.vars", { + skip_if_not_installed("varPro") + m <- .mock_entropy_sq() + s <- varPro::sdependent(m, plot = FALSE) + out <- gg_sdependent(.stub_uvarpro(), beta_fit = m) + flagged <- as.character(out$variable[out$signal]) + expect_setequal(flagged, s$signal.vars) +}) + +test_that("provenance records the sdependent source and parameters", { + skip_if_not_installed("varPro") + prov <- attr(gg_sdependent(.stub_uvarpro(60L), beta_fit = .mock_entropy_sq(), + threshold = 0.3), + "provenance") + expect_equal(prov$source, "varPro::sdependent") + expect_equal(prov$family, "unsupv") + expect_equal(prov$ntree, 60L) + expect_equal(prov$threshold, 0.3) + expect_true(prov$precomputed) +}) + +test_that("... is ignored (with a warning) when beta_fit is supplied", { + skip_if_not_installed("varPro") + expect_warning( + gg_sdependent(.stub_uvarpro(), beta_fit = .mock_entropy_sq(), + pre.filter = FALSE), + "ignored because beta_fit is supplied" + ) +}) + +test_that("malformed beta_fit and non-uvarpro input are rejected", { + expect_error( + gg_sdependent(.stub_uvarpro(), beta_fit = list()), + "does not look like a varPro::get.beta.entropy" + ) + expect_error( + gg_sdependent(structure(list(), class = "varpro")), + "expected a 'uvarpro' object" + ) +}) + +test_that("empty beta matrix yields an empty gg_sdependent frame", { + out <- gg_sdependent(.stub_uvarpro(), beta_fit = matrix(numeric(0), 0, 0)) + expect_s3_class(out, "gg_sdependent") + expect_equal(nrow(out), 0L) + expect_named(out, c("variable", "imp_score", "degree", "signal")) + expect_error(plot(out), "nothing to plot") +}) + +test_that("print / summary / autoplot companions work", { + skip_if_not_installed("varPro") + out <- gg_sdependent(.stub_uvarpro(), beta_fit = .mock_entropy_sq()) + expect_output(print(out), "gg_sdependent") + expect_output(print(out), "flagged as signal") + s <- summary(out) + expect_s3_class(s, "summary.gg_sdependent") + expect_true(any(grepl("signal variables", s$body))) + expect_s3_class(ggplot2::autoplot(out), "ggplot") +}) + +# ---- live integration (skipped on CRAN: uvarpro grows a forest) ----------- +test_that("gg_sdependent agrees with a live uvarpro + sdependent fit", { + skip_on_cran() + skip_if_not_installed("varPro") + set.seed(1) + o <- varPro::uvarpro(mtcars, ntree = 30) + out <- gg_sdependent(o) + expect_s3_class(out, "gg_sdependent") + b <- varPro::get.beta.entropy(o) + s <- varPro::sdependent(b, plot = FALSE) + expect_setequal(as.character(out$variable[out$signal]), s$signal.vars) +})