From 1ef609162089418cfa45f14e6a3173caeed022ee Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 12 Dec 2024 14:49:59 -0800 Subject: [PATCH 01/14] [r] add tf-idf and log normalization functions --- r/NAMESPACE | 2 + r/NEWS.md | 1 + r/R/transforms.R | 54 +++++++++++++++++++++++ r/man/normalize_log.Rd | 21 +++++++++ r/man/normalize_tfidf.Rd | 21 +++++++++ r/pkgdown/_pkgdown.yml | 2 + r/tests/testthat/test-matrix_transforms.R | 46 +++++++++++++++++++ 7 files changed, 147 insertions(+) create mode 100644 r/man/normalize_log.Rd create mode 100644 r/man/normalize_tfidf.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 8625c143..622d3a88 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -65,6 +65,8 @@ export(min_by_row) export(min_scalar) export(multiply_cols) export(multiply_rows) +export(normalize_log) +export(normalize_tfidf) export(nucleosome_counts) export(open_fragments_10x) export(open_fragments_dir) diff --git a/r/NEWS.md b/r/NEWS.md index 9715c95a..890f2755 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -22,6 +22,7 @@ Contributions welcome :) - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices. If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) +- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..7d1c46f8 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -923,3 +923,57 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { vars_to_regress = vars_to_regress ) } + +################# +# Normalizations +################# + +#' Normalize a matrix using log normalization +#' @param mat (IterableMatrix) Matrix to normalize +#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization +#' @param add_one (logical) Add one to the matrix before log normalization +#' @returns log normalized matrix. +#' @export +normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE) { + assert_is(mat, "IterableMatrix") + assert_is_numeric(scale_factor) + assert_true(is.logical(add_one)) + assert_greater_than_zero(scale_factor) + mat <- mat * scale_factor + if (!add_one) mat <- mat - 1 + return(log1p(mat)) +} + + +#' Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +#' @param mat (IterableMatrix) to normalize +#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then +#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +#' Else, map each feature name to its mean value. +#' @returns tf-idf normalized matrix. +#' @export +normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(threads) + # If feature means are passed in, only need to calculate term frequency + if (is.null(feature_means)) { + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + feature_means <- mat_stats$row_stats["mean", ] + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + } else { + assert_is_numeric(feature_means) + if (!is.null(names(feature_means)) && !is.null(rownames(mat))) { + # Make sure every name in feature means exists in rownames(mat) + # In the case there is a length mismatch but the feature names all exist in feature_means + # will not error out + assert_true(all(rownames(mat) %in% names(feature_means))) + feature_means <- feature_means[rownames(mat)] + } else { + assert_len(feature_means, nrow(mat)) + } + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean",] * nrow(mat) + } + tf <- mat %>% multiply_cols(1 / read_depth) + idf <- 1 / feature_means + return(tf %>% multiply_rows(idf)) +} \ No newline at end of file diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd new file mode 100644 index 00000000..90a57f85 --- /dev/null +++ b/r/man/normalize_log.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\title{Normalize a matrix using log normalization} +\usage{ +normalize_log(mat, scale_factor = 10000, add_one = TRUE) +} +\arguments{ +\item{mat}{(IterableMatrix) Matrix to normalize} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization} + +\item{add_one}{(logical) Add one to the matrix before log normalization} +} +\value{ +log normalized matrix. +} +\description{ +Normalize a matrix using log normalization +} diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd new file mode 100644 index 00000000..bf6a34ef --- /dev/null +++ b/r/man/normalize_tfidf.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_tfidf} +\alias{normalize_tfidf} +\title{Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency} +\usage{ +normalize_tfidf(mat, feature_means = NULL, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) to normalize} + +\item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then +each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +Else, map each feature name to its mean value.} +} +\value{ +tf-idf normalized matrix. +} +\description{ +Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index ea73ec01..22237598 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -126,6 +126,8 @@ reference: - checksum - apply_by_row - regress_out + - normalize_log + - normalize_tfidf - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 385605e0..b1941f8b 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -346,3 +346,49 @@ test_that("linear regression works", { expect_equal(as(m1, "matrix"), ans) expect_equal(as(m1t, "matrix"), ans) }) + +test_that("tf-idf normalization works", { + m <- generate_sparse_matrix(5, 5) + rownames(m) <- paste0("row", seq_len(nrow(m))) + rev_rownames <- rev(rownames(m)) + # Create tf-idf normalization for dgCMatrix + res_dgc <- diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix") + + rownames(res_dgc) <- rownames(m) + m2 <- as(m, "IterableMatrix") + # Check that we can pass in row means as a (named) vector + row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",] + # Test that row means ordering does not matter as long as names exist + row_means_shuffled <- row_means[sample(1:length(row_means))] + # Test that row means can have an extra element as long as all rownames are in the vector + row_means_plus_one <- c(row_means, row6 = 1) + + + res <- normalize_tfidf(m2) + expect_equal(res %>% as("dgCMatrix"), res_dgc) + res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + expect_identical(res, res_with_row_means) + + res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) + expect_identical(res_with_row_means, res_with_shuffled_row_means, res) + + res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one) + expect_identical(res, res_with_row_means_with_extra_element) +}) + +test_that("normalize_log works", { + m <- generate_sparse_matrix(5, 5) + m2 <- as(m, "IterableMatrix") + # Test that default params yield the same as log1p on dgCMatrix + res_1 <- as(normalize_log(m2), "dgCMatrix") + expect_equal(res_1, log1p(m*1e4)) + + # Test that changing scale factor works + res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + expect_identical(res_2, log1p(m*1e5)) + # Test that removing the add_one works + # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. + res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") + res_3@x[res_3@x < -700] <- -Inf + expect_identical(as(res_3, "dgeMatrix"), log(m*1e4)) +}) \ No newline at end of file From 98675d0186f2fa7f016543a9f454c3578a5260ef Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 12 Dec 2024 15:13:11 -0800 Subject: [PATCH 02/14] [r] fix normalization tests --- r/tests/testthat/test-matrix_transforms.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index b1941f8b..06bc3337 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -381,14 +381,14 @@ test_that("normalize_log works", { m2 <- as(m, "IterableMatrix") # Test that default params yield the same as log1p on dgCMatrix res_1 <- as(normalize_log(m2), "dgCMatrix") - expect_equal(res_1, log1p(m*1e4)) + expect_equal(res_1, log1p(m*1e4), tolerance = 1e-6) # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") - expect_identical(res_2, log1p(m*1e5)) + expect_equal(res_2, log1p(m*1e5), tolerance = 1e-6) # Test that removing the add_one works # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") - res_3@x[res_3@x < -700] <- -Inf - expect_identical(as(res_3, "dgeMatrix"), log(m*1e4)) + res_3@x[res_3@x < -60] <- -Inf + expect_equal(as(res_3, "dgeMatrix"), log(m*1e4), tolerance = 1e-6) }) \ No newline at end of file From 2f83ae647174ea718f0c05505487ab2ebb55393c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 13 Dec 2024 17:09:56 -0800 Subject: [PATCH 03/14] [r] add in requested changes --- r/R/transforms.R | 38 ++++++++++++++--------- r/man/normalize_log.Rd | 16 +++++----- r/man/normalize_tfidf.Rd | 16 +++++++--- r/pkgdown/_pkgdown.yml | 5 +++ r/tests/testthat/test-matrix_transforms.R | 12 +++---- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 7d1c46f8..b2dda267 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,31 +928,38 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalize a matrix using log normalization -#' @param mat (IterableMatrix) Matrix to normalize -#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization -#' @param add_one (logical) Add one to the matrix before log normalization -#' @returns log normalized matrix. +#' Normalize a `(features x cells)` matrix using log normalization. +#' @param mat (IterableMatrix) Matrix to normalize. +#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization. +#' @param threads (integer) Number of threads to use.s +#' @returns log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} #' @export -normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE) { +normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE, threads = 1L) { assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) assert_true(is.logical(add_one)) assert_greater_than_zero(scale_factor) - mat <- mat * scale_factor - if (!add_one) mat <- mat - 1 - return(log1p(mat)) + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) + mat <- mat %>% multiply_cols(1 / read_depth) + return(log1p(mat * scale_factor)) } -#' Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency -#' @param mat (IterableMatrix) to normalize +#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency. #' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then #' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. #' Else, map each feature name to its mean value. -#' @returns tf-idf normalized matrix. +#' @returns tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @inheritParams normalize_log #' @export -normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { +normalize_tfidf <- function( + mat, feature_means = NULL, + scale_factor = 1e4, threads = 1L +) { assert_is(mat, "IterableMatrix") assert_is_wholenumber(threads) # If feature means are passed in, only need to calculate term frequency @@ -971,9 +978,10 @@ normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { } else { assert_len(feature_means, nrow(mat)) } - read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean",] * nrow(mat) + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) } tf <- mat %>% multiply_cols(1 / read_depth) idf <- 1 / feature_means - return(tf %>% multiply_rows(idf)) + tf_idf_mat <- tf %>% multiply_rows(idf) + return(log1p(tf_idf_mat * scale_factor)) } \ No newline at end of file diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd index 90a57f85..9de93071 100644 --- a/r/man/normalize_log.Rd +++ b/r/man/normalize_log.Rd @@ -2,20 +2,22 @@ % Please edit documentation in R/transforms.R \name{normalize_log} \alias{normalize_log} -\title{Normalize a matrix using log normalization} +\title{Normalize a \verb{(features x cells)} matrix using log normalization.} \usage{ -normalize_log(mat, scale_factor = 10000, add_one = TRUE) +normalize_log(mat, scale_factor = 10000, add_one = TRUE, threads = 1L) } \arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize} +\item{mat}{(IterableMatrix) Matrix to normalize.} -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization} +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} -\item{add_one}{(logical) Add one to the matrix before log normalization} +\item{threads}{(integer) Number of threads to use.s} } \value{ -log normalized matrix. +log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} } \description{ -Normalize a matrix using log normalization +Normalize a \verb{(features x cells)} matrix using log normalization. } diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd index bf6a34ef..8dc50b84 100644 --- a/r/man/normalize_tfidf.Rd +++ b/r/man/normalize_tfidf.Rd @@ -2,20 +2,26 @@ % Please edit documentation in R/transforms.R \name{normalize_tfidf} \alias{normalize_tfidf} -\title{Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency} +\title{Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency.} \usage{ -normalize_tfidf(mat, feature_means = NULL, threads = 1L) +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) } \arguments{ -\item{mat}{(IterableMatrix) to normalize} +\item{mat}{(IterableMatrix) Matrix to normalize.} \item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. Else, map each feature name to its mean value.} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} + +\item{threads}{(integer) Number of threads to use.s} } \value{ -tf-idf normalized matrix. +tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} } \description{ -Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency. } diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 22237598..ae2772eb 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -9,6 +9,11 @@ template: includes: in_header: | + + + + + after_body: | diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 06bc3337..c4c5edb5 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -352,7 +352,7 @@ test_that("tf-idf normalization works", { rownames(m) <- paste0("row", seq_len(nrow(m))) rev_rownames <- rev(rownames(m)) # Create tf-idf normalization for dgCMatrix - res_dgc <- diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix") + res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4) rownames(res_dgc) <- rownames(m) m2 <- as(m, "IterableMatrix") @@ -378,17 +378,13 @@ test_that("tf-idf normalization works", { test_that("normalize_log works", { m <- generate_sparse_matrix(5, 5) + res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix") m2 <- as(m, "IterableMatrix") # Test that default params yield the same as log1p on dgCMatrix res_1 <- as(normalize_log(m2), "dgCMatrix") - expect_equal(res_1, log1p(m*1e4), tolerance = 1e-6) + expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6) # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") - expect_equal(res_2, log1p(m*1e5), tolerance = 1e-6) - # Test that removing the add_one works - # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. - res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") - res_3@x[res_3@x < -60] <- -Inf - expect_equal(as(res_3, "dgeMatrix"), log(m*1e4), tolerance = 1e-6) + expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) }) \ No newline at end of file From 6381f74d02982c7972198aa32a042a99b0808069 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 13 Dec 2024 17:15:56 -0800 Subject: [PATCH 04/14] [r] removed unused variable --- r/R/transforms.R | 3 +-- r/man/normalize_log.Rd | 2 +- r/tests/testthat/test-matrix_transforms.R | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index b2dda267..8a2bd25b 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -936,10 +936,9 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: #' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} #' @export -normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE, threads = 1L) { +normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) - assert_true(is.logical(add_one)) assert_greater_than_zero(scale_factor) read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) mat <- mat %>% multiply_cols(1 / read_depth) diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd index 9de93071..97d8c92e 100644 --- a/r/man/normalize_log.Rd +++ b/r/man/normalize_log.Rd @@ -4,7 +4,7 @@ \alias{normalize_log} \title{Normalize a \verb{(features x cells)} matrix using log normalization.} \usage{ -normalize_log(mat, scale_factor = 10000, add_one = TRUE, threads = 1L) +normalize_log(mat, scale_factor = 10000, threads = 1L) } \arguments{ \item{mat}{(IterableMatrix) Matrix to normalize.} diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index c4c5edb5..67641e54 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -365,7 +365,7 @@ test_that("tf-idf normalization works", { res <- normalize_tfidf(m2) - expect_equal(res %>% as("dgCMatrix"), res_dgc) + expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) expect_identical(res, res_with_row_means) From 8e80dc50b52e15b4fa12adfcd62e785a8a7b75f1 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sat, 14 Dec 2024 19:22:16 -0800 Subject: [PATCH 05/14] [r] add feature selection methods --- r/NAMESPACE | 2 + r/NEWS.md | 1 + r/R/singlecell_utils.R | 113 +++++++++++++++++++++++ r/man/select_features_by_dispersion.Rd | 42 +++++++++ r/man/select_features_by_mean.Rd | 34 +++++++ r/man/select_features_by_variance.Rd | 41 ++++++++ r/pkgdown/_pkgdown.yml | 3 + r/tests/testthat/test-singlecell_utils.R | 21 ++++- 8 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 r/man/select_features_by_dispersion.Rd create mode 100644 r/man/select_features_by_mean.Rd create mode 100644 r/man/select_features_by_variance.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 622d3a88..c90e1a15 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -110,6 +110,8 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_by_mean) +export(select_features_by_variance) export(select_regions) export(set_trackplot_height) export(set_trackplot_label) diff --git a/r/NEWS.md b/r/NEWS.md index 890f2755..73ca847d 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -23,6 +23,7 @@ Contributions welcome :) If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) +- Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 0fa1711d..cd7bc554 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -6,6 +6,118 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. + +################# +# Feature selection +################# + +#' Get the most variable features within a matrix. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. +#' @param threads (integer) Number of threads to use. +#' @returns +#' Return a dataframe with the following columns, sorted descending by variance: +#' - `names`: Feature name. +#' - `score`: Variance of the feature. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' @details +#' Calculate using the following process: +#' 1. Perform an optional term frequency + log normalization, for each feature. +#' 2. Find `num_feats` features with the highest variance across clusters. +#' @export +select_features_by_variance <- function( + mat, num_feats = 25000, + normalize = normalize_log, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is(num_feats, "numeric") + num_feats <- min(max(num_feats, 0), nrow(mat)) + + if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + features_df <- tibble::tibble( + names = rownames(mat), + score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + +#' Get the features with the highest dispersion within a matrix. +#' @returns +#' Return a dataframe with the following columns, sorted descending by dispersion: +#' - `names`: Feature name. +#' - `score`: Variance of the feature. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' @inheritParams select_features_by_variance +#' @details +#' Calculate using the following process: +#' 1. Perform an optional term frequency + log normalization, for each feature. +#' 2. Find the dispersion (variance/mean) of each feature. +#' 3. Find `num_feats` features with the highest dispersion. +select_features_by_dispersion <- function( + mat, num_feats = 25000, + normalize = normalize_log, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is(num_feats, "numeric") + num_feats <- min(max(num_feats, 0), nrow(mat)) + + if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) + features_df <- tibble::tibble( + names = rownames(mat), + score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + +#' Get the top features from a matrix, based on the mean accessibility of each feature. +#' @param num_feats Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @inheritParams select_features_by_variance +#' @returns +#' Return a dataframe with the following columns, sorted descending by mean accessibility: +#' - `names`: Feature name. +#' - `score`: Binarize sum of each feature. +#' - `highly_variable`: Logical vector of whether the feature is highly accessible by mean accessibility. +#' @details +#' Calculate using the following process: +#' 1. Get the sum of each binarized feature. +#' 2. Find `num_feats` features with the highest accessibility. +#' @export +select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(num_feats) + assert_greater_than_zero(num_feats) + assert_is(num_feats, "numeric") + num_feats <- min(max(num_feats, 0), nrow(mat)) + # get the sum of each feature, binarized + # get the top features + features_df <- tibble::tibble( + names = rownames(mat), + score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + #' Test for marker features #' #' Given a features x cells matrix, perform one-vs-all differential @@ -70,6 +182,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } + #' Aggregate counts matrices by cell group or feature. #' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each diff --git a/r/man/select_features_by_dispersion.Rd b/r/man/select_features_by_dispersion.Rd new file mode 100644 index 00000000..5d0bc177 --- /dev/null +++ b/r/man/select_features_by_dispersion.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_dispersion} +\alias{select_features_by_dispersion} +\title{Get the features with the highest dispersion within a matrix.} +\usage{ +select_features_by_dispersion( + mat, + num_feats = 25000, + normalize = normalize_log, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by dispersion: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Variance of the feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} +} +\description{ +Get the features with the highest dispersion within a matrix. +} +\details{ +Calculate using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find the dispersion (variance/mean) of each feature. +\item Find \code{num_feats} features with the highest dispersion. +} +} diff --git a/r/man/select_features_by_mean.Rd b/r/man/select_features_by_mean.Rd new file mode 100644 index 00000000..c05b0acb --- /dev/null +++ b/r/man/select_features_by_mean.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_mean} +\alias{select_features_by_mean} +\title{Get the top features from a matrix, based on the mean accessibility of each feature.} +\usage{ +select_features_by_mean(mat, num_feats = 25000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by mean accessibility: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Binarize sum of each feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly accessible by mean accessibility. +} +} +\description{ +Get the top features from a matrix, based on the mean accessibility of each feature. +} +\details{ +Calculate using the following process: +\enumerate{ +\item Get the sum of each binarized feature. +\item Find \code{num_feats} features with the highest accessibility. +} +} diff --git a/r/man/select_features_by_variance.Rd b/r/man/select_features_by_variance.Rd new file mode 100644 index 00000000..b7cc375f --- /dev/null +++ b/r/man/select_features_by_variance.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_variance} +\alias{select_features_by_variance} +\title{Get the most variable features within a matrix.} +\usage{ +select_features_by_variance( + mat, + num_feats = 25000, + normalize = normalize_log, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by variance: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Variance of the feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} +} +\description{ +Get the most variable features within a matrix. +} +\details{ +Calculate using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find \code{num_feats} features with the highest variance across clusters. +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index ae2772eb..526018e4 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -133,6 +133,9 @@ reference: - regress_out - normalize_log - normalize_tfidf + - select_features_by_variance + - select_features_by_dispersion + - select_features_by_mean - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 8bae8966..8117ed02 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -13,6 +13,24 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +test_that("select_features works general case", { + m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") + for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { + res <- do.call(fn, list(m1, num_feats = 10)) + expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting + expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable + expect_setequal(res$names, rownames(m1)) + res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + expect_identical(res_more_feats_than_rows, res_feats_equal_rows) + if (fn != "select_features_by_mean") { + # Check that normalization actually does something + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) + } + } +}) + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -160,4 +178,5 @@ test_that("Pseudobulk aggregation works with multiple return types", { } } } -}) \ No newline at end of file +}) + From c50ead2da9d33b375f98c4599ea4152d55a9b8be Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 10 Jan 2025 02:48:40 -0800 Subject: [PATCH 06/14] [r] update select_features_by_dispersion() to reflect archr defaults --- r/R/singlecell_utils.R | 2 +- r/man/select_features_by_dispersion.Rd | 2 +- r/tests/testthat/test-singlecell_utils.R | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index cd7bc554..32e45934 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -64,7 +64,7 @@ select_features_by_variance <- function( #' 3. Find `num_feats` features with the highest dispersion. select_features_by_dispersion <- function( mat, num_feats = 25000, - normalize = normalize_log, + normalize = NULL, threads = 1L ) { assert_is(mat, "IterableMatrix") diff --git a/r/man/select_features_by_dispersion.Rd b/r/man/select_features_by_dispersion.Rd index 5d0bc177..2835c9a8 100644 --- a/r/man/select_features_by_dispersion.Rd +++ b/r/man/select_features_by_dispersion.Rd @@ -7,7 +7,7 @@ select_features_by_dispersion( mat, num_feats = 25000, - normalize = normalize_log, + normalize = NULL, threads = 1L ) } diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 8117ed02..111324b7 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -23,7 +23,7 @@ test_that("select_features works general case", { res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) expect_identical(res_more_feats_than_rows, res_feats_equal_rows) - if (fn != "select_features_by_mean") { + if (fn == "select_features_by_variance") { # Check that normalization actually does something res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) From 199ae82355991e882de343128abdf62b99bc504b Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 16:51:00 -0800 Subject: [PATCH 07/14] Update docs --- r/R/transforms.R | 28 ++++++++++++++----------- r/man/normalize.Rd | 45 ++++++++++++++++++++++++++++++++++++++++ r/man/normalize_log.Rd | 23 -------------------- r/man/normalize_tfidf.Rd | 27 ------------------------ r/pkgdown/_pkgdown.yml | 22 +++++++++++--------- 5 files changed, 73 insertions(+), 72 deletions(-) create mode 100644 r/man/normalize.Rd delete mode 100644 r/man/normalize_log.Rd delete mode 100644 r/man/normalize_tfidf.Rd diff --git a/r/R/transforms.R b/r/R/transforms.R index 8a2bd25b..e79c404d 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,13 +928,19 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalize a `(features x cells)` matrix using log normalization. -#' @param mat (IterableMatrix) Matrix to normalize. +#' Normalization recipes +#' +#' Apply standard normalizations to a `(features x cells)` counts matrix. +#' +#' @rdname normalize +#' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)` #' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization. -#' @param threads (integer) Number of threads to use.s -#' @returns log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -#' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +#' @param threads (integer) Number of threads to use. +#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +#' +#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +#' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is(mat, "IterableMatrix") @@ -946,14 +952,12 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { } -#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency. -#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then +#' @rdname normalize +#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then #' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. #' Else, map each feature name to its mean value. -#' @returns tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -#' the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} -#' @inheritParams normalize_log +#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()` #' @export normalize_tfidf <- function( mat, feature_means = NULL, diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd new file mode 100644 index 00000000..f4bde193 --- /dev/null +++ b/r/man/normalize.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\alias{normalize_tfidf} +\title{Normalization recipes} +\usage{ +normalize_log(mat, scale_factor = 10000, threads = 1L) + +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix to normalize. \verb{(features x cells)}} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} + +\item{threads}{(integer) Number of threads to use.} + +\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then +each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +Else, map each feature name to its mean value.} +} +\value{ +For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +\itemize{ +\item \code{normalize_log}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +} + +\itemize{ +\item \code{normalize_tfidf}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +} +} +\description{ +Apply standard normalizations to a \verb{(features x cells)} counts matrix. +} +\details{ +\itemize{ +\item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeLog} +} + +\itemize{ +\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 2019, used by default in \code{ArchR::addIterativeLSI()} and \code{Signac::RunTFIDF()} +} +} diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd deleted file mode 100644 index 97d8c92e..00000000 --- a/r/man/normalize_log.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transforms.R -\name{normalize_log} -\alias{normalize_log} -\title{Normalize a \verb{(features x cells)} matrix using log normalization.} -\usage{ -normalize_log(mat, scale_factor = 10000, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize.} - -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} - -\item{threads}{(integer) Number of threads to use.s} -} -\value{ -log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} -} -\description{ -Normalize a \verb{(features x cells)} matrix using log normalization. -} diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd deleted file mode 100644 index 8dc50b84..00000000 --- a/r/man/normalize_tfidf.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transforms.R -\name{normalize_tfidf} -\alias{normalize_tfidf} -\title{Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency.} -\usage{ -normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize.} - -\item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then -each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. -Else, map each feature name to its mean value.} - -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} - -\item{threads}{(integer) Number of threads to use.s} -} -\value{ -tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} -} -\description{ -Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency. -} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index ae2772eb..52d03343 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -131,20 +131,14 @@ reference: - checksum - apply_by_row - regress_out - - normalize_log - - normalize_tfidf - IterableMatrix-methods - pseudobulk_matrix -- title: "Reference Annotations" +- title: "Single-cell analysis helpers" +- subtitle: "Dimensionality reduction" - contents: - - human_gene_mapping - - match_gene_symbol - - read_gtf - - read_bed - - read_ucsc_chrom_sizes - -- title: "Clustering" + - normalize_log +- subtitle: "Clustering" - contents: - knn_hnsw - cluster_graph_leiden @@ -182,3 +176,11 @@ reference: - discrete_palette - collect_features - rotate_x_labels + +- title: "Reference Annotations" +- contents: + - human_gene_mapping + - match_gene_symbol + - read_gtf + - read_bed + - read_ucsc_chrom_sizes From 553f262a604e4f366089ff9741e612b8f47821fe Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 16:51:58 -0800 Subject: [PATCH 08/14] Update NEWS --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index 07890a0e..ac2109fb 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,6 +10,7 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) +- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) From 7511f0b7e32b7ee7a726ddd73912ac272f3e0d79 Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 17:00:53 -0800 Subject: [PATCH 09/14] Update docs --- r/R/transforms.R | 2 +- r/man/normalize.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index e79c404d..05759f9e 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,7 +928,7 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalization recipes +#' Normalization helper functions #' #' Apply standard normalizations to a `(features x cells)` counts matrix. #' diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index f4bde193..ac53f2c0 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -3,7 +3,7 @@ \name{normalize_log} \alias{normalize_log} \alias{normalize_tfidf} -\title{Normalization recipes} +\title{Normalization helper functions} \usage{ normalize_log(mat, scale_factor = 10000, threads = 1L) From 435724b4dbccdd9fefa89778a045f58b1b360bd7 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 16:46:14 -0800 Subject: [PATCH 10/14] [r] add partial args to normalizations --- r/R/transforms.R | 17 +++++++++++++++++ r/tests/testthat/test-matrix_transforms.R | 3 +++ 2 files changed, 20 insertions(+) diff --git a/r/R/transforms.R b/r/R/transforms.R index 05759f9e..b097e898 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -943,6 +943,14 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + if (rlang::is_missing(mat)) { + return( + purrr::partial( + normalize_log, + scale_factor = scale_factor, threads = threads + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) assert_greater_than_zero(scale_factor) @@ -963,6 +971,15 @@ normalize_tfidf <- function( mat, feature_means = NULL, scale_factor = 1e4, threads = 1L ) { + if (rlang::is_missing(mat)) { + return( + purrr::partial( + normalize_tfidf, + feature_means = feature_means, scale_factor = scale_factor, + threads = threads + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_wholenumber(threads) # If feature means are passed in, only need to calculate term frequency diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 67641e54..24cd9c23 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -367,6 +367,8 @@ test_that("tf-idf normalization works", { res <- normalize_tfidf(m2) expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2) + expect_equal(res_with_row_means, res_with_row_means_partial) expect_identical(res, res_with_row_means) res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) @@ -386,5 +388,6 @@ test_that("normalize_log works", { # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix") expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) }) \ No newline at end of file From 8dbe8e52be1d5e79aa895deeb095fd80314c980c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 19:18:45 -0800 Subject: [PATCH 11/14] [r] create mechanism for partial calls on explicit args --- r/R/transforms.R | 13 ++++++------- r/R/utils.R | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index b097e898..02ab51b8 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -943,11 +943,11 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + # browser() if (rlang::is_missing(mat)) { return( - purrr::partial( - normalize_log, - scale_factor = scale_factor, threads = threads + partial_explicit( + normalize_log, scale_factor = scale_factor, threads = threads ) ) } @@ -973,10 +973,9 @@ normalize_tfidf <- function( ) { if (rlang::is_missing(mat)) { return( - purrr::partial( - normalize_tfidf, - feature_means = feature_means, scale_factor = scale_factor, - threads = threads + partial_explicit( + normalize_tfidf, feature_means = feature_means, + scale_factor = scale_factor, threads = threads ) ) } diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..784f6106 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -56,4 +56,20 @@ log_progress <- function(msg, add_timestamp = TRUE){ } else { message(msg) } +} + +# Helper function to create partial explicit functions +# This builds upon purrr::partial by allowing for nested partial calls, where each partial call +# only does partial application of the arguments that were explicitly provided. +partial_explicit <- function(fn, ...) { + args <- rlang::enquos(...) + evaluated_args <- purrr::map(args, rlang::eval_tidy) + # Fetch the default arguments from the function definition + default_args <- formals(fn) + # Keep only explicitly provided arguments that were evaluated + # where the values are different from the default arguments + explicitly_passed_args <- evaluated_args[names(evaluated_args) %in% names(default_args) & + !purrr::map2_lgl(evaluated_args, default_args[names(evaluated_args)], identical)] + # Return a partially applied version of the function using evaluated arguments + return(purrr::partial(fn, !!!explicitly_passed_args)) } \ No newline at end of file From 067b5406f7f8fb26bfa98fd00470ef0fedf262fc Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 21:19:46 -0800 Subject: [PATCH 12/14] [r] add partial calls, update feature selection docs --- r/R/singlecell_utils.R | 49 ++++++++------- r/man/feature_selection.Rd | 78 ++++++++++++++++++++++++ r/man/select_features_by_dispersion.Rd | 42 ------------- r/man/select_features_by_mean.Rd | 34 ----------- r/man/select_features_by_variance.Rd | 41 ------------- r/pkgdown/_pkgdown.yml | 4 +- r/tests/testthat/test-singlecell_utils.R | 8 ++- 7 files changed, 112 insertions(+), 144 deletions(-) create mode 100644 r/man/feature_selection.Rd delete mode 100644 r/man/select_features_by_dispersion.Rd delete mode 100644 r/man/select_features_by_mean.Rd delete mode 100644 r/man/select_features_by_variance.Rd diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 9a8993f7..40babbe3 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -11,34 +11,42 @@ # Feature selection ################# -#' Get the most variable features within a matrix. +#' Feature selection functions +#' +#' Apply a feature selection method to a `(features x cells)` matrix. +#' @rdname feature_selection #' @param mat (IterableMatrix) dimensions features x cells #' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, #' all features will be returned. #' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. #' @param threads (integer) Number of threads to use. #' @returns -#' Return a dataframe with the following columns, sorted descending by variance: +#' Return a dataframe with the following columns, sorted descending by score: #' - `names`: Feature name. -#' - `score`: Variance of the feature. +#' - `score`: Scoring of the feature, depending on the method used. #' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' +#' Each different feature selection method will have a different scoring method: +#' - `select_features_by_variance`: Score representing variance of each feature. #' @details -#' Calculate using the following process: +#' `select_features_by_variance` Calculates the variance of each feature using the following process: #' 1. Perform an optional term frequency + log normalization, for each feature. -#' 2. Find `num_feats` features with the highest variance across clusters. +#' 2. Find `num_feats` features with the highest variance. #' @export select_features_by_variance <- function( mat, num_feats = 25000, normalize = normalize_log, threads = 1L ) { + if (rlang::is_missing(mat)) { + return(purrr::partial(select_features_by_variance, num_feats = num_feats, normalize = normalize, threads = threads)) + } assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- normalize(mat, threads = threads) features_df <- tibble::tibble( names = rownames(mat), @@ -50,15 +58,11 @@ select_features_by_variance <- function( } -#' Get the features with the highest dispersion within a matrix. +#' @rdname feature_selection #' @returns -#' Return a dataframe with the following columns, sorted descending by dispersion: -#' - `names`: Feature name. -#' - `score`: Variance of the feature. -#' - `highly_variable`: Logical vector of whether the feature is highly variable. -#' @inheritParams select_features_by_variance +#' - `select_features_by_dispersion`: Score representing the dispersion of each feature. #' @details -#' Calculate using the following process: +#' `select_features_by_dispersion` calculates the dispersion of each feature using the following process: #' 1. Perform an optional term frequency + log normalization, for each feature. #' 2. Find the dispersion (variance/mean) of each feature. #' 3. Find `num_feats` features with the highest dispersion. @@ -67,6 +71,9 @@ select_features_by_dispersion <- function( normalize = NULL, threads = 1L ) { + if (rlang::is_missing(mat)) { + return(partial_explicit(select_features_by_dispersion, num_feats = num_feats, normalize = normalize, threads = threads)) + } assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) @@ -86,21 +93,18 @@ select_features_by_dispersion <- function( } -#' Get the top features from a matrix, based on the mean accessibility of each feature. -#' @param num_feats Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, -#' all features will be returned. -#' @inheritParams select_features_by_variance +#' @rdname feature_selection #' @returns -#' Return a dataframe with the following columns, sorted descending by mean accessibility: -#' - `names`: Feature name. -#' - `score`: Binarize sum of each feature. -#' - `highly_variable`: Logical vector of whether the feature is highly accessible by mean accessibility. +#' - `select_features_by_mean`: Score representing the mean accessibility of each feature. #' @details -#' Calculate using the following process: +#' `select_features_by_mean` calculates the mean accessibility of each feature using the following process: #' 1. Get the sum of each binarized feature. #' 2. Find `num_feats` features with the highest accessibility. #' @export select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { + if (rlang::is_missing(mat)) { + return(partial_explicit(select_features_by_mean, num_feats = num_feats, threads = threads)) + } assert_is(mat, "IterableMatrix") assert_is_wholenumber(num_feats) assert_greater_than_zero(num_feats) @@ -182,7 +186,6 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } - #' Aggregate counts matrices by cell group or feature. #' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd new file mode 100644 index 00000000..c57a8e40 --- /dev/null +++ b/r/man/feature_selection.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_variance} +\alias{select_features_by_variance} +\alias{select_features_by_dispersion} +\alias{select_features_by_mean} +\title{Feature selection functions} +\usage{ +select_features_by_variance( + mat, + num_feats = 25000, + normalize = normalize_log, + threads = 1L +) + +select_features_by_dispersion( + mat, + num_feats = 25000, + normalize = NULL, + threads = 1L +) + +select_features_by_mean(mat, num_feats = 25000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by score: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Scoring of the feature, depending on the method used. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} + +Each different feature selection method will have a different scoring method: +\itemize{ +\item \code{select_features_by_variance}: Score representing variance of each feature. +} + +\itemize{ +\item \code{select_features_by_dispersion}: Score representing the dispersion of each feature. +} + +\itemize{ +\item \code{select_features_by_mean}: Score representing the mean accessibility of each feature. +} +} +\description{ +Apply a feature selection method to a \verb{(features x cells)} matrix. +} +\details{ +\code{select_features_by_variance} Calculates the variance of each feature using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find \code{num_feats} features with the highest variance. +} + +\code{select_features_by_dispersion} calculates the dispersion of each feature using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find the dispersion (variance/mean) of each feature. +\item Find \code{num_feats} features with the highest dispersion. +} + +\code{select_features_by_mean} calculates the mean accessibility of each feature using the following process: +\enumerate{ +\item Get the sum of each binarized feature. +\item Find \code{num_feats} features with the highest accessibility. +} +} diff --git a/r/man/select_features_by_dispersion.Rd b/r/man/select_features_by_dispersion.Rd deleted file mode 100644 index 2835c9a8..00000000 --- a/r/man/select_features_by_dispersion.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_dispersion} -\alias{select_features_by_dispersion} -\title{Get the features with the highest dispersion within a matrix.} -\usage{ -select_features_by_dispersion( - mat, - num_feats = 25000, - normalize = NULL, - threads = 1L -) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by dispersion: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Variance of the feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly variable. -} -} -\description{ -Get the features with the highest dispersion within a matrix. -} -\details{ -Calculate using the following process: -\enumerate{ -\item Perform an optional term frequency + log normalization, for each feature. -\item Find the dispersion (variance/mean) of each feature. -\item Find \code{num_feats} features with the highest dispersion. -} -} diff --git a/r/man/select_features_by_mean.Rd b/r/man/select_features_by_mean.Rd deleted file mode 100644 index c05b0acb..00000000 --- a/r/man/select_features_by_mean.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_mean} -\alias{select_features_by_mean} -\title{Get the top features from a matrix, based on the mean accessibility of each feature.} -\usage{ -select_features_by_mean(mat, num_feats = 25000, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by mean accessibility: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Binarize sum of each feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly accessible by mean accessibility. -} -} -\description{ -Get the top features from a matrix, based on the mean accessibility of each feature. -} -\details{ -Calculate using the following process: -\enumerate{ -\item Get the sum of each binarized feature. -\item Find \code{num_feats} features with the highest accessibility. -} -} diff --git a/r/man/select_features_by_variance.Rd b/r/man/select_features_by_variance.Rd deleted file mode 100644 index b7cc375f..00000000 --- a/r/man/select_features_by_variance.Rd +++ /dev/null @@ -1,41 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_variance} -\alias{select_features_by_variance} -\title{Get the most variable features within a matrix.} -\usage{ -select_features_by_variance( - mat, - num_feats = 25000, - normalize = normalize_log, - threads = 1L -) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by variance: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Variance of the feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly variable. -} -} -\description{ -Get the most variable features within a matrix. -} -\details{ -Calculate using the following process: -\enumerate{ -\item Perform an optional term frequency + log normalization, for each feature. -\item Find \code{num_feats} features with the highest variance across clusters. -} -} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 70b77d96..0bf475aa 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -132,9 +132,6 @@ reference: - checksum - apply_by_row - regress_out - - select_features_by_variance - - select_features_by_dispersion - - select_features_by_mean - IterableMatrix-methods - pseudobulk_matrix @@ -142,6 +139,7 @@ reference: - subtitle: "Dimensionality reduction" - contents: - normalize_log + - select_features_by_variance - subtitle: "Clustering" - contents: - knn_hnsw diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 2b843945..efefc070 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -21,11 +21,17 @@ test_that("select_features works general case", { expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable expect_setequal(res$names, rownames(m1)) res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows - res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + res_feats_partial <- get(fn)(num_feats = 100)(m1) + expect_identical(res_feats_equal_rows, res_feats_partial) expect_identical(res_more_feats_than_rows, res_feats_equal_rows) if (fn == "select_features_by_variance") { # Check that normalization actually does something res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + # Check that we can do partial functions on normalization too + res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = purrr::partial(normalize_log(scale = 1e3, threads = 1L)))) + res_norm_implicit_partial <- select_features_by_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) + expect_identical(res_norm_partial, res_norm_implicit_partial) expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) } } From 16d534405b2fd0264e090c7b5beaada3b818acd8 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 24 Jan 2025 14:27:03 -0800 Subject: [PATCH 13/14] [r] reorder assertions, add new partial func system --- r/R/singlecell_utils.R | 51 +++++++++++----- r/R/transforms.R | 25 ++++---- r/R/utils.R | 75 +++++++++++++++++++----- r/tests/testthat/test-singlecell_utils.R | 2 +- 4 files changed, 112 insertions(+), 41 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 40babbe3..636591ad 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -38,16 +38,22 @@ select_features_by_variance <- function( normalize = normalize_log, threads = 1L ) { - if (rlang::is_missing(mat)) { - return(purrr::partial(select_features_by_variance, num_feats = num_feats, normalize = normalize, threads = threads)) - } - assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") + if (rlang::is_missing(mat)) { + return(create_partial( + missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ) + )) + } + assert_is(mat, "IterableMatrix") num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) features_df <- tibble::tibble( names = rownames(mat), score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] @@ -71,17 +77,22 @@ select_features_by_dispersion <- function( normalize = NULL, threads = 1L ) { - if (rlang::is_missing(mat)) { - return(partial_explicit(select_features_by_dispersion, num_feats = num_feats, normalize = normalize, threads = threads)) - } - assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") + if (rlang::is_missing(mat)) { + return(create_partial( + missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ) + )) + } num_feats <- min(max(num_feats, 0), nrow(mat)) - - if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + assert_is(mat, "IterableMatrix") + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) features_df <- tibble::tibble( names = rownames(mat), @@ -101,17 +112,25 @@ select_features_by_dispersion <- function( #' 1. Get the sum of each binarized feature. #' 2. Find `num_feats` features with the highest accessibility. #' @export -select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { - if (rlang::is_missing(mat)) { - return(partial_explicit(select_features_by_mean, num_feats = num_feats, threads = threads)) - } - assert_is(mat, "IterableMatrix") +select_features_by_mean <- function(mat, num_feats = 25000, normalize = NULL, threads = 1L) { assert_is_wholenumber(num_feats) assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") + if (rlang::is_missing(mat)) { + return(create_partial( + missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ) + )) + } + assert_is(mat, "IterableMatrix") num_feats <- min(max(num_feats, 0), nrow(mat)) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) # get the sum of each feature, binarized # get the top features + features_df <- tibble::tibble( names = rownames(mat), score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ] diff --git a/r/R/transforms.R b/r/R/transforms.R index 02ab51b8..3ce8ac56 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -943,17 +943,19 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { - # browser() + assert_is_numeric(scale_factor) + assert_greater_than_zero(scale_factor) if (rlang::is_missing(mat)) { return( - partial_explicit( - normalize_log, scale_factor = scale_factor, threads = threads + create_partial( + missing_args = list( + scale_factor = missing(scale_factor), + threads = missing(threads) + ) ) - ) + ) } assert_is(mat, "IterableMatrix") - assert_is_numeric(scale_factor) - assert_greater_than_zero(scale_factor) read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) mat <- mat %>% multiply_cols(1 / read_depth) return(log1p(mat * scale_factor)) @@ -971,16 +973,19 @@ normalize_tfidf <- function( mat, feature_means = NULL, scale_factor = 1e4, threads = 1L ) { + assert_is_wholenumber(threads) if (rlang::is_missing(mat)) { return( - partial_explicit( - normalize_tfidf, feature_means = feature_means, - scale_factor = scale_factor, threads = threads + create_partial( + missing_args = list( + feature_means = missing(feature_means), + scale_factor = missing(scale_factor), + threads = missing(threads) + ) ) ) } assert_is(mat, "IterableMatrix") - assert_is_wholenumber(threads) # If feature means are passed in, only need to calculate term frequency if (is.null(feature_means)) { mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) diff --git a/r/R/utils.R b/r/R/utils.R index 784f6106..d9f7b145 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -58,18 +58,65 @@ log_progress <- function(msg, add_timestamp = TRUE){ } } -# Helper function to create partial explicit functions -# This builds upon purrr::partial by allowing for nested partial calls, where each partial call -# only does partial application of the arguments that were explicitly provided. -partial_explicit <- function(fn, ...) { - args <- rlang::enquos(...) - evaluated_args <- purrr::map(args, rlang::eval_tidy) - # Fetch the default arguments from the function definition - default_args <- formals(fn) - # Keep only explicitly provided arguments that were evaluated - # where the values are different from the default arguments - explicitly_passed_args <- evaluated_args[names(evaluated_args) %in% names(default_args) & - !purrr::map2_lgl(evaluated_args, default_args[names(evaluated_args)], identical)] - # Return a partially applied version of the function using evaluated arguments - return(purrr::partial(fn, !!!explicitly_passed_args)) +#' Helper to create partial functions +#' +#' Automatically creates a partial application of the caller +#' function including all non-missing arguments. +#' +#' @param missing_args (named list[bool]) Any named index with a TRUE value +#' will be treated as missing. Designed to be used in the caller with the +#' `base::missing()` function to detect unspecified arguments with default values, +#' or to manually specifiy other arguments that should not be specialized +#' @return A `bpcells_partial` object (a function with some extra attributes) +create_partial <- function(missing_args=list()) { + env <- rlang::caller_env() + fn_sym <- rlang::caller_call()[[1]] + fn <- rlang::caller_fn() + + args <- list() + for (n in names(formals(fn))) { + if (n %in% names(missing_args) && missing_args[[n]]) next + if (rlang::is_missing(env[[n]])) next + args[[n]] <- env[[n]] + } + + ret <- do.call(partial_apply, c(fn, args)) + attr(ret, "body")[[1]] <- fn_sym + return(ret) +} + + +#' Create partial function calls +#' +#' Specify some but not all arguments to a function. +#' +#' @param f A function +#' @param ... Named arguments to `f` +#' @param .overwrite (bool) If `f` is already an ouptut from +#' `partial_apply()`, whether parameter re-definitions should +#' be ignored or overwrite the existing definitions +#' @return A `bpcells_partial` object (a function with some extra attributes) +partial_apply <- function(f, ..., .overwrite=FALSE) { + args <- rlang::list2(...) + + if (is(f, "bpcells_partial")) { + prev_args <- attr(f, "args") + for (a in names(prev_args)) { + if (!(.overwrite && a %in% names(args))) { + args[[a]] <- prev_args[[a]] + } + } + f <- attr(f, "fn") + function_name <- attr(f, "body")[[1]] + } else { + function_name <- rlang::sym(rlang::caller_arg(f)) + } + partial_fn <- do.call(purrr::partial, c(f, args)) + attr(partial_fn, "body")[[1]] <- function_name + structure( + partial_fn, + class = c("bpcells_partial", "purrr_function_partial", "function"), + args = args, + fn = f + ) } \ No newline at end of file diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index efefc070..e9365ced 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -29,7 +29,7 @@ test_that("select_features works general case", { # Check that normalization actually does something res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) # Check that we can do partial functions on normalization too - res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = purrr::partial(normalize_log(scale = 1e3, threads = 1L)))) + res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = normalize_log(scale = 1e3, threads = 1L))) res_norm_implicit_partial <- select_features_by_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) expect_identical(res_norm_partial, res_norm_implicit_partial) expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) From 87eb430b1c51b096982d785dbc3a37cdabba516e Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 24 Jan 2025 14:45:29 -0800 Subject: [PATCH 14/14] [r] change behaviour of num_feats default args, write docs --- r/NAMESPACE | 4 ++-- r/R/singlecell_utils.R | 19 +++++++++++------- r/R/utils.R | 4 +++- r/man/create_partial.Rd | 22 +++++++++++++++++++++ r/man/feature_selection.Rd | 25 +++++++++++------------- r/man/partial_apply.Rd | 24 +++++++++++++++++++++++ r/pkgdown/_pkgdown.yml | 2 +- r/tests/testthat/test-singlecell_utils.R | 12 ++++++------ 8 files changed, 81 insertions(+), 31 deletions(-) create mode 100644 r/man/create_partial.Rd create mode 100644 r/man/partial_apply.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index deb92dfa..e9998384 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -110,8 +110,8 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) -export(select_features_by_mean) -export(select_features_by_variance) +export(select_features_mean) +export(select_features_variance) export(select_regions) export(set_trackplot_height) export(set_trackplot_label) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 636591ad..37f61d98 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -16,7 +16,9 @@ #' Apply a feature selection method to a `(features x cells)` matrix. #' @rdname feature_selection #' @param mat (IterableMatrix) dimensions features x cells -#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' @param num_feats (float) Number of features to return. If the number given is between 0 and 1, treat as a proportion of +#' the number of rows, rounded down. Otherwise, treat as an absolute number. +#' If the number is higher than the number of features in the matrix, #' all features will be returned. #' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. #' @param threads (integer) Number of threads to use. @@ -33,9 +35,9 @@ #' 1. Perform an optional term frequency + log normalization, for each feature. #' 2. Find `num_feats` features with the highest variance. #' @export -select_features_by_variance <- function( - mat, num_feats = 25000, - normalize = normalize_log, +select_features_variance <- function( + mat, num_feats = 0.05, + normalize = NULL, threads = 1L ) { assert_greater_than_zero(num_feats) @@ -53,6 +55,7 @@ select_features_by_variance <- function( } assert_is(mat, "IterableMatrix") num_feats <- min(max(num_feats, 0), nrow(mat)) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) features_df <- tibble::tibble( names = rownames(mat), @@ -72,8 +75,8 @@ select_features_by_variance <- function( #' 1. Perform an optional term frequency + log normalization, for each feature. #' 2. Find the dispersion (variance/mean) of each feature. #' 3. Find `num_feats` features with the highest dispersion. -select_features_by_dispersion <- function( - mat, num_feats = 25000, +select_features_dispersion <- function( + mat, num_feats = 0.05, normalize = NULL, threads = 1L ) { @@ -91,6 +94,7 @@ select_features_by_dispersion <- function( )) } num_feats <- min(max(num_feats, 0), nrow(mat)) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) assert_is(mat, "IterableMatrix") if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) @@ -112,7 +116,7 @@ select_features_by_dispersion <- function( #' 1. Get the sum of each binarized feature. #' 2. Find `num_feats` features with the highest accessibility. #' @export -select_features_by_mean <- function(mat, num_feats = 25000, normalize = NULL, threads = 1L) { +select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) { assert_is_wholenumber(num_feats) assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") @@ -127,6 +131,7 @@ select_features_by_mean <- function(mat, num_feats = 25000, normalize = NULL, th } assert_is(mat, "IterableMatrix") num_feats <- min(max(num_feats, 0), nrow(mat)) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) # get the sum of each feature, binarized # get the top features diff --git a/r/R/utils.R b/r/R/utils.R index d9f7b145..d7c65358 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -68,6 +68,7 @@ log_progress <- function(msg, add_timestamp = TRUE){ #' `base::missing()` function to detect unspecified arguments with default values, #' or to manually specifiy other arguments that should not be specialized #' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal create_partial <- function(missing_args=list()) { env <- rlang::caller_env() fn_sym <- rlang::caller_call()[[1]] @@ -92,10 +93,11 @@ create_partial <- function(missing_args=list()) { #' #' @param f A function #' @param ... Named arguments to `f` -#' @param .overwrite (bool) If `f` is already an ouptut from +#' @param .overwrite (bool) If `f` is already an output from #' `partial_apply()`, whether parameter re-definitions should #' be ignored or overwrite the existing definitions #' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal partial_apply <- function(f, ..., .overwrite=FALSE) { args <- rlang::list2(...) diff --git a/r/man/create_partial.Rd b/r/man/create_partial.Rd new file mode 100644 index 00000000..fe92fd5a --- /dev/null +++ b/r/man/create_partial.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{create_partial} +\alias{create_partial} +\title{Helper to create partial functions} +\usage{ +create_partial(missing_args = list()) +} +\arguments{ +\item{missing_args}{(named list\link{bool}) Any named index with a TRUE value +will be treated as missing. Designed to be used in the caller with the +\code{base::missing()} function to detect unspecified arguments with default values, +or to manually specifiy other arguments that should not be specialized} +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Automatically creates a partial application of the caller +function including all non-missing arguments. +} +\keyword{internal} diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index c57a8e40..f83e2199 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -1,31 +1,28 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/singlecell_utils.R -\name{select_features_by_variance} -\alias{select_features_by_variance} -\alias{select_features_by_dispersion} -\alias{select_features_by_mean} +\name{select_features_variance} +\alias{select_features_variance} +\alias{select_features_dispersion} +\alias{select_features_mean} \title{Feature selection functions} \usage{ -select_features_by_variance( - mat, - num_feats = 25000, - normalize = normalize_log, - threads = 1L -) +select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L) -select_features_by_dispersion( +select_features_dispersion( mat, - num_feats = 25000, + num_feats = 0.05, normalize = NULL, threads = 1L ) -select_features_by_mean(mat, num_feats = 25000, threads = 1L) +select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +\item{num_feats}{(float) Number of features to return. If the number given is between 0 and 1, treat as a proportion of +the number of rows, rounded down. Otherwise, treat as an absolute number. +If the number is higher than the number of features in the matrix, all features will be returned.} \item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd new file mode 100644 index 00000000..c1ea508d --- /dev/null +++ b/r/man/partial_apply.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{partial_apply} +\alias{partial_apply} +\title{Create partial function calls} +\usage{ +partial_apply(f, ..., .overwrite = FALSE) +} +\arguments{ +\item{f}{A function} + +\item{...}{Named arguments to \code{f}} + +\item{.overwrite}{(bool) If \code{f} is already an output from +\code{partial_apply()}, whether parameter re-definitions should +be ignored or overwrite the existing definitions} +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Specify some but not all arguments to a function. +} +\keyword{internal} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 0bf475aa..0d98b13d 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -139,7 +139,7 @@ reference: - subtitle: "Dimensionality reduction" - contents: - normalize_log - - select_features_by_variance + - select_features_variance - subtitle: "Clustering" - contents: - knn_hnsw diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index e9365ced..08e41b7c 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -15,24 +15,24 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = test_that("select_features works general case", { m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") - for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { - res <- do.call(fn, list(m1, num_feats = 10)) + for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) { + res <- do.call(fn, list(m1, num_feats = 5)) expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting - expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable + expect_equal(sum(res$highly_variable), 5) # Only 10 features marked as highly variable expect_setequal(res$names, rownames(m1)) res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) res_feats_partial <- get(fn)(num_feats = 100)(m1) expect_identical(res_feats_equal_rows, res_feats_partial) expect_identical(res_more_feats_than_rows, res_feats_equal_rows) - if (fn == "select_features_by_variance") { + if (fn == "select_features_variance") { # Check that normalization actually does something res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) # Check that we can do partial functions on normalization too res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = normalize_log(scale = 1e3, threads = 1L))) - res_norm_implicit_partial <- select_features_by_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) + res_norm_implicit_partial <- select_features_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) expect_identical(res_norm_partial, res_norm_implicit_partial) - expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) + expect_true(!all((res_no_norm %>% dplyr::arrange(names))$score == (res_norm_partial %>% dplyr::arrange(names))$score)) } } })