diff --git a/r/NAMESPACE b/r/NAMESPACE
index 982ca440..e9998384 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -65,6 +65,8 @@ export(min_by_row)
export(min_scalar)
export(multiply_cols)
export(multiply_rows)
+export(normalize_log)
+export(normalize_tfidf)
export(nucleosome_counts)
export(open_fragments_10x)
export(open_fragments_dir)
@@ -108,6 +110,8 @@ export(rowVars.default)
export(sctransform_pearson)
export(select_cells)
export(select_chromosomes)
+export(select_features_mean)
+export(select_features_variance)
export(select_regions)
export(set_trackplot_height)
export(set_trackplot_label)
diff --git a/r/NEWS.md b/r/NEWS.md
index 6cab91a5..dc220204 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -10,6 +10,7 @@ Contributions welcome :)
## Features
- Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166)
+- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168)
## Bug-fixes
- Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175)
@@ -39,6 +40,8 @@ of the new features this release and will continue to help with maintenance and
- Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices.
If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128)
- Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
+- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168)
+- Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169)
## Improvements
- `trackplot_loop()` now accepts discrete color scales
diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R
index 7267e19b..37f61d98 100644
--- a/r/R/singlecell_utils.R
+++ b/r/R/singlecell_utils.R
@@ -6,6 +6,146 @@
# option. This file may not be copied, modified, or distributed
# except according to those terms.
+
+#################
+# Feature selection
+#################
+
+#' Feature selection functions
+#'
+#' Apply a feature selection method to a `(features x cells)` matrix.
+#' @rdname feature_selection
+#' @param mat (IterableMatrix) dimensions features x cells
+#' @param num_feats (float) Number of features to return. If the number given is between 0 and 1, treat as a proportion of
+#' the number of rows, rounded down. Otherwise, treat as an absolute number.
+#' If the number is higher than the number of features in the matrix,
+#' all features will be returned.
+#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed.
+#' @param threads (integer) Number of threads to use.
+#' @returns
+#' Return a dataframe with the following columns, sorted descending by score:
+#' - `names`: Feature name.
+#' - `score`: Scoring of the feature, depending on the method used.
+#' - `highly_variable`: Logical vector of whether the feature is highly variable.
+#'
+#' Each different feature selection method will have a different scoring method:
+#' - `select_features_by_variance`: Score representing variance of each feature.
+#' @details
+#' `select_features_by_variance` Calculates the variance of each feature using the following process:
+#' 1. Perform an optional term frequency + log normalization, for each feature.
+#' 2. Find `num_feats` features with the highest variance.
+#' @export
+select_features_variance <- function(
+ mat, num_feats = 0.05,
+ normalize = NULL,
+ threads = 1L
+) {
+ assert_greater_than_zero(num_feats)
+ assert_is_wholenumber(num_feats)
+ assert_len(num_feats, 1)
+ assert_is(num_feats, "numeric")
+ if (rlang::is_missing(mat)) {
+ return(create_partial(
+ missing_args = list(
+ num_feats = missing(num_feats),
+ normalize = missing(normalize),
+ threads = missing(threads)
+ )
+ ))
+ }
+ assert_is(mat, "IterableMatrix")
+ num_feats <- min(max(num_feats, 0), nrow(mat))
+ if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats)
+ if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat)
+ features_df <- tibble::tibble(
+ names = rownames(mat),
+ score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",]
+ ) %>%
+ dplyr::arrange(desc(score)) %>%
+ dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
+ return(features_df)
+}
+
+
+#' @rdname feature_selection
+#' @returns
+#' - `select_features_by_dispersion`: Score representing the dispersion of each feature.
+#' @details
+#' `select_features_by_dispersion` calculates the dispersion of each feature using the following process:
+#' 1. Perform an optional term frequency + log normalization, for each feature.
+#' 2. Find the dispersion (variance/mean) of each feature.
+#' 3. Find `num_feats` features with the highest dispersion.
+select_features_dispersion <- function(
+ mat, num_feats = 0.05,
+ normalize = NULL,
+ threads = 1L
+) {
+ assert_greater_than_zero(num_feats)
+ assert_is_wholenumber(num_feats)
+ assert_len(num_feats, 1)
+ assert_is(num_feats, "numeric")
+ if (rlang::is_missing(mat)) {
+ return(create_partial(
+ missing_args = list(
+ num_feats = missing(num_feats),
+ normalize = missing(normalize),
+ threads = missing(threads)
+ )
+ ))
+ }
+ num_feats <- min(max(num_feats, 0), nrow(mat))
+ if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats)
+ assert_is(mat, "IterableMatrix")
+ if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat)
+ mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads)
+ features_df <- tibble::tibble(
+ names = rownames(mat),
+ score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ]
+ ) %>%
+ dplyr::arrange(desc(score)) %>%
+ dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
+ return(features_df)
+}
+
+
+#' @rdname feature_selection
+#' @returns
+#' - `select_features_by_mean`: Score representing the mean accessibility of each feature.
+#' @details
+#' `select_features_by_mean` calculates the mean accessibility of each feature using the following process:
+#' 1. Get the sum of each binarized feature.
+#' 2. Find `num_feats` features with the highest accessibility.
+#' @export
+select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) {
+ assert_is_wholenumber(num_feats)
+ assert_greater_than_zero(num_feats)
+ assert_is(num_feats, "numeric")
+ if (rlang::is_missing(mat)) {
+ return(create_partial(
+ missing_args = list(
+ num_feats = missing(num_feats),
+ normalize = missing(normalize),
+ threads = missing(threads)
+ )
+ ))
+ }
+ assert_is(mat, "IterableMatrix")
+ num_feats <- min(max(num_feats, 0), nrow(mat))
+ if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats)
+ if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat)
+ # get the sum of each feature, binarized
+ # get the top features
+
+ features_df <- tibble::tibble(
+ names = rownames(mat),
+ score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ]
+ ) %>%
+ dplyr::arrange(desc(score)) %>%
+ dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
+ return(features_df)
+}
+
+
#' Test for marker features
#'
#' Given a features x cells matrix, perform one-vs-all differential
diff --git a/r/R/transforms.R b/r/R/transforms.R
index 2a5de994..3ce8ac56 100644
--- a/r/R/transforms.R
+++ b/r/R/transforms.R
@@ -923,3 +923,89 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) {
vars_to_regress = vars_to_regress
)
}
+
+#################
+# Normalizations
+#################
+
+#' Normalization helper functions
+#'
+#' Apply standard normalizations to a `(features x cells)` counts matrix.
+#'
+#' @rdname normalize
+#' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)`
+#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization.
+#' @param threads (integer) Number of threads to use.
+#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
+#' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as:
+#'
+#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
+#' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog`
+#' @export
+normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) {
+ assert_is_numeric(scale_factor)
+ assert_greater_than_zero(scale_factor)
+ if (rlang::is_missing(mat)) {
+ return(
+ create_partial(
+ missing_args = list(
+ scale_factor = missing(scale_factor),
+ threads = missing(threads)
+ )
+ )
+ )
+ }
+ assert_is(mat, "IterableMatrix")
+ read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
+ mat <- mat %>% multiply_cols(1 / read_depth)
+ return(log1p(mat * scale_factor))
+}
+
+
+#' @rdname normalize
+#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then
+#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix.
+#' Else, map each feature name to its mean value.
+#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
+#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()`
+#' @export
+normalize_tfidf <- function(
+ mat, feature_means = NULL,
+ scale_factor = 1e4, threads = 1L
+) {
+ assert_is_wholenumber(threads)
+ if (rlang::is_missing(mat)) {
+ return(
+ create_partial(
+ missing_args = list(
+ feature_means = missing(feature_means),
+ scale_factor = missing(scale_factor),
+ threads = missing(threads)
+ )
+ )
+ )
+ }
+ assert_is(mat, "IterableMatrix")
+ # If feature means are passed in, only need to calculate term frequency
+ if (is.null(feature_means)) {
+ mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"))
+ feature_means <- mat_stats$row_stats["mean", ]
+ read_depth <- mat_stats$col_stats["mean", ] * nrow(mat)
+ } else {
+ assert_is_numeric(feature_means)
+ if (!is.null(names(feature_means)) && !is.null(rownames(mat))) {
+ # Make sure every name in feature means exists in rownames(mat)
+ # In the case there is a length mismatch but the feature names all exist in feature_means
+ # will not error out
+ assert_true(all(rownames(mat) %in% names(feature_means)))
+ feature_means <- feature_means[rownames(mat)]
+ } else {
+ assert_len(feature_means, nrow(mat))
+ }
+ read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
+ }
+ tf <- mat %>% multiply_cols(1 / read_depth)
+ idf <- 1 / feature_means
+ tf_idf_mat <- tf %>% multiply_rows(idf)
+ return(log1p(tf_idf_mat * scale_factor))
+}
\ No newline at end of file
diff --git a/r/R/utils.R b/r/R/utils.R
index 4ea62d15..d7c65358 100644
--- a/r/R/utils.R
+++ b/r/R/utils.R
@@ -56,4 +56,69 @@ log_progress <- function(msg, add_timestamp = TRUE){
} else {
message(msg)
}
+}
+
+#' Helper to create partial functions
+#'
+#' Automatically creates a partial application of the caller
+#' function including all non-missing arguments.
+#'
+#' @param missing_args (named list[bool]) Any named index with a TRUE value
+#' will be treated as missing. Designed to be used in the caller with the
+#' `base::missing()` function to detect unspecified arguments with default values,
+#' or to manually specifiy other arguments that should not be specialized
+#' @return A `bpcells_partial` object (a function with some extra attributes)
+#' @keywords internal
+create_partial <- function(missing_args=list()) {
+ env <- rlang::caller_env()
+ fn_sym <- rlang::caller_call()[[1]]
+ fn <- rlang::caller_fn()
+
+ args <- list()
+ for (n in names(formals(fn))) {
+ if (n %in% names(missing_args) && missing_args[[n]]) next
+ if (rlang::is_missing(env[[n]])) next
+ args[[n]] <- env[[n]]
+ }
+
+ ret <- do.call(partial_apply, c(fn, args))
+ attr(ret, "body")[[1]] <- fn_sym
+ return(ret)
+}
+
+
+#' Create partial function calls
+#'
+#' Specify some but not all arguments to a function.
+#'
+#' @param f A function
+#' @param ... Named arguments to `f`
+#' @param .overwrite (bool) If `f` is already an output from
+#' `partial_apply()`, whether parameter re-definitions should
+#' be ignored or overwrite the existing definitions
+#' @return A `bpcells_partial` object (a function with some extra attributes)
+#' @keywords internal
+partial_apply <- function(f, ..., .overwrite=FALSE) {
+ args <- rlang::list2(...)
+
+ if (is(f, "bpcells_partial")) {
+ prev_args <- attr(f, "args")
+ for (a in names(prev_args)) {
+ if (!(.overwrite && a %in% names(args))) {
+ args[[a]] <- prev_args[[a]]
+ }
+ }
+ f <- attr(f, "fn")
+ function_name <- attr(f, "body")[[1]]
+ } else {
+ function_name <- rlang::sym(rlang::caller_arg(f))
+ }
+ partial_fn <- do.call(purrr::partial, c(f, args))
+ attr(partial_fn, "body")[[1]] <- function_name
+ structure(
+ partial_fn,
+ class = c("bpcells_partial", "purrr_function_partial", "function"),
+ args = args,
+ fn = f
+ )
}
\ No newline at end of file
diff --git a/r/man/create_partial.Rd b/r/man/create_partial.Rd
new file mode 100644
index 00000000..fe92fd5a
--- /dev/null
+++ b/r/man/create_partial.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{create_partial}
+\alias{create_partial}
+\title{Helper to create partial functions}
+\usage{
+create_partial(missing_args = list())
+}
+\arguments{
+\item{missing_args}{(named list\link{bool}) Any named index with a TRUE value
+will be treated as missing. Designed to be used in the caller with the
+\code{base::missing()} function to detect unspecified arguments with default values,
+or to manually specifiy other arguments that should not be specialized}
+}
+\value{
+A \code{bpcells_partial} object (a function with some extra attributes)
+}
+\description{
+Automatically creates a partial application of the caller
+function including all non-missing arguments.
+}
+\keyword{internal}
diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd
new file mode 100644
index 00000000..f83e2199
--- /dev/null
+++ b/r/man/feature_selection.Rd
@@ -0,0 +1,75 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/singlecell_utils.R
+\name{select_features_variance}
+\alias{select_features_variance}
+\alias{select_features_dispersion}
+\alias{select_features_mean}
+\title{Feature selection functions}
+\usage{
+select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L)
+
+select_features_dispersion(
+ mat,
+ num_feats = 0.05,
+ normalize = NULL,
+ threads = 1L
+)
+
+select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L)
+}
+\arguments{
+\item{mat}{(IterableMatrix) dimensions features x cells}
+
+\item{num_feats}{(float) Number of features to return. If the number given is between 0 and 1, treat as a proportion of
+the number of rows, rounded down. Otherwise, treat as an absolute number.
+If the number is higher than the number of features in the matrix,
+all features will be returned.}
+
+\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.}
+
+\item{threads}{(integer) Number of threads to use.}
+}
+\value{
+Return a dataframe with the following columns, sorted descending by score:
+\itemize{
+\item \code{names}: Feature name.
+\item \code{score}: Scoring of the feature, depending on the method used.
+\item \code{highly_variable}: Logical vector of whether the feature is highly variable.
+}
+
+Each different feature selection method will have a different scoring method:
+\itemize{
+\item \code{select_features_by_variance}: Score representing variance of each feature.
+}
+
+\itemize{
+\item \code{select_features_by_dispersion}: Score representing the dispersion of each feature.
+}
+
+\itemize{
+\item \code{select_features_by_mean}: Score representing the mean accessibility of each feature.
+}
+}
+\description{
+Apply a feature selection method to a \verb{(features x cells)} matrix.
+}
+\details{
+\code{select_features_by_variance} Calculates the variance of each feature using the following process:
+\enumerate{
+\item Perform an optional term frequency + log normalization, for each feature.
+\item Find \code{num_feats} features with the highest variance.
+}
+
+\code{select_features_by_dispersion} calculates the dispersion of each feature using the following process:
+\enumerate{
+\item Perform an optional term frequency + log normalization, for each feature.
+\item Find the dispersion (variance/mean) of each feature.
+\item Find \code{num_feats} features with the highest dispersion.
+}
+
+\code{select_features_by_mean} calculates the mean accessibility of each feature using the following process:
+\enumerate{
+\item Get the sum of each binarized feature.
+\item Find \code{num_feats} features with the highest accessibility.
+}
+}
diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd
new file mode 100644
index 00000000..ac53f2c0
--- /dev/null
+++ b/r/man/normalize.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transforms.R
+\name{normalize_log}
+\alias{normalize_log}
+\alias{normalize_tfidf}
+\title{Normalization helper functions}
+\usage{
+normalize_log(mat, scale_factor = 10000, threads = 1L)
+
+normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L)
+}
+\arguments{
+\item{mat}{(IterableMatrix) Counts matrix to normalize. \verb{(features x cells)}}
+
+\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.}
+
+\item{threads}{(integer) Number of threads to use.}
+
+\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then
+each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix.
+Else, map each feature name to its mean value.}
+}
+\value{
+For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
+transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as:
+\itemize{
+\item \code{normalize_log}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
+}
+
+\itemize{
+\item \code{normalize_tfidf}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
+}
+}
+\description{
+Apply standard normalizations to a \verb{(features x cells)} counts matrix.
+}
+\details{
+\itemize{
+\item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeLog}
+}
+
+\itemize{
+\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 2019, used by default in \code{ArchR::addIterativeLSI()} and \code{Signac::RunTFIDF()}
+}
+}
diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd
new file mode 100644
index 00000000..c1ea508d
--- /dev/null
+++ b/r/man/partial_apply.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{partial_apply}
+\alias{partial_apply}
+\title{Create partial function calls}
+\usage{
+partial_apply(f, ..., .overwrite = FALSE)
+}
+\arguments{
+\item{f}{A function}
+
+\item{...}{Named arguments to \code{f}}
+
+\item{.overwrite}{(bool) If \code{f} is already an output from
+\code{partial_apply()}, whether parameter re-definitions should
+be ignored or overwrite the existing definitions}
+}
+\value{
+A \code{bpcells_partial} object (a function with some extra attributes)
+}
+\description{
+Specify some but not all arguments to a function.
+}
+\keyword{internal}
diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml
index 64d16f01..0d98b13d 100644
--- a/r/pkgdown/_pkgdown.yml
+++ b/r/pkgdown/_pkgdown.yml
@@ -9,6 +9,11 @@ template:
includes:
in_header: |
+
+
+
+
+
after_body: |
@@ -130,15 +135,12 @@ reference:
- IterableMatrix-methods
- pseudobulk_matrix
-- title: "Reference Annotations"
+- title: "Single-cell analysis helpers"
+- subtitle: "Dimensionality reduction"
- contents:
- - human_gene_mapping
- - match_gene_symbol
- - read_gtf
- - read_bed
- - read_ucsc_chrom_sizes
-
-- title: "Clustering"
+ - normalize_log
+ - select_features_variance
+- subtitle: "Clustering"
- contents:
- knn_hnsw
- cluster_graph_leiden
@@ -176,3 +178,11 @@ reference:
- discrete_palette
- collect_features
- rotate_x_labels
+
+- title: "Reference Annotations"
+- contents:
+ - human_gene_mapping
+ - match_gene_symbol
+ - read_gtf
+ - read_bed
+ - read_ucsc_chrom_sizes
diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R
index 385605e0..24cd9c23 100644
--- a/r/tests/testthat/test-matrix_transforms.R
+++ b/r/tests/testthat/test-matrix_transforms.R
@@ -346,3 +346,48 @@ test_that("linear regression works", {
expect_equal(as(m1, "matrix"), ans)
expect_equal(as(m1t, "matrix"), ans)
})
+
+test_that("tf-idf normalization works", {
+ m <- generate_sparse_matrix(5, 5)
+ rownames(m) <- paste0("row", seq_len(nrow(m)))
+ rev_rownames <- rev(rownames(m))
+ # Create tf-idf normalization for dgCMatrix
+ res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4)
+
+ rownames(res_dgc) <- rownames(m)
+ m2 <- as(m, "IterableMatrix")
+ # Check that we can pass in row means as a (named) vector
+ row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",]
+ # Test that row means ordering does not matter as long as names exist
+ row_means_shuffled <- row_means[sample(1:length(row_means))]
+ # Test that row means can have an extra element as long as all rownames are in the vector
+ row_means_plus_one <- c(row_means, row6 = 1)
+
+
+ res <- normalize_tfidf(m2)
+ expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6)
+ res_with_row_means <- normalize_tfidf(m2, feature_means = row_means)
+ res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2)
+ expect_equal(res_with_row_means, res_with_row_means_partial)
+ expect_identical(res, res_with_row_means)
+
+ res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled)
+ expect_identical(res_with_row_means, res_with_shuffled_row_means, res)
+
+ res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one)
+ expect_identical(res, res_with_row_means_with_extra_element)
+})
+
+test_that("normalize_log works", {
+ m <- generate_sparse_matrix(5, 5)
+ res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix")
+ m2 <- as(m, "IterableMatrix")
+ # Test that default params yield the same as log1p on dgCMatrix
+ res_1 <- as(normalize_log(m2), "dgCMatrix")
+ expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6)
+
+ # Test that changing scale factor works
+ res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix")
+ res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix")
+ expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6)
+})
\ No newline at end of file
diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R
index 85ad0be3..08e41b7c 100644
--- a/r/tests/testthat/test-singlecell_utils.R
+++ b/r/tests/testthat/test-singlecell_utils.R
@@ -13,6 +13,30 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val =
as(m, "dgCMatrix")
}
+test_that("select_features works general case", {
+ m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix")
+ for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) {
+ res <- do.call(fn, list(m1, num_feats = 5))
+ expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting
+ expect_equal(sum(res$highly_variable), 5) # Only 10 features marked as highly variable
+ expect_setequal(res$names, rownames(m1))
+ res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows
+ res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100))
+ res_feats_partial <- get(fn)(num_feats = 100)(m1)
+ expect_identical(res_feats_equal_rows, res_feats_partial)
+ expect_identical(res_more_feats_than_rows, res_feats_equal_rows)
+ if (fn == "select_features_variance") {
+ # Check that normalization actually does something
+ res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL))
+ # Check that we can do partial functions on normalization too
+ res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = normalize_log(scale = 1e3, threads = 1L)))
+ res_norm_implicit_partial <- select_features_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1)
+ expect_identical(res_norm_partial, res_norm_implicit_partial)
+ expect_true(!all((res_no_norm %>% dplyr::arrange(names))$score == (res_norm_partial %>% dplyr::arrange(names))$score))
+ }
+ }
+})
+
test_that("Wilcoxon rank sum works (small)", {
x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46)
y <- c(1.15, 0.88, 0.90, 0.74, 1.21)
@@ -161,4 +185,5 @@ test_that("Pseudobulk aggregation works with multiple return types", {
}
}
}
-})
\ No newline at end of file
+})
+