diff --git a/r/NAMESPACE b/r/NAMESPACE index 982ca440..e9998384 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -65,6 +65,8 @@ export(min_by_row) export(min_scalar) export(multiply_cols) export(multiply_rows) +export(normalize_log) +export(normalize_tfidf) export(nucleosome_counts) export(open_fragments_10x) export(open_fragments_dir) @@ -108,6 +110,8 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_mean) +export(select_features_variance) export(select_regions) export(set_trackplot_height) export(set_trackplot_label) diff --git a/r/NEWS.md b/r/NEWS.md index 6cab91a5..dc220204 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,6 +10,7 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) +- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) @@ -39,6 +40,8 @@ of the new features this release and will continue to help with maintenance and - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices. If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) +- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) +- Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 7267e19b..37f61d98 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -6,6 +6,146 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. + +################# +# Feature selection +################# + +#' Feature selection functions +#' +#' Apply a feature selection method to a `(features x cells)` matrix. +#' @rdname feature_selection +#' @param mat (IterableMatrix) dimensions features x cells +#' @param num_feats (float) Number of features to return. If the number given is between 0 and 1, treat as a proportion of +#' the number of rows, rounded down. Otherwise, treat as an absolute number. +#' If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. +#' @param threads (integer) Number of threads to use. +#' @returns +#' Return a dataframe with the following columns, sorted descending by score: +#' - `names`: Feature name. +#' - `score`: Scoring of the feature, depending on the method used. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' +#' Each different feature selection method will have a different scoring method: +#' - `select_features_by_variance`: Score representing variance of each feature. +#' @details +#' `select_features_by_variance` Calculates the variance of each feature using the following process: +#' 1. Perform an optional term frequency + log normalization, for each feature. +#' 2. Find `num_feats` features with the highest variance. +#' @export +select_features_variance <- function( + mat, num_feats = 0.05, + normalize = NULL, + threads = 1L +) { + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is(num_feats, "numeric") + if (rlang::is_missing(mat)) { + return(create_partial( + missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ) + )) + } + assert_is(mat, "IterableMatrix") + num_feats <- min(max(num_feats, 0), nrow(mat)) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + features_df <- tibble::tibble( + names = rownames(mat), + score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + +#' @rdname feature_selection +#' @returns +#' - `select_features_by_dispersion`: Score representing the dispersion of each feature. +#' @details +#' `select_features_by_dispersion` calculates the dispersion of each feature using the following process: +#' 1. Perform an optional term frequency + log normalization, for each feature. +#' 2. Find the dispersion (variance/mean) of each feature. +#' 3. Find `num_feats` features with the highest dispersion. +select_features_dispersion <- function( + mat, num_feats = 0.05, + normalize = NULL, + threads = 1L +) { + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is(num_feats, "numeric") + if (rlang::is_missing(mat)) { + return(create_partial( + missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ) + )) + } + num_feats <- min(max(num_feats, 0), nrow(mat)) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + assert_is(mat, "IterableMatrix") + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) + features_df <- tibble::tibble( + names = rownames(mat), + score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + +#' @rdname feature_selection +#' @returns +#' - `select_features_by_mean`: Score representing the mean accessibility of each feature. +#' @details +#' `select_features_by_mean` calculates the mean accessibility of each feature using the following process: +#' 1. Get the sum of each binarized feature. +#' 2. Find `num_feats` features with the highest accessibility. +#' @export +select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) { + assert_is_wholenumber(num_feats) + assert_greater_than_zero(num_feats) + assert_is(num_feats, "numeric") + if (rlang::is_missing(mat)) { + return(create_partial( + missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ) + )) + } + assert_is(mat, "IterableMatrix") + num_feats <- min(max(num_feats, 0), nrow(mat)) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + # get the sum of each feature, binarized + # get the top features + + features_df <- tibble::tibble( + names = rownames(mat), + score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + #' Test for marker features #' #' Given a features x cells matrix, perform one-vs-all differential diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..3ce8ac56 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -923,3 +923,89 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { vars_to_regress = vars_to_regress ) } + +################# +# Normalizations +################# + +#' Normalization helper functions +#' +#' Apply standard normalizations to a `(features x cells)` counts matrix. +#' +#' @rdname normalize +#' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)` +#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization. +#' @param threads (integer) Number of threads to use. +#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +#' +#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +#' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` +#' @export +normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + assert_is_numeric(scale_factor) + assert_greater_than_zero(scale_factor) + if (rlang::is_missing(mat)) { + return( + create_partial( + missing_args = list( + scale_factor = missing(scale_factor), + threads = missing(threads) + ) + ) + ) + } + assert_is(mat, "IterableMatrix") + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) + mat <- mat %>% multiply_cols(1 / read_depth) + return(log1p(mat * scale_factor)) +} + + +#' @rdname normalize +#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then +#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +#' Else, map each feature name to its mean value. +#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()` +#' @export +normalize_tfidf <- function( + mat, feature_means = NULL, + scale_factor = 1e4, threads = 1L +) { + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) { + return( + create_partial( + missing_args = list( + feature_means = missing(feature_means), + scale_factor = missing(scale_factor), + threads = missing(threads) + ) + ) + ) + } + assert_is(mat, "IterableMatrix") + # If feature means are passed in, only need to calculate term frequency + if (is.null(feature_means)) { + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + feature_means <- mat_stats$row_stats["mean", ] + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + } else { + assert_is_numeric(feature_means) + if (!is.null(names(feature_means)) && !is.null(rownames(mat))) { + # Make sure every name in feature means exists in rownames(mat) + # In the case there is a length mismatch but the feature names all exist in feature_means + # will not error out + assert_true(all(rownames(mat) %in% names(feature_means))) + feature_means <- feature_means[rownames(mat)] + } else { + assert_len(feature_means, nrow(mat)) + } + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) + } + tf <- mat %>% multiply_cols(1 / read_depth) + idf <- 1 / feature_means + tf_idf_mat <- tf %>% multiply_rows(idf) + return(log1p(tf_idf_mat * scale_factor)) +} \ No newline at end of file diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..d7c65358 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -56,4 +56,69 @@ log_progress <- function(msg, add_timestamp = TRUE){ } else { message(msg) } +} + +#' Helper to create partial functions +#' +#' Automatically creates a partial application of the caller +#' function including all non-missing arguments. +#' +#' @param missing_args (named list[bool]) Any named index with a TRUE value +#' will be treated as missing. Designed to be used in the caller with the +#' `base::missing()` function to detect unspecified arguments with default values, +#' or to manually specifiy other arguments that should not be specialized +#' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal +create_partial <- function(missing_args=list()) { + env <- rlang::caller_env() + fn_sym <- rlang::caller_call()[[1]] + fn <- rlang::caller_fn() + + args <- list() + for (n in names(formals(fn))) { + if (n %in% names(missing_args) && missing_args[[n]]) next + if (rlang::is_missing(env[[n]])) next + args[[n]] <- env[[n]] + } + + ret <- do.call(partial_apply, c(fn, args)) + attr(ret, "body")[[1]] <- fn_sym + return(ret) +} + + +#' Create partial function calls +#' +#' Specify some but not all arguments to a function. +#' +#' @param f A function +#' @param ... Named arguments to `f` +#' @param .overwrite (bool) If `f` is already an output from +#' `partial_apply()`, whether parameter re-definitions should +#' be ignored or overwrite the existing definitions +#' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal +partial_apply <- function(f, ..., .overwrite=FALSE) { + args <- rlang::list2(...) + + if (is(f, "bpcells_partial")) { + prev_args <- attr(f, "args") + for (a in names(prev_args)) { + if (!(.overwrite && a %in% names(args))) { + args[[a]] <- prev_args[[a]] + } + } + f <- attr(f, "fn") + function_name <- attr(f, "body")[[1]] + } else { + function_name <- rlang::sym(rlang::caller_arg(f)) + } + partial_fn <- do.call(purrr::partial, c(f, args)) + attr(partial_fn, "body")[[1]] <- function_name + structure( + partial_fn, + class = c("bpcells_partial", "purrr_function_partial", "function"), + args = args, + fn = f + ) } \ No newline at end of file diff --git a/r/man/create_partial.Rd b/r/man/create_partial.Rd new file mode 100644 index 00000000..fe92fd5a --- /dev/null +++ b/r/man/create_partial.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{create_partial} +\alias{create_partial} +\title{Helper to create partial functions} +\usage{ +create_partial(missing_args = list()) +} +\arguments{ +\item{missing_args}{(named list\link{bool}) Any named index with a TRUE value +will be treated as missing. Designed to be used in the caller with the +\code{base::missing()} function to detect unspecified arguments with default values, +or to manually specifiy other arguments that should not be specialized} +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Automatically creates a partial application of the caller +function including all non-missing arguments. +} +\keyword{internal} diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd new file mode 100644 index 00000000..f83e2199 --- /dev/null +++ b/r/man/feature_selection.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_variance} +\alias{select_features_variance} +\alias{select_features_dispersion} +\alias{select_features_mean} +\title{Feature selection functions} +\usage{ +select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L) + +select_features_dispersion( + mat, + num_feats = 0.05, + normalize = NULL, + threads = 1L +) + +select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(float) Number of features to return. If the number given is between 0 and 1, treat as a proportion of +the number of rows, rounded down. Otherwise, treat as an absolute number. +If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by score: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Scoring of the feature, depending on the method used. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} + +Each different feature selection method will have a different scoring method: +\itemize{ +\item \code{select_features_by_variance}: Score representing variance of each feature. +} + +\itemize{ +\item \code{select_features_by_dispersion}: Score representing the dispersion of each feature. +} + +\itemize{ +\item \code{select_features_by_mean}: Score representing the mean accessibility of each feature. +} +} +\description{ +Apply a feature selection method to a \verb{(features x cells)} matrix. +} +\details{ +\code{select_features_by_variance} Calculates the variance of each feature using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find \code{num_feats} features with the highest variance. +} + +\code{select_features_by_dispersion} calculates the dispersion of each feature using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find the dispersion (variance/mean) of each feature. +\item Find \code{num_feats} features with the highest dispersion. +} + +\code{select_features_by_mean} calculates the mean accessibility of each feature using the following process: +\enumerate{ +\item Get the sum of each binarized feature. +\item Find \code{num_feats} features with the highest accessibility. +} +} diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd new file mode 100644 index 00000000..ac53f2c0 --- /dev/null +++ b/r/man/normalize.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\alias{normalize_tfidf} +\title{Normalization helper functions} +\usage{ +normalize_log(mat, scale_factor = 10000, threads = 1L) + +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix to normalize. \verb{(features x cells)}} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} + +\item{threads}{(integer) Number of threads to use.} + +\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then +each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +Else, map each feature name to its mean value.} +} +\value{ +For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +\itemize{ +\item \code{normalize_log}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +} + +\itemize{ +\item \code{normalize_tfidf}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +} +} +\description{ +Apply standard normalizations to a \verb{(features x cells)} counts matrix. +} +\details{ +\itemize{ +\item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeLog} +} + +\itemize{ +\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 2019, used by default in \code{ArchR::addIterativeLSI()} and \code{Signac::RunTFIDF()} +} +} diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd new file mode 100644 index 00000000..c1ea508d --- /dev/null +++ b/r/man/partial_apply.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{partial_apply} +\alias{partial_apply} +\title{Create partial function calls} +\usage{ +partial_apply(f, ..., .overwrite = FALSE) +} +\arguments{ +\item{f}{A function} + +\item{...}{Named arguments to \code{f}} + +\item{.overwrite}{(bool) If \code{f} is already an output from +\code{partial_apply()}, whether parameter re-definitions should +be ignored or overwrite the existing definitions} +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Specify some but not all arguments to a function. +} +\keyword{internal} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 64d16f01..0d98b13d 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -9,6 +9,11 @@ template: includes: in_header: | + + + + + after_body: | @@ -130,15 +135,12 @@ reference: - IterableMatrix-methods - pseudobulk_matrix -- title: "Reference Annotations" +- title: "Single-cell analysis helpers" +- subtitle: "Dimensionality reduction" - contents: - - human_gene_mapping - - match_gene_symbol - - read_gtf - - read_bed - - read_ucsc_chrom_sizes - -- title: "Clustering" + - normalize_log + - select_features_variance +- subtitle: "Clustering" - contents: - knn_hnsw - cluster_graph_leiden @@ -176,3 +178,11 @@ reference: - discrete_palette - collect_features - rotate_x_labels + +- title: "Reference Annotations" +- contents: + - human_gene_mapping + - match_gene_symbol + - read_gtf + - read_bed + - read_ucsc_chrom_sizes diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 385605e0..24cd9c23 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -346,3 +346,48 @@ test_that("linear regression works", { expect_equal(as(m1, "matrix"), ans) expect_equal(as(m1t, "matrix"), ans) }) + +test_that("tf-idf normalization works", { + m <- generate_sparse_matrix(5, 5) + rownames(m) <- paste0("row", seq_len(nrow(m))) + rev_rownames <- rev(rownames(m)) + # Create tf-idf normalization for dgCMatrix + res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4) + + rownames(res_dgc) <- rownames(m) + m2 <- as(m, "IterableMatrix") + # Check that we can pass in row means as a (named) vector + row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",] + # Test that row means ordering does not matter as long as names exist + row_means_shuffled <- row_means[sample(1:length(row_means))] + # Test that row means can have an extra element as long as all rownames are in the vector + row_means_plus_one <- c(row_means, row6 = 1) + + + res <- normalize_tfidf(m2) + expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) + res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2) + expect_equal(res_with_row_means, res_with_row_means_partial) + expect_identical(res, res_with_row_means) + + res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) + expect_identical(res_with_row_means, res_with_shuffled_row_means, res) + + res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one) + expect_identical(res, res_with_row_means_with_extra_element) +}) + +test_that("normalize_log works", { + m <- generate_sparse_matrix(5, 5) + res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix") + m2 <- as(m, "IterableMatrix") + # Test that default params yield the same as log1p on dgCMatrix + res_1 <- as(normalize_log(m2), "dgCMatrix") + expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6) + + # Test that changing scale factor works + res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix") + expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) +}) \ No newline at end of file diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 85ad0be3..08e41b7c 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -13,6 +13,30 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +test_that("select_features works general case", { + m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") + for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) { + res <- do.call(fn, list(m1, num_feats = 5)) + expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting + expect_equal(sum(res$highly_variable), 5) # Only 10 features marked as highly variable + expect_setequal(res$names, rownames(m1)) + res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + res_feats_partial <- get(fn)(num_feats = 100)(m1) + expect_identical(res_feats_equal_rows, res_feats_partial) + expect_identical(res_more_feats_than_rows, res_feats_equal_rows) + if (fn == "select_features_variance") { + # Check that normalization actually does something + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + # Check that we can do partial functions on normalization too + res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = normalize_log(scale = 1e3, threads = 1L))) + res_norm_implicit_partial <- select_features_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) + expect_identical(res_norm_partial, res_norm_implicit_partial) + expect_true(!all((res_no_norm %>% dplyr::arrange(names))$score == (res_norm_partial %>% dplyr::arrange(names))$score)) + } + } +}) + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -161,4 +185,5 @@ test_that("Pseudobulk aggregation works with multiple return types", { } } } -}) \ No newline at end of file +}) +