From 0795bfe01af8fef574fea8f4b74da477ad38d4c8 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 12 Nov 2024 18:40:01 -0800 Subject: [PATCH 1/5] [r] add poc matrix projection interface --- r/NAMESPACE | 15 + r/R/matrix_project_predict.R | 377 ++++++++++++++++++ r/man/Estimator.Rd | 17 + r/man/LSITransformer.Rd | 14 + r/man/Pipeline.Rd | 26 ++ r/man/PipelineBase.Rd | 15 + r/man/PipelineStep.Rd | 17 + r/man/Transformer.Rd | 27 ++ .../c-open-paren-PipelineBase-close-paren.Rd | 16 + ...PipelineBase-IterableMatrix-close-paren.Rd | 21 + ...PipelineBase-IterableMatrix-close-paren.Rd | 19 + ...PipelineBase-IterableMatrix-close-paren.Rd | 19 + r/pkgdown/_pkgdown.yml | 25 ++ 13 files changed, 608 insertions(+) create mode 100644 r/R/matrix_project_predict.R create mode 100644 r/man/Estimator.Rd create mode 100644 r/man/LSITransformer.Rd create mode 100644 r/man/Pipeline.Rd create mode 100644 r/man/PipelineBase.Rd create mode 100644 r/man/PipelineStep.Rd create mode 100644 r/man/Transformer.Rd create mode 100644 r/man/c-open-paren-PipelineBase-close-paren.Rd create mode 100644 r/man/fit-open-paren-PipelineBase-IterableMatrix-close-paren.Rd create mode 100644 r/man/predict-open-paren-PipelineBase-IterableMatrix-close-paren.Rd create mode 100644 r/man/transform-open-paren-PipelineBase-IterableMatrix-close-paren.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 304a2878..d576049d 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -7,6 +7,8 @@ S3method(svds,default) export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") +export(LSITransformer) +export(Pipeline) export(add_cols) export(add_rows) export(all_matrix_inputs) @@ -42,6 +44,7 @@ export(discrete_palette) export(draw_trackplot_grid) export(expm1_slow) export(extend_ranges) +export(fit) export(footprint) export(gene_region) export(gene_score_archr) @@ -84,6 +87,7 @@ export(plot_read_count_knee) export(plot_tf_footprint) export(plot_tss_profile) export(plot_tss_scatter) +export(predict) 
export(prefix_cell_names) export(pseudobulk_matrix) export(qc_scATAC) @@ -125,6 +129,7 @@ export(trackplot_gene) export(trackplot_genome_annotation) export(trackplot_loop) export(trackplot_scalebar) +export(transform) export(transpose_storage_order) export(write_fragments_10x) export(write_fragments_dir) @@ -136,9 +141,19 @@ export(write_matrix_anndata_hdf5) export(write_matrix_dir) export(write_matrix_hdf5) export(write_matrix_memory) +exportClasses(Estimator) +exportClasses(LSITransformer) +exportClasses(PipelineBase) +exportClasses(PipelineStep) +exportClasses(Transformer) exportMethods(as.data.frame) exportMethods(as.matrix) +exportMethods(c) +exportMethods(fit) +exportMethods(predict) +exportMethods(show) exportMethods(t) +exportMethods(transform) importClassesFrom(Matrix,dgCMatrix) importFrom(Matrix,colMeans) importFrom(Matrix,colSums) diff --git a/r/R/matrix_project_predict.R b/r/R/matrix_project_predict.R new file mode 100644 index 00000000..f769d713 --- /dev/null +++ b/r/R/matrix_project_predict.R @@ -0,0 +1,377 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + + +#' Pipeline Base Class +#' @slot fitted (logical) Whether the pipeline has been fitted +#' @name PipelineBase +#' @export +setClass("PipelineBase", + contains = "VIRTUAL", + slots = list( + fitted = "logical" + ), + prototype = list( + fitted = FALSE + ) +) + +#' Fit the pipeline object to data +#' @param object (PipelineBase) The pipeline object to fit. +#' @param x (IterableMatrix) Input data to be fitted on. +#' @param y Optional output data to be fitted on. Required if the final step is an Estimator, else ignored. +#' @return The fitted pipeline object. +#' @name fit(PipelineBase,IterableMatrix) +#' @export +setGeneric("fit", function(object, x, y = NULL, ...) 
standardGeneric("fit")) + +#' @export +setMethod("fit", signature(object = "PipelineBase", x = "IterableMatrix"), function(object, x, y = NULL, ...) { + stop("fit() method not implemented for PipelineBase") +}) + +#' Transform the input data using a fitted pipeline +#' @param object (PipelineBase) The fitted pipeline object +#' @param x (IterableMatrix) Input data to be transformed +#' @return Data transformed by the pipeline +#' @name transform(PipelineBase,IterableMatrix) +#' @export +setGeneric("transform", function(object, x, ...) standardGeneric("transform")) + +#' @export +setMethod("transform", signature(object = "PipelineBase", x = "IterableMatrix"), function(object, x, ...) { + stop("transform() method not implemented for PipelineBase") +}) + +#' Predict the output data using a fitted pipeline +#' @param object (PipelineBase) The fitted pipeline object +#' @param x (IterableMatrix) Input data to be predicted +#' @return Predicted output data +#' @name predict(PipelineBase,IterableMatrix) +#' @export +setGeneric("predict", function(object, x, ...) standardGeneric("predict")) + +#' @export +setMethod("predict", signature(object = "PipelineBase", x = "IterableMatrix"), function(object, x, ...) { + stop("predict() method not implemented for PipelineBase") +}) + +#' Combine pipeline objects, to create a new pipeline object. +#' @param x (PipelineBase) The pipeline object to combine to. +#' @param ... (PipelineBase) The pipeline objects to combine from. +#' @name c(PipelineBase) +#' @export +setMethod("c", signature(x = "PipelineBase"), function(x, ...) 
{ + stop("c() method not implemented for PipelineBase") +}) + +setMethod("show", signature(object = "PipelineBase"), function(object) { + stop("show() method not implemented for PipelineBase") +}) + +#' S4 class for combining multiple pipeline steps into a single pipeline +#' @slot steps (list) List of pipeline steps to execute in order +#' @name Pipeline +setClass( + "Pipeline", + contains = "PipelineBase", + slots = list( + steps = "list" + ), + prototype = list( + steps = list() + ) +) + +#' Return a new Pipeline object +#' @param steps A list of ordered steps to be executed in the pipeline. +#' @return A new Pipeline object. +#' @export +Pipeline <- function(steps = list()) { + return(new("Pipeline", steps = steps)) +} + +#' Fit the pipeline object to data +#' @param object (Pipeline) The pipeline object to fit. +#' @param x (IterableMatrix) Input data to be fitted on. +#' @param y Optional output data to be fitted on. Required if the final step is an Estimator, else ignored. +#' @return The fitted pipeline object. +#' @noRd +#' @export +setMethod("fit", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, y = NULL, ...) { + steps <- object@steps + # Check if all steps are transformers, with the final step being either an estimator or a transformer + for (i in seq_along(steps)) { + step <- steps[[i]] + # allow to fit with estimators as well + if (i < length(steps)) { + assert_is(step, "PipelineStep") + } else { + assert_is(step, c("PipelineStep")) + if (!is.null(y)) { + assert_is(step, "Estimator") + } + } + } + # Fit every step in the pipeline + for (i in seq_along(steps)) { + step <- steps[[i]] + # allow to fit with estimators as well + if (i < length(steps) || is.null(y)) { + step <- fit(step, x, ...) + x <- transform(step, x) + if (is(x, "dgCMatrix")) x <- as(x, "IterableMatrix") + } else { + step <- fit(step, x, y, ...) 
+ } + steps[[i]] <- step + } + object@steps <- steps + object@fitted <- TRUE + return(object) +}) + + +#' Transform the input data using a fitted pipeline +#' @param object (Pipeline) The fitted pipeline object +#' @param x (IterableMatrix) Input data to be transformed +#' @return Data transformed by the pipeline +#' @noRd +#' @export +setMethod("transform", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, ...) { + if (!object@fitted) stop("Pipeline must be fitted before transforming") + steps <- object@steps + for (step in steps) { + if (is(step, "Transformer")) x <- transform(step, x) + # Some actions convert matrices to a different type, so we need to convert back to IterableMatrix + # for following steps + if (is(x, "dgCMatrix")) x <- as(x, "IterableMatrix") + } + return(x) +}) + + +#' Predict the output data using a fitted pipeline +#' @param object (Pipeline) The fitted pipeline object +#' @param x (IterableMatrix) Input data to be predicted +#' @return Predicted output data +#' @noRd +#' @export +setMethod("predict", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, ...) { + if (!object@fitted) stop("Pipeline must be fitted before predicting") + steps <- object@steps + for (i in seq_along(steps)) { + step <- steps[[i]] + if (i < n_steps) { + x <- transform(step, x) + } else if (is(step, "Estimator")) { + y_pred <- predict(step, x) + return(y_pred) + } else { + stop("The final step must be an estimator with a predict method") + } + } +}) + +setMethod("short_description", "Pipeline", function(x) { + character(0) +}) + +#' Show the pipeline steps, demonstrating how to recreate the pipeline with a function call. 
+#' @param object (Pipeline) The pipeline object to show +#' @noRd +#' @export +setMethod("show", signature(object = "Pipeline"), function(object) { + fitted <- ifelse(object@fitted, "Fitted", "Unfitted") + cat(fitted, " Pipeline with steps:\n") + cat("Pipeline(\n") + for (i in seq_along(object@steps)) { + step <- object@steps[[i]] + cat("\t", short_description(step)) + if (i < length(object@steps)) { + cat(",") + } + cat("\n") + } + cat(")\n") +}) + +#' Add steps to a pipeline, where the first argument is the pipeline object and the rest are the steps to add. +#' Requires for every additional step to be a pipeline object +#' @param x (Pipeline) The PipelineBase object to add steps to +#' @param ... (PipelineBase) The steps to add to the pipeline +#' @noRd +#' @export +setMethod("c", signature(x = "Pipeline"), function(x, ...) { + pipelines <- list(...) + steps <- x@steps + for (pipe in pipelines) { + assert_is(pipe, "PipelineBase") + # If the step is a pipeline, combine the steps. Else, add the single step. + steps <- ifelse(is(pipe, "PipelineStep"), c(steps, pipe), c(steps, pipe@steps)) + } + + # If all the steps are fitted, the pipeline overall is fitted. + # We trust the user to have fitted the pipelines with the same data + new_pipeline <- Pipeline(steps = steps) + fitted <- TRUE + for (step in steps) { + if (!step@fitted) { + fitted <- FALSE + } + } + new_pipeline@fitted <- fitted + return(new_pipeline) +}) + +#' S4 Class Representing a single transformer or predictor +#' @slot step_name (character) Name of the step +#' @slot fitted (logical) Whether the pipeline has been fitted +#' @name PipelineStep +#' @export +setClass("PipelineStep", + contains = "PipelineBase", + slots = list( + step_name = "character" + ), + prototype = list( + step_name = "" + ) +) + +#' Create a Pipeline out of pipeline steps. +#' @param x (PipelineStep) The initial pipeline step we want to add to the pipeline +#' @param ... 
(PipelineBase) The additional pipeline steps to add to the pipeline. These can be either PipelineStep or Pipeline objects. +#' @return A new Pipeline object with the steps added. +#' @noRd +#' @export +setMethod("c", signature(x = "PipelineStep"), function(x, ...) { + pipelines <- list(...) + steps <- list(x) + for (pipe in pipelines) { + assert_is(pipe, "PipelineBase") + steps <- ifelse(is(pipe, "PipelineStep"), c(steps, pipe), c(steps, pipe@steps)) + } + new_pipeline <- Pipeline(steps = steps) + fitted <- TRUE + for (step in steps) { + if (!step@fitted) fitted <- FALSE + } +}) + + +setMethod("show", signature(object = "PipelineStep"), function(object) { + cat(short_description(object)) + cat("\n") +}) + + +#' S4 Class representing an operation that transforms data, and holds fitted parameters +#' @slot step_name (character) Name of the step +#' @slot fitted (logical) Whether the pipeline has been fitted +#' @details Transformers represent single operations (derived from the PipelineStep class) that transform data. +#' They can be fit to data within an IterableMatrix object, which will be used to hold the fitted parameters. +#' Using the transform method on a fitted transformer will apply the transformation to the data and return +#' the transformed data as an IterableMatrix object. +#' +#' These objects can be combined into a Pipeline object using the `c()` function, with other transformers, or estimators. +#' Transformers can also be combined with full pipelines, to create a new pipeline object. +#' Derived classes should implement the `fit()`, `transform()`, and `short_description()` methods. +#' @name Transformer +#' @export +setClass("Transformer", + contains = "PipelineStep" +) + + +#' Perform latent semantic indexing (LSI) on a matrix. 
+#' @name LSITransformer +#' @export +setClass("LSITransformer", + contains = "Transformer", + slots = list( + idf_ = "numeric", + svd_attr_ = "list", + z_score_norm = "logical", + n_dimensions = "integer", + scale_factor = "integer", + threads = "integer" + ), + prototype = list( + idf_ = numeric(0), + svd_attr_ = list(), + z_score_norm = FALSE, + n_dimensions = 20L, + scale_factor = 1e4L, + threads = 1L + ) +) + +#' Create a new LSITransformer object +#' @export +LSITransformer <- function(z_score_norm, n_dimensions, scale_factor, threads) { + return(new( + "LSITransformer", z_score_norm = z_score_norm, n_dimensions = n_dimensions, + scale_factor = scale_factor, threads = threads, step_name = "LSITransformer")) +} + +setMethod("fit", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) { + ret <- lsi( + x, z_score_norm = object@z_score_norm, n_dimensions = object@n_dimensions, + scale_factor = object@scale_factor, threads = object@threads, + save_lsi = TRUE + ) + object@idf_ <- ret$idf + object@svd_attr_ <- ret$svd_attr + object@fitted <- TRUE + return(object) +}) + +setMethod("transform", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) { + # rudimentary implementation -- Works but is duplicate code. 
+ assert_true(object@fitted) + # Wait until LSI PR has been reviewed + npeaks <- colSums(x) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + tf <- x %>% multiply_cols(1 / npeaks) + mat_tfidf <- tf %>% multiply_rows(object@idf_) + mat_log_tfidf <- log1p(object@scale_factor * mat_tfidf) + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + if (object@z_score_norm) { + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = object@threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_log_tfidf <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + } + pca_res <- t(object@svd_attr_$u) %*% mat_log_tfidf + return(pca_res) +}) + +setMethod("short_description", "LSITransformer", function(x) { + return(sprintf("LSITransformer(z_score_norm=%s, n_dimensions=%d, scale_factor=%d, threads=%d)", + x@z_score_norm, x@n_dimensions, x@scale_factor, x@threads)) +}) + +setClass("VarFeatSelectorTransformer", + contains = "Transformer", + slots = list( + num_feats = "integer", + n_bins = "integer" + ) +) + + +#' S4 Class representing an operation that predicts data, and holds fitted parameters. 
+#' @slot step_name (character) Name of the step +#' @slot fitted (logical) Whether the pipeline has been fitted +#' @name Estimator +#' @export +setClass("Estimator", + contains = "PipelineStep" +) \ No newline at end of file diff --git a/r/man/Estimator.Rd b/r/man/Estimator.Rd new file mode 100644 index 00000000..014f701c --- /dev/null +++ b/r/man/Estimator.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_project_predict.R +\docType{class} +\name{Estimator} +\alias{Estimator} +\title{S4 Class representing an operation that predicts data, and holds fitted parameters.} +\description{ +S4 Class representing an operation that predicts data, and holds fitted parameters. +} +\section{Slots}{ + +\describe{ +\item{\code{step_name}}{(character) Name of the step} + +\item{\code{fitted}}{(logical) Whether the pipeline has been fitted} +}} + diff --git a/r/man/LSITransformer.Rd b/r/man/LSITransformer.Rd new file mode 100644 index 00000000..dbc15831 --- /dev/null +++ b/r/man/LSITransformer.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_project_predict.R +\docType{class} +\name{LSITransformer} +\alias{LSITransformer} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +LSITransformer(z_score_norm, n_dimensions, scale_factor, threads) +} +\description{ +Perform latent semantic indexing (LSI) on a matrix. 
+ +Create a new LSITransformer object +} diff --git a/r/man/Pipeline.Rd b/r/man/Pipeline.Rd new file mode 100644 index 00000000..c589a33f --- /dev/null +++ b/r/man/Pipeline.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_project_predict.R +\docType{class} +\name{Pipeline} +\alias{Pipeline} +\title{S4 class for combining multiple pipeline steps into a single pipeline} +\usage{ +Pipeline(steps = list()) +} +\arguments{ +\item{steps}{A list of ordered steps to be executed in the pipeline.} +} +\value{ +A new Pipeline object. +} +\description{ +S4 class for combining multiple pipeline steps into a single pipeline + +Return a new Pipeline object +} +\section{Slots}{ + +\describe{ +\item{\code{steps}}{(list) List of pipeline steps to execute in order} +}} + diff --git a/r/man/PipelineBase.Rd b/r/man/PipelineBase.Rd new file mode 100644 index 00000000..bccad74a --- /dev/null +++ b/r/man/PipelineBase.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_project_predict.R +\docType{class} +\name{PipelineBase} +\alias{PipelineBase} +\title{Pipeline Base Class} +\description{ +Pipeline Base Class +} +\section{Slots}{ + +\describe{ +\item{\code{fitted}}{(logical) Whether the pipeline has been fitted} +}} + diff --git a/r/man/PipelineStep.Rd b/r/man/PipelineStep.Rd new file mode 100644 index 00000000..6bb56b0a --- /dev/null +++ b/r/man/PipelineStep.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_project_predict.R +\docType{class} +\name{PipelineStep} +\alias{PipelineStep} +\title{S4 Class Representing a single transformer or predictor} +\description{ +S4 Class Representing a single transformer or predictor +} +\section{Slots}{ + +\describe{ +\item{\code{step_name}}{(character) Name of the step} + +\item{\code{fitted}}{(logical) Whether the pipeline has been fitted} +}} + diff --git a/r/man/Transformer.Rd 
b/r/man/Transformer.Rd new file mode 100644 index 00000000..1b117cff --- /dev/null +++ b/r/man/Transformer.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_project_predict.R +\docType{class} +\name{Transformer} +\alias{Transformer} +\title{S4 Class representing an operation that transforms data, and holds fitted parameters} +\description{ +S4 Class representing an operation that transforms data, and holds fitted parameters +} +\details{ +Transformers represent single operations (derived from the PipelineStep class) that transform data. +They can be fit to data within an IterableMatrix object, which will be used to hold the fitted parameters. +Using the transform method on a fitted transformer will apply the transformation to the data and return +the transformed data as an IterableMatrix object. + +These objects can be combined into a Pipeline object using the \code{c()} function, with other transformers, or estimators. +Transformers can also be combined with full pipelines, to create a new pipeline object. +Derived classes should implement the \code{fit()}, \code{transform()}, and \code{short_description()} methods. +} +\section{Slots}{ + +\describe{ +\item{\code{step_name}}{(character) Name of the step} + +\item{\code{fitted}}{(logical) Whether the pipeline has been fitted} +}} + diff --git a/r/man/c-open-paren-PipelineBase-close-paren.Rd b/r/man/c-open-paren-PipelineBase-close-paren.Rd new file mode 100644 index 00000000..efe4cd9c --- /dev/null +++ b/r/man/c-open-paren-PipelineBase-close-paren.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_project_predict.R +\name{c(PipelineBase)} +\alias{c(PipelineBase)} +\title{Combine pipeline objects, to create a new pipeline object.} +\usage{ +\S4method{c}{PipelineBase}(x, ...) 
+} +\arguments{ +\item{x}{(PipelineBase) The pipeline object to combine to.} + +\item{...}{(PipelineBase) The pipeline objects to combine from.} +} +\description{ +Combine pipeline objects, to create a new pipeline object. +} diff --git a/r/man/fit-open-paren-PipelineBase-IterableMatrix-close-paren.Rd b/r/man/fit-open-paren-PipelineBase-IterableMatrix-close-paren.Rd new file mode 100644 index 00000000..b722ea85 --- /dev/null +++ b/r/man/fit-open-paren-PipelineBase-IterableMatrix-close-paren.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_project_predict.R +\name{fit(PipelineBase,IterableMatrix)} +\alias{fit(PipelineBase,IterableMatrix)} +\title{Fit the pipeline object to data} +\usage{ +fit(object, x, y = NULL, ...) +} +\arguments{ +\item{object}{(PipelineBase) The pipeline object to fit.} + +\item{x}{(IterableMatrix) Input data to be fitted on.} + +\item{y}{Optional output data to be fitted on. Required if the final step is an Estimator, else ignored.} +} +\value{ +The fitted pipeline object. +} +\description{ +Fit the pipeline object to data +} diff --git a/r/man/predict-open-paren-PipelineBase-IterableMatrix-close-paren.Rd b/r/man/predict-open-paren-PipelineBase-IterableMatrix-close-paren.Rd new file mode 100644 index 00000000..98f603b9 --- /dev/null +++ b/r/man/predict-open-paren-PipelineBase-IterableMatrix-close-paren.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_project_predict.R +\name{predict(PipelineBase,IterableMatrix)} +\alias{predict(PipelineBase,IterableMatrix)} +\title{Predict the output data using a fitted pipeline} +\usage{ +predict(object, x, ...) 
+} +\arguments{ +\item{object}{(PipelineBase) The fitted pipeline object} + +\item{x}{(IterableMatrix) Input data to be predicted} +} +\value{ +Predicted output data +} +\description{ +Predict the output data using a fitted pipeline +} diff --git a/r/man/transform-open-paren-PipelineBase-IterableMatrix-close-paren.Rd b/r/man/transform-open-paren-PipelineBase-IterableMatrix-close-paren.Rd new file mode 100644 index 00000000..5187c12c --- /dev/null +++ b/r/man/transform-open-paren-PipelineBase-IterableMatrix-close-paren.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_project_predict.R +\name{transform(PipelineBase,IterableMatrix)} +\alias{transform(PipelineBase,IterableMatrix)} +\title{Transform the input data using a fitted pipeline} +\usage{ +transform(object, x, ...) +} +\arguments{ +\item{object}{(PipelineBase) The fitted pipeline object} + +\item{x}{(IterableMatrix) Input data to be transformed} +} +\value{ +Data transformed by the pipeline +} +\description{ +Transform the input data using a fitted pipeline +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index c08f55f0..04570ff9 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -173,3 +173,28 @@ reference: - discrete_palette - collect_features - rotate_x_labels + + +- title: "Matrix Projections and Predictions" + +- subtitle: "Pipeline S4 Classes" +- desc: Base class for pipeline objects +- contents: + - PipelineBase + - PipelineStep + - Pipeline + - Transformer + - Estimator + + +- subtitle: "Pipeline Methods" +- contents: + - fit(PipelineBase,IterableMatrix) + - predict(PipelineBase,IterableMatrix) + - transform(PipelineBase,IterableMatrix) + - c(PipelineBase) + + +- subtitle: "Transformers" +- contents: + - LSITransformer From ce0aa214618af1c85a6adc633b5440377b7b2413 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 13 Nov 2024 18:40:54 -0800 Subject: [PATCH 2/5] [r] clean up generics, rename functions, provide 
more docstrings --- r/NAMESPACE | 10 +- ...x_project_predict.R => matrix_pipelines.R} | 223 +++++++----------- r/man/Estimator.Rd | 16 +- r/man/LSITransformer.Rd | 14 -- r/man/Pipeline.Rd | 9 +- r/man/PipelineBase.Rd | 15 -- r/man/PipelineStep.Rd | 6 +- r/man/Transformer.Rd | 14 +- .../c-open-paren-PipelineBase-close-paren.Rd | 2 +- r/man/call_macs_peaks.Rd | 14 ++ ...PipelineBase-IterableMatrix-close-paren.Rd | 19 ++ ...PipelineBase-IterableMatrix-close-paren.Rd | 16 +- ...PipelineBase-IterableMatrix-close-paren.Rd | 19 -- ...PipelineBase-IterableMatrix-close-paren.Rd | 19 ++ ...PipelineBase-IterableMatrix-close-paren.Rd | 19 -- r/man/write_insertion_bed.Rd | 2 +- r/pkgdown/_pkgdown.yml | 10 +- 17 files changed, 187 insertions(+), 240 deletions(-) rename r/R/{matrix_project_predict.R => matrix_pipelines.R} (58%) delete mode 100644 r/man/LSITransformer.Rd delete mode 100644 r/man/PipelineBase.Rd create mode 100644 r/man/call_macs_peaks.Rd create mode 100644 r/man/estimate-open-paren-PipelineBase-IterableMatrix-close-paren.Rd delete mode 100644 r/man/predict-open-paren-PipelineBase-IterableMatrix-close-paren.Rd create mode 100644 r/man/project-open-paren-PipelineBase-IterableMatrix-close-paren.Rd delete mode 100644 r/man/transform-open-paren-PipelineBase-IterableMatrix-close-paren.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index d576049d..55d4adef 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -7,7 +7,6 @@ S3method(svds,default) export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") -export(LSITransformer) export(Pipeline) export(add_cols) export(add_rows) @@ -42,6 +41,7 @@ export(convert_matrix_type) export(convert_to_fragments) export(discrete_palette) export(draw_trackplot_grid) +export(estimate) export(expm1_slow) export(extend_ranges) export(fit) @@ -87,8 +87,8 @@ export(plot_read_count_knee) export(plot_tf_footprint) export(plot_tss_profile) export(plot_tss_scatter) -export(predict) export(prefix_cell_names) +export(project) 
export(pseudobulk_matrix) export(qc_scATAC) export(range_distance_to_nearest) @@ -129,7 +129,6 @@ export(trackplot_gene) export(trackplot_genome_annotation) export(trackplot_loop) export(trackplot_scalebar) -export(transform) export(transpose_storage_order) export(write_fragments_10x) export(write_fragments_dir) @@ -142,18 +141,17 @@ export(write_matrix_dir) export(write_matrix_hdf5) export(write_matrix_memory) exportClasses(Estimator) -exportClasses(LSITransformer) exportClasses(PipelineBase) exportClasses(PipelineStep) exportClasses(Transformer) exportMethods(as.data.frame) exportMethods(as.matrix) exportMethods(c) +exportMethods(estimate) exportMethods(fit) -exportMethods(predict) +exportMethods(project) exportMethods(show) exportMethods(t) -exportMethods(transform) importClassesFrom(Matrix,dgCMatrix) importFrom(Matrix,colMeans) importFrom(Matrix,colSums) diff --git a/r/R/matrix_project_predict.R b/r/R/matrix_pipelines.R similarity index 58% rename from r/R/matrix_project_predict.R rename to r/R/matrix_pipelines.R index f769d713..8571daed 100644 --- a/r/R/matrix_project_predict.R +++ b/r/R/matrix_pipelines.R @@ -10,6 +10,7 @@ #' Pipeline Base Class #' @slot fitted (logical) Whether the pipeline has been fitted #' @name PipelineBase +#' @noRd #' @export setClass("PipelineBase", contains = "VIRTUAL", @@ -21,44 +22,47 @@ setClass("PipelineBase", ) ) -#' Fit the pipeline object to data +#' Fit a pipeline object to data #' @param object (PipelineBase) The pipeline object to fit. #' @param x (IterableMatrix) Input data to be fitted on. -#' @param y Optional output data to be fitted on. Required if the final step is an Estimator, else ignored. +#' @param y Optional output data to be fitted on. Required if the final step is a supervised Estimator, else ignored. #' @return The fitted pipeline object. +#' @details The `fit()` method is used to fit a pipeline object to data and a potential label output. 
Within single estimators, the `fit()` method only +#' takes the input data to be fitted on. Within pipelines, the `fit()` method sequentially fits the transformers on each non-terminal step of the pipeline. More specifically, +#' The input data is transformed by each transformer, and used to fit the next transformer in the pipeline. If the final step is an estimator, the input IterableMatrix +#' and label (if supervised) are used to fit the estimator. +#' +#' The fitted pipeline object is returned, allowing for projection of new data. #' @name fit(PipelineBase,IterableMatrix) #' @export setGeneric("fit", function(object, x, y = NULL, ...) standardGeneric("fit")) - #' @export setMethod("fit", signature(object = "PipelineBase", x = "IterableMatrix"), function(object, x, y = NULL, ...) { stop("fit() method not implemented for PipelineBase") }) -#' Transform the input data using a fitted pipeline -#' @param object (PipelineBase) The fitted pipeline object +#' Project input data using a fitted pipeline +#' @param object (PipelineBase) A fitted pipeline object #' @param x (IterableMatrix) Input data to be transformed -#' @return Data transformed by the pipeline -#' @name transform(PipelineBase,IterableMatrix) +#' @return Data projected by the pipeline +#' @name project(PipelineBase,IterableMatrix) #' @export -setGeneric("transform", function(object, x, ...) standardGeneric("transform")) - +setGeneric("project", function(object, x, ...) standardGeneric("project")) #' @export -setMethod("transform", signature(object = "PipelineBase", x = "IterableMatrix"), function(object, x, ...) { - stop("transform() method not implemented for PipelineBase") +setMethod("project", signature(object = "PipelineBase", x = "IterableMatrix"), function(object, x, ...) 
{ + stop("project() method not implemented for PipelineBase") }) -#' Predict the output data using a fitted pipeline -#' @param object (PipelineBase) The fitted pipeline object -#' @param x (IterableMatrix) Input data to be predicted -#' @return Predicted output data -#' @name predict(PipelineBase,IterableMatrix) +#' Estimate predictions on the output data using a fitted pipeline +#' @param object (PipelineBase) The fitted pipeline object. Either the final step is an Estimator, or the pipeline is a single Estimator. +#' @param x (IterableMatrix) Input data to be estimated on +#' @return Predicted output labels +#' @name estimate(PipelineBase,IterableMatrix) #' @export -setGeneric("predict", function(object, x, ...) standardGeneric("predict")) - +setGeneric("estimate", function(object, x, ...) standardGeneric("estimate")) #' @export -setMethod("predict", signature(object = "PipelineBase", x = "IterableMatrix"), function(object, x, ...) { - stop("predict() method not implemented for PipelineBase") +setMethod("estimate", signature(object = "PipelineBase", x = "IterableMatrix"), function(object, x, ...) { + stop("estimate() method not implemented for PipelineBase") }) #' Combine pipeline objects, to create a new pipeline object. @@ -69,7 +73,6 @@ setMethod("predict", signature(object = "PipelineBase", x = "IterableMatrix"), f setMethod("c", signature(x = "PipelineBase"), function(x, ...) { stop("c() method not implemented for PipelineBase") }) - setMethod("show", signature(object = "PipelineBase"), function(object) { stop("show() method not implemented for PipelineBase") }) @@ -88,11 +91,24 @@ setClass( ) ) -#' Return a new Pipeline object +#' Return a new Pipeline object. #' @param steps A list of ordered steps to be executed in the pipeline. #' @return A new Pipeline object. +#' @details Creating a pipeline object can be done by passing a list of pipeline steps to the constructor. +#' Creation only expects that all steps make logical sense. 
i.e., the final step can be either an Estimator or a Transformer, +#' but each intermediate step cannot be an Estimator. #' @export Pipeline <- function(steps = list()) { + # Check if all steps are transformers, with the final step being either an estimator or a transformer + for (i in seq_along(steps)) { + step <- steps[[i]] + # allow to fit with estimators as well + if (i < length(steps)) { + assert_is(step, "Transformer") + } else { + assert_is(step, "PipelineStep") + } + } return(new("Pipeline", steps = steps)) } @@ -124,7 +140,7 @@ setMethod("fit", signature(object = "Pipeline", x = "IterableMatrix"), function( # allow to fit with estimators as well if (i < length(steps) || is.null(y)) { step <- fit(step, x, ...) - x <- transform(step, x) + x <- project(step, x) if (is(x, "dgCMatrix")) x <- as(x, "IterableMatrix") } else { step <- fit(step, x, y, ...) @@ -137,17 +153,17 @@ setMethod("fit", signature(object = "Pipeline", x = "IterableMatrix"), function( }) -#' Transform the input data using a fitted pipeline +#' Project input data using a fitted pipeline #' @param object (Pipeline) The fitted pipeline object -#' @param x (IterableMatrix) Input data to be transformed -#' @return Data transformed by the pipeline +#' @param x (IterableMatrix) Input data to be used by the pipeline to project new data. +#' @return Data projected by the pipeline #' @noRd #' @export -setMethod("transform", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, ...) { - if (!object@fitted) stop("Pipeline must be fitted before transforming") +setMethod("project", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, ...) 
{ + if (!object@fitted) stop("Pipeline must be fitted before projecting") steps <- object@steps for (step in steps) { - if (is(step, "Transformer")) x <- transform(step, x) + if (is(step, "Transformer")) x <- project(step, x) # Some actions convert matrices to a different type, so we need to convert back to IterableMatrix # for following steps if (is(x, "dgCMatrix")) x <- as(x, "IterableMatrix") @@ -156,27 +172,26 @@ setMethod("transform", signature(object = "Pipeline", x = "IterableMatrix"), fun }) -#' Predict the output data using a fitted pipeline -#' @param object (Pipeline) The fitted pipeline object -#' @param x (IterableMatrix) Input data to be predicted -#' @return Predicted output data -#' @noRd -#' @export -setMethod("predict", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, ...) { - if (!object@fitted) stop("Pipeline must be fitted before predicting") - steps <- object@steps - for (i in seq_along(steps)) { - step <- steps[[i]] - if (i < n_steps) { - x <- transform(step, x) - } else if (is(step, "Estimator")) { - y_pred <- predict(step, x) - return(y_pred) - } else { - stop("The final step must be an estimator with a predict method") - } - } -}) +# #' Estimate predictions on the output data using a fitted pipeline +# #' @param object (Pipeline) The fitted pipeline object +# #' @param x (IterableMatrix) Input data to be estimated on +# #' @noRd +# #' @export +# setMethod("estimate", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, ...) 
{ +# if (!object@fitted) stop("Pipeline must be fitted before estimating") +# steps <- object@steps +# for (i in seq_along(steps)) { +# step <- steps[[i]] +# if (i < n_steps) { +# x <- project(step, x) +# } else if (is(step, "Estimator")) { +# y_pred <- estimate(step, x) +# return(y_pred) +# } else { +# stop("The final step must be an estimator with a estimate method") +# } +# } +# }) setMethod("short_description", "Pipeline", function(x) { character(0) @@ -212,7 +227,7 @@ setMethod("c", signature(x = "Pipeline"), function(x, ...) { steps <- x@steps for (pipe in pipelines) { assert_is(pipe, "PipelineBase") - # If the step is a pipeline, combine the steps. Else, add the single step. + # If the step is a pipeline step, add the single step. Else, the step is a full pipeline and we want to move all the steps over. steps <- ifelse(is(pipe, "PipelineStep"), c(steps, pipe), c(steps, pipe@steps)) } @@ -229,7 +244,7 @@ setMethod("c", signature(x = "Pipeline"), function(x, ...) { return(new_pipeline) }) -#' S4 Class Representing a single transformer or predictor +#' PipelineBase representing a single step within a pipeline #' @slot step_name (character) Name of the step #' @slot fitted (logical) Whether the pipeline has been fitted #' @name PipelineStep @@ -240,7 +255,7 @@ setClass("PipelineStep", step_name = "character" ), prototype = list( - step_name = "" + step_name = character(0) ) ) @@ -262,6 +277,8 @@ setMethod("c", signature(x = "PipelineStep"), function(x, ...) 
{ for (step in steps) { if (!step@fitted) fitted <- FALSE } + new_pipeline@fitted <- fitted + return(new_pipeline) }) @@ -271,17 +288,17 @@ setMethod("show", signature(object = "PipelineStep"), function(object) { }) -#' S4 Class representing an operation that transforms data, and holds fitted parameters +#' PipelineStep representing an operation that transforms data, and holds fitted parameters #' @slot step_name (character) Name of the step #' @slot fitted (logical) Whether the pipeline has been fitted -#' @details Transformers represent single operations (derived from the PipelineStep class) that transform data. +#' @details Transformers represent single operations (derived from the PipelineStep class) that project data from an IterableMatrix to another IterableMatrix/dgCMatrix. #' They can be fit to data within an IterableMatrix object, which will be used to hold the fitted parameters. -#' Using the transform method on a fitted transformer will apply the transformation to the data and return -#' the transformed data as an IterableMatrix object. -#' +#' Using the `project()` method on a fitted transformer will apply the transformation to the data and return +#' the projected data as an IterableMatrix object, or a dgCMatrix. +#' #' These objects can be combined into a Pipeline object using the `c()` function, with other transformers, or estimators. #' Transformers can also be combined with full pipelines, to create a new pipeline object. -#' Derived classes should implement the `fit()`, `transform()`, and `short_description()` methods. +#' Derived classes should implement the `fit()`, `project()`, and `short_description()` methods. #' @name Transformer #' @export setClass("Transformer", @@ -289,87 +306,17 @@ setClass("Transformer", ) -#' Perform latent semantic indexing (LSI) on a matrix. 
-#' @name LSITransformer -#' @export -setClass("LSITransformer", - contains = "Transformer", - slots = list( - idf_ = "numeric", - svd_attr_ = "list", - z_score_norm = "logical", - n_dimensions = "integer", - scale_factor = "integer", - threads = "integer" - ), - prototype = list( - idf_ = numeric(0), - svd_attr_ = list(), - z_score_norm = FALSE, - n_dimensions = 20L, - scale_factor = 1e4L, - threads = 1L - ) -) - -#' Create a new LSITransformer object -#' @export -LSITransformer <- function(z_score_norm, n_dimensions, scale_factor, threads) { - return(new( - "LSITransformer", z_score_norm = z_score_norm, n_dimensions = n_dimensions, - scale_factor = scale_factor, threads = threads, step_name = "LSITransformer")) -} - -setMethod("fit", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) { - ret <- lsi( - x, z_score_norm = object@z_score_norm, n_dimensions = object@n_dimensions, - scale_factor = object@scale_factor, threads = object@threads, - save_lsi = TRUE - ) - object@idf_ <- ret$idf - object@svd_attr_ <- ret$svd_attr - object@fitted <- TRUE - return(object) -}) - -setMethod("transform", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) { - # rudimentary implementation -- Works but is duplicate code. 
- assert_true(object@fitted) - # Wait until LSI PR has been reviewed - npeaks <- colSums(x) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` - tf <- x %>% multiply_cols(1 / npeaks) - mat_tfidf <- tf %>% multiply_rows(object@idf_) - mat_log_tfidf <- log1p(object@scale_factor * mat_tfidf) - mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) - if (object@z_score_norm) { - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = object@threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_log_tfidf <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - } - pca_res <- t(object@svd_attr_$u) %*% mat_log_tfidf - return(pca_res) -}) - -setMethod("short_description", "LSITransformer", function(x) { - return(sprintf("LSITransformer(z_score_norm=%s, n_dimensions=%d, scale_factor=%d, threads=%d)", - x@z_score_norm, x@n_dimensions, x@scale_factor, x@threads)) -}) - -setClass("VarFeatSelectorTransformer", - contains = "Transformer", - slots = list( - num_feats = "integer", - n_bins = "integer" - ) -) - - -#' S4 Class representing an operation that predicts data, and holds fitted parameters. +#' PipelineStep representing an operation that estimates data, and holds fitted parameters. #' @slot step_name (character) Name of the step #' @slot fitted (logical) Whether the pipeline has been fitted +#' @details Estimators represent single operations (derived from the PipelineStep class) that make predictions based on data given by an +#' IterableMatrix. Additionally, supervised estimators will require a target numeric, or character array to be provided during a `fit()` call. +#' Unsupervised estimators, on the other hand, do not require a target array. 
Following a `fit()` call, the estimator will hold the fitted parameters +#' such that data can be labeled using the `estimate()` method. +#' +#' Estimators can be combined into a Pipeline object using the `c()` function, with other transformers. Estimators are required to be the terminal step +#' within a pipeline. Estimators can also be combined with full pipelines, to create a new pipeline object. +#' Derived classes should implement the `fit()`, `estimate()`, and `short_description()` methods. #' @name Estimator #' @export setClass("Estimator", diff --git a/r/man/Estimator.Rd b/r/man/Estimator.Rd index 014f701c..346fbc83 100644 --- a/r/man/Estimator.Rd +++ b/r/man/Estimator.Rd @@ -1,11 +1,21 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/matrix_project_predict.R +% Please edit documentation in R/matrix_pipelines.R \docType{class} \name{Estimator} \alias{Estimator} -\title{S4 Class representing an operation that predicts data, and holds fitted parameters.} +\title{PipelineStep representing an operation that estimates data, and holds fitted parameters.} \description{ -S4 Class representing an operation that predicts data, and holds fitted parameters. +PipelineStep representing an operation that estimates data, and holds fitted parameters. +} +\details{ +Estimators represent single operations (derived from the PipelineStep class) that make predictions based on data given by an +IterableMatrix. Additionally, supervised estimators will require a target numeric, or character array to be provided during a \code{fit()} call. +Unsupervised estimators, on the other hand, do not require a target array. Following a \code{fit()} call, the estimator will hold the fitted parameters +such that data can be labeled using the \code{estimate()} method. + +Estimators can be combined into a Pipeline object using the \code{c()} function, with other transformers. Estimators are required to be the terminal step +within a pipeline. 
Estimators can also be combined with full pipelines, to create a new pipeline object. +Derived classes should implement the \code{fit()}, \code{estimate()}, and \code{short_description()} methods. } \section{Slots}{ diff --git a/r/man/LSITransformer.Rd b/r/man/LSITransformer.Rd deleted file mode 100644 index dbc15831..00000000 --- a/r/man/LSITransformer.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/matrix_project_predict.R -\docType{class} -\name{LSITransformer} -\alias{LSITransformer} -\title{Perform latent semantic indexing (LSI) on a matrix.} -\usage{ -LSITransformer(z_score_norm, n_dimensions, scale_factor, threads) -} -\description{ -Perform latent semantic indexing (LSI) on a matrix. - -Create a new LSITransformer object -} diff --git a/r/man/Pipeline.Rd b/r/man/Pipeline.Rd index c589a33f..efed722b 100644 --- a/r/man/Pipeline.Rd +++ b/r/man/Pipeline.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/matrix_project_predict.R +% Please edit documentation in R/matrix_pipelines.R \docType{class} \name{Pipeline} \alias{Pipeline} @@ -16,7 +16,12 @@ A new Pipeline object. \description{ S4 class for combining multiple pipeline steps into a single pipeline -Return a new Pipeline object +Return a new Pipeline object. +} +\details{ +Creating a pipeline object can be done by passing a list of pipeline steps to the constructor. +Creation only expects that all steps make logical sense. i.e., the final step can be either an Estimator or a Transformer, +but each intermediate step cannot be an Estimator. 
} \section{Slots}{ diff --git a/r/man/PipelineBase.Rd b/r/man/PipelineBase.Rd deleted file mode 100644 index bccad74a..00000000 --- a/r/man/PipelineBase.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/matrix_project_predict.R -\docType{class} -\name{PipelineBase} -\alias{PipelineBase} -\title{Pipeline Base Class} -\description{ -Pipeline Base Class -} -\section{Slots}{ - -\describe{ -\item{\code{fitted}}{(logical) Whether the pipeline has been fitted} -}} - diff --git a/r/man/PipelineStep.Rd b/r/man/PipelineStep.Rd index 6bb56b0a..a54db9ae 100644 --- a/r/man/PipelineStep.Rd +++ b/r/man/PipelineStep.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/matrix_project_predict.R +% Please edit documentation in R/matrix_pipelines.R \docType{class} \name{PipelineStep} \alias{PipelineStep} -\title{S4 Class Representing a single transformer or predictor} +\title{PipelineBase representing a single step within a pipeline} \description{ -S4 Class Representing a single transformer or predictor +PipelineBase representing a single step within a pipeline } \section{Slots}{ diff --git a/r/man/Transformer.Rd b/r/man/Transformer.Rd index 1b117cff..ce341fdf 100644 --- a/r/man/Transformer.Rd +++ b/r/man/Transformer.Rd @@ -1,21 +1,21 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/matrix_project_predict.R +% Please edit documentation in R/matrix_pipelines.R \docType{class} \name{Transformer} \alias{Transformer} -\title{S4 Class representing an operation that transforms data, and holds fitted parameters} +\title{PipelineStep representing an operation that transforms data, and holds fitted parameters} \description{ -S4 Class representing an operation that transforms data, and holds fitted parameters +PipelineStep representing an operation that transforms data, and holds fitted parameters } \details{ -Transformers represent single operations 
(derived from the PipelineStep class) that transform data. +Transformers represent single operations (derived from the PipelineStep class) that project data from an IterableMatrix to another IterableMatrix/dgCMatrix. They can be fit to data within an IterableMatrix object, which will be used to hold the fitted parameters. -Using the transform method on a fitted transformer will apply the transformation to the data and return -the transformed data as an IterableMatrix object. +Using the \code{project()} method on a fitted transformer will apply the transformation to the data and return +the projected data as an IterableMatrix object, or a dgCMatrix. These objects can be combined into a Pipeline object using the \code{c()} function, with other transformers, or estimators. Transformers can also be combined with full pipelines, to create a new pipeline object. -Derived classes should implement the \code{fit()}, \code{transform()}, and \code{short_description()} methods. +Derived classes should implement the \code{fit()}, \code{project()}, and \code{short_description()} methods. 
} \section{Slots}{ diff --git a/r/man/c-open-paren-PipelineBase-close-paren.Rd b/r/man/c-open-paren-PipelineBase-close-paren.Rd index efe4cd9c..09d8b7a7 100644 --- a/r/man/c-open-paren-PipelineBase-close-paren.Rd +++ b/r/man/c-open-paren-PipelineBase-close-paren.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/matrix_project_predict.R +% Please edit documentation in R/matrix_pipelines.R \name{c(PipelineBase)} \alias{c(PipelineBase)} \title{Combine pipeline objects, to create a new pipeline object.} diff --git a/r/man/call_macs_peaks.Rd b/r/man/call_macs_peaks.Rd new file mode 100644 index 00000000..5ad425c5 --- /dev/null +++ b/r/man/call_macs_peaks.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/atac_utils.R +\name{call_macs_peaks} +\alias{call_macs_peaks} +\title{Call peaks using MACS2/3} +\usage{ +call_macs_peaks(...) +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} + +This function has been renamed to \code{call_peaks_macs()} +} +\keyword{internal} diff --git a/r/man/estimate-open-paren-PipelineBase-IterableMatrix-close-paren.Rd b/r/man/estimate-open-paren-PipelineBase-IterableMatrix-close-paren.Rd new file mode 100644 index 00000000..755e3fba --- /dev/null +++ b/r/man/estimate-open-paren-PipelineBase-IterableMatrix-close-paren.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_pipelines.R +\name{estimate(PipelineBase,IterableMatrix)} +\alias{estimate(PipelineBase,IterableMatrix)} +\title{Estimate predictions on the output data using a fitted pipeline} +\usage{ +estimate(object, x, ...) +} +\arguments{ +\item{object}{(PipelineBase) The fitted pipeline object. 
Either the final step is an Estimator, or the pipeline is a single Estimator.} + +\item{x}{(IterableMatrix) Input data to be estimated on} +} +\value{ +Predicted output labels +} +\description{ +Estimate predictions on the output data using a fitted pipeline +} diff --git a/r/man/fit-open-paren-PipelineBase-IterableMatrix-close-paren.Rd b/r/man/fit-open-paren-PipelineBase-IterableMatrix-close-paren.Rd index b722ea85..c4faf066 100644 --- a/r/man/fit-open-paren-PipelineBase-IterableMatrix-close-paren.Rd +++ b/r/man/fit-open-paren-PipelineBase-IterableMatrix-close-paren.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/matrix_project_predict.R +% Please edit documentation in R/matrix_pipelines.R \name{fit(PipelineBase,IterableMatrix)} \alias{fit(PipelineBase,IterableMatrix)} -\title{Fit the pipeline object to data} +\title{Fit a pipeline object to data} \usage{ fit(object, x, y = NULL, ...) } @@ -11,11 +11,19 @@ fit(object, x, y = NULL, ...) \item{x}{(IterableMatrix) Input data to be fitted on.} -\item{y}{Optional output data to be fitted on. Required if the final step is an Estimator, else ignored.} +\item{y}{Optional output data to be fitted on. Required if the final step is a supervised Estimator, else ignored.} } \value{ The fitted pipeline object. } \description{ -Fit the pipeline object to data +Fit a pipeline object to data +} +\details{ +The \code{fit()} method is used to fit a pipeline object to data and a potential label output. Within single estimators, the \code{fit()} method only +takes the input data to be fitted on. Within pipelines, the \code{fit()} method sequentially fits the transformers on each non-terminal step of the pipeline. More specifically, +The input data is transformed by each transformer, and used to fit the next transformer in the pipeline. If the final step is an estimator, the input IterableMatrix +and label (if supervised) are used to fit the estimator. 
+ +The fitted pipeline object is returned, allowing for projection of new data. } diff --git a/r/man/predict-open-paren-PipelineBase-IterableMatrix-close-paren.Rd b/r/man/predict-open-paren-PipelineBase-IterableMatrix-close-paren.Rd deleted file mode 100644 index 98f603b9..00000000 --- a/r/man/predict-open-paren-PipelineBase-IterableMatrix-close-paren.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/matrix_project_predict.R -\name{predict(PipelineBase,IterableMatrix)} -\alias{predict(PipelineBase,IterableMatrix)} -\title{Predict the output data using a fitted pipeline} -\usage{ -predict(object, x, ...) -} -\arguments{ -\item{object}{(PipelineBase) The fitted pipeline object} - -\item{x}{(IterableMatrix) Input data to be predicted} -} -\value{ -Predicted output data -} -\description{ -Predict the output data using a fitted pipeline -} diff --git a/r/man/project-open-paren-PipelineBase-IterableMatrix-close-paren.Rd b/r/man/project-open-paren-PipelineBase-IterableMatrix-close-paren.Rd new file mode 100644 index 00000000..ffa6c5c4 --- /dev/null +++ b/r/man/project-open-paren-PipelineBase-IterableMatrix-close-paren.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_pipelines.R +\name{project(PipelineBase,IterableMatrix)} +\alias{project(PipelineBase,IterableMatrix)} +\title{Project input data using a fitted pipeline} +\usage{ +project(object, x, ...) 
+} +\arguments{ +\item{object}{(PipelineBase) A fitted pipeline object} + +\item{x}{(IterableMatrix) Input data to be transformed} +} +\value{ +Data projected by the pipeline +} +\description{ +Project input data using a fitted pipeline +} diff --git a/r/man/transform-open-paren-PipelineBase-IterableMatrix-close-paren.Rd b/r/man/transform-open-paren-PipelineBase-IterableMatrix-close-paren.Rd deleted file mode 100644 index 5187c12c..00000000 --- a/r/man/transform-open-paren-PipelineBase-IterableMatrix-close-paren.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/matrix_project_predict.R -\name{transform(PipelineBase,IterableMatrix)} -\alias{transform(PipelineBase,IterableMatrix)} -\title{Transform the input data using a fitted pipeline} -\usage{ -transform(object, x, ...) -} -\arguments{ -\item{object}{(PipelineBase) The fitted pipeline object} - -\item{x}{(IterableMatrix) Input data to be transformed} -} -\value{ -Data transformed by the pipeline -} -\description{ -Transform the input data using a fitted pipeline -} diff --git a/r/man/write_insertion_bed.Rd b/r/man/write_insertion_bed.Rd index e4e4a664..ad190e13 100644 --- a/r/man/write_insertion_bed.Rd +++ b/r/man/write_insertion_bed.Rd @@ -8,7 +8,7 @@ write_insertion_bed( fragments, path, cell_groups = rlang::rep_along(cellNames(fragments), "all"), - insertion_mode = c("start_only", "both", "end_only"), + insertion_mode = c("both", "start_only", "end_only"), verbose = FALSE, threads = 1 ) diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 04570ff9..038b1cdb 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -180,7 +180,6 @@ reference: - subtitle: "Pipeline S4 Classes" - desc: Base class for pipeline objects - contents: - - PipelineBase - PipelineStep - Pipeline - Transformer @@ -190,11 +189,6 @@ reference: - subtitle: "Pipeline Methods" - contents: - fit(PipelineBase,IterableMatrix) - - 
predict(PipelineBase,IterableMatrix) - - transform(PipelineBase,IterableMatrix) + - estimate(PipelineBase,IterableMatrix) + - project(PipelineBase,IterableMatrix) - c(PipelineBase) - - -- subtitle: "Transformers" -- contents: - - LSITransformer From 6fd6f983f0e9cf03c4f5958f6caea0d708f6b448 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 18 Nov 2024 00:15:51 -0800 Subject: [PATCH 3/5] [r] update docstrings for pipelines --- r/R/matrix_pipelines.R | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/r/R/matrix_pipelines.R b/r/R/matrix_pipelines.R index 8571daed..badbca12 100644 --- a/r/R/matrix_pipelines.R +++ b/r/R/matrix_pipelines.R @@ -41,10 +41,12 @@ setMethod("fit", signature(object = "PipelineBase", x = "IterableMatrix"), funct stop("fit() method not implemented for PipelineBase") }) -#' Project input data using a fitted pipeline +#' Project input data using a fitted pipeline or pipeline step. #' @param object (PipelineBase) A fitted pipeline object #' @param x (IterableMatrix) Input data to be transformed -#' @return Data projected by the pipeline +#' @return (IterableMatrix) of data transformed by a fitted pipeline or pipeline step. +#' @details Projecting data using a pipeline or pipeline step requires the pipeline to be fitted to data first. Therefore, +#' the `fit()` method must be executed prior to projecting any data. #' @name project(PipelineBase,IterableMatrix) #' @export setGeneric("project", function(object, x, ...) standardGeneric("project")) @@ -56,7 +58,9 @@ setMethod("project", signature(object = "PipelineBase", x = "IterableMatrix"), f #' Estimate predictions on the output data using a fitted pipeline #' @param object (PipelineBase) The fitted pipeline object. Either the final step is an Estimator, or the pipeline is a single Estimator. #' @param x (IterableMatrix) Input data to be estimated on -#' @return Predicted output labels +#' @return Predicted output labels as an array. 
+#' @details Estimation of input data using a pipeline or pipeline step requires the pipeline to be fitted to data first. Therefore, +#' the `fit()` method must be executed prior to estimating on any data. #' @name estimate(PipelineBase,IterableMatrix) #' @export setGeneric("estimate", function(object, x, ...) standardGeneric("estimate")) @@ -73,6 +77,12 @@ setMethod("estimate", signature(object = "PipelineBase", x = "IterableMatrix"), setMethod("c", signature(x = "PipelineBase"), function(x, ...) { stop("c() method not implemented for PipelineBase") }) + +#' Print how to recreate the pipeline object. +#' @details Utilizes the `short_description()` method to print the steps of the pipeline object. +#' If the object is a pipeline, it will print the steps of the pipeline, demonstrating how to recreate the pipeline with a function call. +#' If the object is a pipeline step, it will print the step name. +#' @param object (PipelineBase) The pipeline object to describe. setMethod("show", signature(object = "PipelineBase"), function(object) { stop("show() method not implemented for PipelineBase") }) @@ -92,11 +102,15 @@ setClass( ) #' Return a new Pipeline object. -#' @param steps A list of ordered steps to be executed in the pipeline. +#' @param steps A list of ordered steps of operations to be converted into a pipeline object. #' @return A new Pipeline object. -#' @details Creating a pipeline object can be done by passing a list of pipeline steps to the constructor. +#' @details Pipeline objects represent multiple pipeline steps that are to be executed on input data sequentially. +#' Creating a pipeline object can be done by passing a list of pipeline steps to the constructor. #' Creation only expects that all steps make logical sense. i.e., the final step can be either an Estimator or a Transformer, #' but each intermediate step cannot be an Estimator. 
+#' +#' If the steps are all previously fitted, then the overall pipeline is considered fit, and can be used to project/estimate on input data. +#' Otherwise, the pipeline must be fitted before projecting/estimating on input data. #' @export Pipeline <- function(steps = list()) { # Check if all steps are transformers, with the final step being either an estimator or a transformer @@ -216,10 +230,10 @@ setMethod("show", signature(object = "Pipeline"), function(object) { cat(")\n") }) -#' Add steps to a pipeline, where the first argument is the pipeline object and the rest are the steps to add. -#' Requires for every additional step to be a pipeline object -#' @param x (Pipeline) The PipelineBase object to add steps to -#' @param ... (PipelineBase) The steps to add to the pipeline +#' Add steps to a pipeline, where the first argument is the pipeline object and the rest are either pipeline steps, or full pipelines to add in order. +#' Requires for every additional step to be a pipeline object. +#' @param x (Pipeline) The PipelineBase object to add steps to. +#' @param ... (PipelineBase) The steps to add to the pipeline. #' @noRd #' @export setMethod("c", signature(x = "Pipeline"), function(x, ...) { From a138e57b3ab376cd33ec1c6c2df31061f14215cb Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 18 Nov 2024 13:34:58 -0800 Subject: [PATCH 4/5] [r] add more pipeline validity checks, add `estimate()` for pipelines --- r/R/matrix_pipelines.R | 86 ++++++++++++++++++-------------- r/man/check_pipeline_validity.Rd | 15 ++++++ 2 files changed, 64 insertions(+), 37 deletions(-) create mode 100644 r/man/check_pipeline_validity.Rd diff --git a/r/R/matrix_pipelines.R b/r/R/matrix_pipelines.R index badbca12..71e3472e 100644 --- a/r/R/matrix_pipelines.R +++ b/r/R/matrix_pipelines.R @@ -126,28 +126,33 @@ Pipeline <- function(steps = list()) { return(new("Pipeline", steps = steps)) } -#' Fit the pipeline object to data -#' @param object (Pipeline) The pipeline object to fit. 
-#' @param x (IterableMatrix) Input data to be fitted on. -#' @param y Optional output data to be fitted on. Required if the final step is an Estimator, else ignored. -#' @return The fitted pipeline object. -#' @noRd -#' @export -setMethod("fit", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, y = NULL, ...) { + +#' Check if all steps are transformers, with the final step being either an estimator or a transformer +#' @param object (Pipeline) The pipeline object to check +#' @keywords internal +check_pipeline_validity <- function(object) { steps <- object@steps - # Check if all steps are transformers, with the final step being either an estimator or a transformer for (i in seq_along(steps)) { step <- steps[[i]] # allow to fit with estimators as well if (i < length(steps)) { - assert_is(step, "PipelineStep") + assert_is(step, "Transformer") } else { assert_is(step, c("PipelineStep")) - if (!is.null(y)) { - assert_is(step, "Estimator") - } } } +} + +#' Fit the pipeline object to data +#' @param object (Pipeline) The pipeline object to fit. +#' @param x (IterableMatrix) Input data to be fitted on. +#' @param y Optional output data to be fitted on. Required if the final step is an Estimator, else ignored. +#' @return The fitted pipeline object. +#' @noRd +#' @export +setMethod("fit", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, y = NULL, ...) { + check_pipeline_validity(object) + steps <- object@steps # Fit every step in the pipeline for (i in seq_along(steps)) { step <- steps[[i]] @@ -175,37 +180,42 @@ setMethod("fit", signature(object = "Pipeline", x = "IterableMatrix"), function( #' @export setMethod("project", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, ...) 
{ if (!object@fitted) stop("Pipeline must be fitted before projecting") + check_pipeline_validity(object) + # check if final step is an estimator + if (is(object@steps[[length(object@steps)]], "Estimator")) stop("The final step must be a transformer instead of an estimator. Please use the `estimate()` method instead.") steps <- object@steps for (step in steps) { - if (is(step, "Transformer")) x <- project(step, x) - # Some actions convert matrices to a different type, so we need to convert back to IterableMatrix - # for following steps + x <- project(step, x) + # Some actions convert matrices to a different type, + # so we need to convert back to IterableMatrix for following steps if (is(x, "dgCMatrix")) x <- as(x, "IterableMatrix") } return(x) }) -# #' Estimate predictions on the output data using a fitted pipeline -# #' @param object (Pipeline) The fitted pipeline object -# #' @param x (IterableMatrix) Input data to be estimated on -# #' @noRd -# #' @export -# setMethod("estimate", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, ...) { -# if (!object@fitted) stop("Pipeline must be fitted before estimating") -# steps <- object@steps -# for (i in seq_along(steps)) { -# step <- steps[[i]] -# if (i < n_steps) { -# x <- project(step, x) -# } else if (is(step, "Estimator")) { -# y_pred <- estimate(step, x) -# return(y_pred) -# } else { -# stop("The final step must be an estimator with a estimate method") -# } -# } -# }) +#' Estimate predictions on the output data using a fitted pipeline +#' @param object (Pipeline) The fitted pipeline object +#' @param x (IterableMatrix) Input data to be estimated on +#' @noRd +#' @export +setMethod("estimate", signature(object = "Pipeline", x = "IterableMatrix"), function(object, x, ...) 
{ + if (!object@fitted) stop("Pipeline must be fitted before estimating") + check_pipeline_validity(object) + if (!is(object@steps[[length(object@steps)]], "Estimator")) stop("The final step must be an estimator with a estimate method. Please use the `project()` method instead.") + steps <- object@steps + for (i in seq_along(steps)) { + step <- steps[[i]] + if (i < length(steps)) { + x <- project(step, x) + } else if (is(step, "Estimator")) { + y_pred <- estimate(step, x) + return(y_pred) + } else { + stop("The final step must be an estimator with a estimate method") + } + } +}) setMethod("short_description", "Pipeline", function(x) { character(0) @@ -244,10 +254,11 @@ setMethod("c", signature(x = "Pipeline"), function(x, ...) { # If the step is a pipeline step, add the single step. Else, the step is a full pipeline and we want to move all the steps over. steps <- ifelse(is(pipe, "PipelineStep"), c(steps, pipe), c(steps, pipe@steps)) } - + # If all the steps are fitted, the pipeline overall is fitted. # We trust the user to have fitted the pipelines with the same data new_pipeline <- Pipeline(steps = steps) + check_pipeline_validity(new_pipeline) fitted <- TRUE for (step in steps) { if (!step@fitted) { @@ -287,6 +298,7 @@ setMethod("c", signature(x = "PipelineStep"), function(x, ...) 
{ steps <- ifelse(is(pipe, "PipelineStep"), c(steps, pipe), c(steps, pipe@steps)) } new_pipeline <- Pipeline(steps = steps) + check_pipeline_validity(new_pipeline) fitted <- TRUE for (step in steps) { if (!step@fitted) fitted <- FALSE diff --git a/r/man/check_pipeline_validity.Rd b/r/man/check_pipeline_validity.Rd new file mode 100644 index 00000000..4e314773 --- /dev/null +++ b/r/man/check_pipeline_validity.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_pipelines.R +\name{check_pipeline_validity} +\alias{check_pipeline_validity} +\title{Check if all steps are transformers, with the final step being either an estimator or a transformer} +\usage{ +check_pipeline_validity(object) +} +\arguments{ +\item{object}{(Pipeline) The pipeline object to check} +} +\description{ +Check if all steps are transformers, with the final step being either an estimator or a transformer +} +\keyword{internal} From 7eb814b2c91ed6b7fc0842b930aaa40e81809e94 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 18 Nov 2024 15:28:06 -0800 Subject: [PATCH 5/5] [r] fix pipeline concatenation --- r/R/matrix_pipelines.R | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/r/R/matrix_pipelines.R b/r/R/matrix_pipelines.R index 71e3472e..ea43c7f5 100644 --- a/r/R/matrix_pipelines.R +++ b/r/R/matrix_pipelines.R @@ -227,11 +227,11 @@ setMethod("short_description", "Pipeline", function(x) { #' @export setMethod("show", signature(object = "Pipeline"), function(object) { fitted <- ifelse(object@fitted, "Fitted", "Unfitted") - cat(fitted, " Pipeline with steps:\n") + cat(fitted, "Pipeline with steps:\n") cat("Pipeline(\n") for (i in seq_along(object@steps)) { step <- object@steps[[i]] - cat("\t", short_description(step)) + cat(" ", short_description(step)) if (i < length(object@steps)) { cat(",") } @@ -252,9 +252,13 @@ setMethod("c", signature(x = "Pipeline"), function(x, ...) 
{ for (pipe in pipelines) { assert_is(pipe, "PipelineBase") # If the step is a pipeline step, add the single step. Else, the step is a full pipeline and we want to move all the steps over. - steps <- ifelse(is(pipe, "PipelineStep"), c(steps, pipe), c(steps, pipe@steps)) + if (is(pipe, "PipelineStep")) { + steps <- c(steps, pipe) + } else { + steps <- c(steps, pipe@steps) + } } # If all the steps are fitted, the pipeline overall is fitted. # We trust the user to have fitted the pipelines with the same data new_pipeline <- Pipeline(steps = steps) @@ -295,7 +299,11 @@ setMethod("c", signature(x = "PipelineStep"), function(x, ...) { steps <- list(x) for (pipe in pipelines) { assert_is(pipe, "PipelineBase") - steps <- ifelse(is(pipe, "PipelineStep"), c(steps, pipe), c(steps, pipe@steps)) + if (is(pipe, "PipelineStep")) { + steps <- c(steps, pipe) + } else { + steps <- c(steps, pipe@steps) + } } new_pipeline <- Pipeline(steps = steps) check_pipeline_validity(new_pipeline)