diff --git a/DESCRIPTION b/DESCRIPTION index a5d027cc..d2cc10b3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -78,6 +78,7 @@ Collate: utils.R misc.R SparseList-class.R MIndex-class.R + MIndexList-class.R lowlevel-matching.R match-utils.R matchPattern.R diff --git a/NAMESPACE b/NAMESPACE index ba1e3bfa..b100250c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -221,7 +221,7 @@ exportMethods( exportClasses( #SparseList, - MIndex, ByPos_MIndex, + MIndex, ByPos_MIndex, MIndexList, PreprocessedTB, Twobit, ACtree2, PDict3Parts, PDict, TB_PDict, MTB_PDict, Expanded_TB_PDict diff --git a/R/MIndexList-class.R b/R/MIndexList-class.R new file mode 100644 index 00000000..89b04438 --- /dev/null +++ b/R/MIndexList-class.R @@ -0,0 +1,76 @@ +### ========================================================================= +### MIndexList objects +### ------------------------------------------------------------------------- +### + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### The "MIndexList" class. +### +### This class serves as a base class to hold collections of MIndex objects. +### +### As with MIndex, in normal operations, the user should never need to create +### MIndex objects directly or to modify existing ones. Those objects are +### typically returned by a sequence matching/alignment function like +### vmatchPDict(). For this reason, coercion methods are relatively barebones +### and will throw errors rather than try to correct +### + +setClass("MIndexList", + contains="CompressedList", + representation( +# "VIRTUAL", + unlistData="MIndex" + ), + prototype( + elementType="MIndex" + ) +) + + +.from_list_to_MIndexList <- function(from) +{ + ## keeping ans_class variable for potential future extensions + ans_class <- "MIndexList" + + ## ensure elements are all MIndex objects + if(!all(vapply(from, inherits, logical(1L), what="MIndex"))) + stop("Only lists of MIndex objects can be coerced to MIndexList") + IRanges:::new_CompressedList_from_list(ans_class, from) +} + +## barebones, but users shouldn't be calling this directly +## expecting to be used internally, list coercion provided for QoL +setAs("list", "MIndexList", function(from) .from_list_to_MIndexList(from)) +setAs("List", "MIndexList", function(from) .from_list_to_MIndexList(as.list(from))) + +.from_MIndexList_to_compact_char <- function(object){ + lapply(object, \(m_ind){ + paste(length(m_ind), "patterns,", + sum(lengths(m_ind)), "matches") + }) +} + +setMethod("show", "MIndexList", + function(object) { + lo <- length(object) + k <- min(5, length(object)) + diffK <- lo - 5 + cat(classNameForDisplay(object), " of length ", lo, + "\n", sep = "") + + repr <- .from_MIndexList_to_compact_char(head(object, k)) + IRanges:::.showAtomicList(CharacterList(repr), minLines=10L) + if (diffK > 0) + cat("...\n<", diffK, + ifelse(diffK == 1, + " more element>\n", " more elements>\n"), + sep="") + } +) + +setMethod("showAsCell", "MIndexList", + function(object){ + repr <- .from_MIndexList_to_compact_char(object) + showAsCell(CharacterList(repr)) + } +) diff --git a/R/matchPDict.R b/R/matchPDict.R index c7324abe..1f1a9381 100644 --- a/R/matchPDict.R +++ b/R/matchPDict.R @@ -15,7 +15,7 @@ ### > library(BSgenome.Hsapiens.UCSC.hg18) ### > chr1 <- Hsapiens$chr1 ### > system.time(end_index <- endIndex(matchPDict(pdict, chr1))) -### user system elapsed +### user system elapsed ### 50.663 0.000 50.763 ### > count_index <- sapply(end_index, length) ### > table(count_index) @@ -64,8 +64,8 @@ ### 1M 36 2.5 sec 717M 106 sec 103 sec 0 ### 10M 36 56 sec 6724M 351 sec 200 sec 0 ### 10M 12 7.5 sec 340M 227 sec 216 sec 100M -### 30M 12 27 sec 523M 491 sec ? -### +### 30M 12 27 sec 523M 491 sec ? +### ### III. Inexact matching ### --------------------- ### pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2) @@ -561,10 +561,8 @@ if (!identical(weight, 1L)) warning("'weight' is ignored when 'collapse=FALSE'") } - } else { - ## vmatchPDict() - stop("vmatchPDict() is not supported yet, sorry") } + ## We are doing our own dispatch here, based on the type of 'pdict'. ## TODO: Revisit this. Would probably be a better design to use a ## generic/methods approach and rely on the standard dispatch mechanism. @@ -584,7 +582,8 @@ max.mismatch, min.mismatch, with.indels, fixed, algorithm, collapse, weight, verbose, matches.as) - if (is.null(which_pp_excluded)) + if (is.null(which_pp_excluded) && matches.as %in% + c("MATCHES_AS_WHICH", "MATCHES_AS_COUNTS")) return(ans) if (matches.as == "MATCHES_AS_WHICH") { ## vwhichPDict() @@ -599,8 +598,51 @@ } return(ans) } - ## vmatchPDict() - stop("vmatchPDict() is not supported yet, sorry") + if(matches.as == "MATCHES_AS_ENDS") { + ## vmatchPDict() by ends + ## converting all the results to MIndex objects and then wrapping the result + for(i in seq_along(ans)) { + ans[[i]] <- new("ByPos_MIndex", + width0=width(pdict), + NAMES=names(pdict), + ends=ans[[i]]) + } + + names(ans) <- names(subject) + ans <- as(ans, "MIndexList") + return(ans) + } + if(matches.as == "MATCHES_AS_RANGES") { + ## vmatchPDict() by ranges + ## This is the same as MATCHES_AS_ENDS, except ans[[i]] holds two lists + ## - The first list has the start position for each range + ## - The second list has the width of the range (including the start) + + for(i in seq_along(ans)){ + ## First we have to move from starts to ends to align with the + ## Pos_MIndex constructor + ans_start <- ans[[i]][[1]] + ans_width <- ans[[i]][[2]] + for(j in seq_along(ans_start)){ + if(!is.null(ans_start[[j]])){ + ## Note that the width includes the start, so we subtract 1 + ans_start[[j]] <- ans_start[[j]] + ans_width[[j]] - 1L + } + } + ## Now ans_start holds the **end** position + ## width(pdict) will be equivalent to ifelse(is.null(w), 0, w[1]) for + ## w in ans_width + ans[[i]] <- new("ByPos_MIndex", + width0=width(pdict), + NAMES=names(pdict), + ends=ans_start) + } + + names(ans) <- names(subject) + ans <- as(ans, "MIndexList") + return(ans) + } + warning("Failed to post-process output of .vmatch(pdict, subject)") return(ans) } @@ -771,8 +813,8 @@ setMethod("whichPDict", "MaskedXString", ### 'lapply(subject, function(s) whichPDict(pdict, s, ...))'. ### The returned object is a list of length N. ### o vcountPDict(): returns an M x N matrix of integers. -### o vmatchPDict(): not supported yet! (first we need a container to -### store the results) +### o vmatchPDict(): returns an MIndexList (CompressedList of MIndex objects) +### This contains N MIndex objects, each with up to M entries ### setGeneric("vmatchPDict", signature="subject", @@ -787,7 +829,10 @@ setMethod("vmatchPDict", "ANY", function(pdict, subject, max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, algorithm="auto", verbose=FALSE) - stop("vmatchPDict() is not ready yet, sorry") + .vmatchPDict(pdict, subject, + max.mismatch, min.mismatch, with.indels, fixed, + algorithm, collapse=FALSE, weight=1L, + verbose, matches.as="MATCHES_AS_ENDS") ) ### Dispatch on 'subject' (see signature of generic). diff --git a/man/matchPDict-exact.Rd b/man/matchPDict-exact.Rd index 983bcec5..035128d6 100644 --- a/man/matchPDict-exact.Rd +++ b/man/matchPDict-exact.Rd @@ -55,9 +55,10 @@ and \code{whichPDict} returns the "who" information i.e. which patterns in the input dictionary have at least one match. - \code{vcountPDict} and \code{vwhichPDict} are vectorized versions - of \code{countPDict} and \code{whichPDict}, respectively, that is, - they work on a set of reference sequences in a vectorized fashion. + \code{vcountPDict}, \code{vwhichPDict}, and \code{vmatchPDict} are vectorized + versions of \code{countPDict}, \code{whichPDict}, and \code{matchPDict} + respectively, that is, they work on a set of reference sequences in a + vectorized fashion. This man page shows how to use these functions (aka the \code{*PDict} functions) for exact matching of a constant width dictionary i.e. @@ -86,6 +87,9 @@ vcountPDict(pdict, subject, vwhichPDict(pdict, subject, max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, algorithm="auto", verbose=FALSE) +vmatchPDict(pdict, subject, + max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, + algorithm="auto", verbose=FALSE) } \arguments{ @@ -111,7 +115,7 @@ vwhichPDict(pdict, subject, \code{whichPDict}. An \link{XStringSet} object containing the subject sequences - for \code{vcountPDict} and \code{vwhichPDict}. + for \code{vmatchPDict}, \code{vcountPDict}, \and code{vwhichPDict}. If \code{pdict} is a \link{PDict} object (i.e. a preprocessed dictionary), then \code{subject} must be of base class \link{DNAString}. @@ -210,6 +214,10 @@ vwhichPDict(pdict, subject, (see above for the details). \code{vwhichPDict} returns a list of \code{N} integer vectors. + + \code{vmatchPDict} returns an \link{MIndexList}, which is essentially a list + of length \code{N} where each entry is an \link{MIndex} object (as would be + obtained from running \code{matchPDict} on each of the sequences). } \author{H. Pagès} @@ -513,7 +521,7 @@ if (interactive()) { pdict2 <- PDict(probes2) ## Get the mapping from probes1 to probes2 (based on exact matching): - map1to2 <- vwhichPDict(pdict2, probes1) + map1to2 <- vwhichPDict(pdict2, probes1) ## The following helper function uses the probe level mapping to induce ## the mapping at the probe set IDs level (from hgu95av2 to hgu133a). diff --git a/src/match_pdict.c b/src/match_pdict.c index 6244e0b1..cade8820 100644 --- a/src/match_pdict.c +++ b/src/match_pdict.c @@ -480,6 +480,28 @@ static SEXP vcount_XStringSet_XStringSet(SEXP pattern, return ans; } +SEXP vmatch_PDict3Parts_XStringSet_helper(SEXP pptb, HeadTail *headtail, MatchPDictBuf *mpdb, + SEXP xstringset, + SEXP max_mismatch, SEXP min_mismatch, + SEXP fixed){ + XStringSet_holder subject; + int subj_length; + Chars_holder subj_elt; + + subject = _hold_XStringSet(xstringset); + subj_length = _get_length_from_XStringSet_holder(&subject); + + SEXP RETURN_LIST = PROTECT(NEW_LIST(subj_length)); + for(int i=0; imatches), R_NilValue)); + _MatchPDictBuf_flush(mpdb); + } + UNPROTECT(1); + return RETURN_LIST; +} + /* --- .Call ENTRY POINT --- */ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail, SEXP subject, @@ -495,10 +517,12 @@ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail, matchpdict_buf = new_MatchPDictBuf_from_PDict3Parts(matches_as, pptb, pdict_head, pdict_tail); switch (matchpdict_buf.matches.ms_code) { - case MATCHES_AS_NULL: - error("vmatch_PDict3Parts_XStringSet() does not support " - "'matches_as=\"%s\"' yet, sorry", - CHAR(STRING_ELT(matches_as, 0))); + case MATCHES_AS_RANGES: + case MATCHES_AS_ENDS: + return vmatch_PDict3Parts_XStringSet_helper(pptb, &headtail, + &matchpdict_buf, subject, + max_mismatch, min_mismatch, + fixed); case MATCHES_AS_WHICH: return vwhich_PDict3Parts_XStringSet(pptb, &headtail, subject, @@ -510,8 +534,11 @@ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail, max_mismatch, min_mismatch, fixed, collapse, weight, &matchpdict_buf); + default: + error("vmatch_PDict3Parts_XStringSet() does not support " + "'matches_as=\"%s\"' yet, sorry", + CHAR(STRING_ELT(matches_as, 0))); } - error("vmatchPDict() is not supported yet, sorry"); return R_NilValue; } @@ -545,7 +572,8 @@ SEXP vmatch_XStringSet_XStringSet(SEXP pattern, with_indels, fixed, algorithm, collapse, weight); } - error("vmatchPDict() is not supported yet, sorry"); + error("vmatchPDict() on two XStringSets is not supported yet. " + "Please use a PDict object instead (see `?PDict` for more info)."); return R_NilValue; } diff --git a/tests/testthat/test-MIndexListClass.R b/tests/testthat/test-MIndexListClass.R new file mode 100644 index 00000000..e69de29b diff --git a/tests/testthat/test-matchPDict.R b/tests/testthat/test-matchPDict.R index 722b7372..e6134467 100644 --- a/tests/testthat/test-matchPDict.R +++ b/tests/testthat/test-matchPDict.R @@ -75,7 +75,7 @@ test_that("inexact PDict matching works", { expect_equal(countPDict(pdict, d, max.mismatch=2), c(2,0,2,2)) }) -test_that("vcount/vwhich/vmatchPDict() work", { +test_that("vcount/vwhich/vmatchPDict work", { ## canary tests for future changes pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2) d <- DNAString("acggaccg") @@ -116,10 +116,6 @@ test_that("vcount/vwhich/vmatchPDict() work", { expect_equal(vcountPDict(pdict, dvv, max.mismatch=2), exp_out) expect_equal(vwhichPDict(pdict, dvv, max.mismatch=2), list(c(1,3,4),c(1,3,4))) - - ## this will fail when we implement it to remind me to add tests for it - expect_error2(vmatchPDict(pdict, dss), - "vmatchPDict\\(\\) is not ready yet") }) test_that("MIndex functionality works", { @@ -233,3 +229,56 @@ test_that("matchPDict() works for variable width dictionary", { } }) +test_that("vmatchPDict is equivalent to matchPDict output", { + pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2) + d <- DNAString("acggaccg") + d2 <- DNAString("gtcgtacgt") + dss <- DNAStringSet(list(d,d2)) + + mindex_obj_comp <- function(m1, m2){ + if(!is(m1, "MIndex") || !is(m2, "MIndex")){ + return(FALSE) + } + + if(length(m1) != length(m1)){ + return (FALSE) + } + + for(i in seq_along(m1)){ + l1 <- m1[[i]] + l2 <- m2[[i]] + for(attr_name in c("start", "width")){ + if(!identical(attr(l1, attr_name), attr(l2, attr_name))){ + return(FALSE) + } + } + } + return(TRUE) + } + + ## Basic matching + vmatch_output <- vmatchPDict(pdict, dss) + expect_true(mindex_obj_comp(vmatch_output[[1]], matchPDict(pdict, dss[[1]]))) + expect_true(mindex_obj_comp(vmatch_output[[2]], matchPDict(pdict, dss[[2]]))) + + ## max.mismatch + vmatch_output <- vmatchPDict(pdict, dss, max.mismatch = 2) + expect_true(mindex_obj_comp(vmatch_output[[1]], + matchPDict(pdict, dss[[1]], max.mismatch = 2))) + expect_true(mindex_obj_comp(vmatch_output[[2]], + matchPDict(pdict, dss[[2]], max.mismatch = 2))) + + ## min.mismatch + vmatch_output <- vmatchPDict(pdict, dss, min.mismatch = 1, max.mismatch = 2) + expect_true(mindex_obj_comp(vmatch_output[[1]], + matchPDict(pdict, dss[[1]], min.mismatch = 1, max.mismatch = 2))) + expect_true(mindex_obj_comp(vmatch_output[[2]], + matchPDict(pdict, dss[[2]], min.mismatch = 1, max.mismatch = 2))) + + ## fixed + vmatch_output <- vmatchPDict(pdict, dss, fixed = FALSE) + expect_true(mindex_obj_comp(vmatch_output[[1]], + matchPDict(pdict, dss[[1]], fixed = FALSE))) + expect_true(mindex_obj_comp(vmatch_output[[2]], + matchPDict(pdict, dss[[2]], fixed = FALSE))) +})