Bioconductor · ahl27 · Jan 30, 2025 · Feb 13, 2025 · Jan 28, 2026 · Jan 28, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -78,6 +78,7 @@ Collate: utils.R
 	misc.R
 	SparseList-class.R
 	MIndex-class.R
+	MIndexList-class.R
 	lowlevel-matching.R
 	match-utils.R
 	matchPattern.R

diff --git a/NAMESPACE b/NAMESPACE
@@ -221,7 +221,7 @@ exportMethods(
 
 exportClasses(
     #SparseList,
-    MIndex, ByPos_MIndex,
+    MIndex, ByPos_MIndex, MIndexList,
     PreprocessedTB, Twobit, ACtree2,
     PDict3Parts,
     PDict, TB_PDict, MTB_PDict, Expanded_TB_PDict

diff --git a/R/MIndexList-class.R b/R/MIndexList-class.R
@@ -0,0 +1,76 @@
+### =========================================================================
+### MIndexList objects
+### -------------------------------------------------------------------------
+###
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### The "MIndexList" class.
+###
+### This class serves as a base class to hold collections of MIndex objects.
+###
+### As with MIndex, in normal operations, the user should never need to create
+### MIndex objects directly or to modify existing ones. Those objects are
+### typically returned by a sequence matching/alignment function like
+### vmatchPDict(). For this reason, coercion methods are relatively barebones
+### and will throw errors rather than try to correct
+###
+
+setClass("MIndexList",
+    contains="CompressedList",
+    representation(
+#        "VIRTUAL",
+        unlistData="MIndex"
+    ),
+    prototype(
+        elementType="MIndex"
+    )
+)
+
+
+.from_list_to_MIndexList <- function(from)
+{
+    ## keeping ans_class variable for potential future extensions
+    ans_class <- "MIndexList"
+
+    ## ensure elements are all MIndex objects
+    if(!all(vapply(from, inherits, logical(1L), what="MIndex")))
+        stop("Only lists of MIndex objects can be coerced to MIndexList")
+    IRanges:::new_CompressedList_from_list(ans_class, from)
+}
+
+## barebones, but users shouldn't be calling this directly
+## expecting to be used internally, list coercion provided for QoL
+setAs("list", "MIndexList", function(from) .from_list_to_MIndexList(from))
+setAs("List", "MIndexList", function(from) .from_list_to_MIndexList(as.list(from)))
+
+.from_MIndexList_to_compact_char <- function(object){
+    lapply(object, \(m_ind){
+                paste(length(m_ind), "patterns,",
+                      sum(lengths(m_ind)), "matches")
+            })
+}
+
+setMethod("show", "MIndexList",
+            function(object) {
+                lo <- length(object)
+                k <- min(5, length(object))
+                diffK <- lo - 5
+                cat(classNameForDisplay(object), " of length ", lo,
+                  "\n", sep = "")
+
+                repr <- .from_MIndexList_to_compact_char(head(object, k))
+                IRanges:::.showAtomicList(CharacterList(repr), minLines=10L)
+                if (diffK > 0)
+                  cat("...\n<", diffK,
+                      ifelse(diffK == 1,
+                             " more element>\n", " more elements>\n"),
+                      sep="")
+            }
+)
+
+setMethod("showAsCell", "MIndexList",
+            function(object){
+                repr <- .from_MIndexList_to_compact_char(object)
+                showAsCell(CharacterList(repr))
+            }
+)
diff --git a/R/matchPDict.R b/R/matchPDict.R
@@ -15,7 +15,7 @@
 ###   > library(BSgenome.Hsapiens.UCSC.hg18)
 ###   > chr1 <- Hsapiens$chr1
 ###   > system.time(end_index <- endIndex(matchPDict(pdict, chr1)))
-###      user  system elapsed 
+###      user  system elapsed
 ###    50.663   0.000  50.763
 ###   > count_index <- sapply(end_index, length)
 ###   > table(count_index)
@@ -64,8 +64,8 @@
 ###       1M      36      2.5 sec    717M     106 sec     103 sec            0
 ###      10M      36       56 sec   6724M     351 sec     200 sec            0
 ###      10M      12      7.5 sec    340M     227 sec     216 sec         100M
-###      30M      12       27 sec    523M     491 sec           ? 
-### 
+###      30M      12       27 sec    523M     491 sec           ?
+###
 ### III. Inexact matching
 ### ---------------------
 ###   pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2)
@@ -561,10 +561,8 @@
             if (!identical(weight, 1L))
                 warning("'weight' is ignored when 'collapse=FALSE'")
         }
-    } else {
-        ## vmatchPDict()
-        stop("vmatchPDict() is not supported yet, sorry")
     }
+
     ## We are doing our own dispatch here, based on the type of 'pdict'.
     ## TODO: Revisit this. Would probably be a better design to use a
     ## generic/methods approach and rely on the standard dispatch mechanism.
@@ -584,7 +582,8 @@
                        max.mismatch, min.mismatch, with.indels, fixed,
                        algorithm, collapse, weight,
                        verbose, matches.as)
-    if (is.null(which_pp_excluded))
+    if (is.null(which_pp_excluded) && matches.as %in%
+        c("MATCHES_AS_WHICH", "MATCHES_AS_COUNTS"))
         return(ans)
     if (matches.as == "MATCHES_AS_WHICH") {
         ## vwhichPDict()
@@ -599,8 +598,51 @@
         }
         return(ans)
     }
-    ## vmatchPDict()
-    stop("vmatchPDict() is not supported yet, sorry")
+    if(matches.as == "MATCHES_AS_ENDS") {
+      ## vmatchPDict() by ends
+      ## converting all the results to MIndex objects and then wrapping the result
+      for(i in seq_along(ans)) {
+        ans[[i]] <- new("ByPos_MIndex",
+                        width0=width(pdict),
+                        NAMES=names(pdict),
+                        ends=ans[[i]])
+      }
+
+      names(ans) <- names(subject)
+      ans <- as(ans, "MIndexList")
+      return(ans)
+    }
+    if(matches.as == "MATCHES_AS_RANGES") {
+      ## vmatchPDict() by ranges
+      ## This is the same as MATCHES_AS_ENDS, except ans[[i]] holds two lists
+      ## - The first list has the start position for each range
+      ## - The second list has the width of the range (including the start)
+
+      for(i in seq_along(ans)){
+        ## First we have to move from starts to ends to align with the
+        ## Pos_MIndex constructor
+        ans_start <- ans[[i]][[1]]
+        ans_width <- ans[[i]][[2]]
+        for(j in seq_along(ans_start)){
+          if(!is.null(ans_start[[j]])){
+            ## Note that the width includes the start, so we subtract 1
+            ans_start[[j]] <- ans_start[[j]] + ans_width[[j]] - 1L
+          }
+        }
+        ## Now ans_start holds the **end** position
+        ## width(pdict) will be equivalent to ifelse(is.null(w), 0, w[1]) for
+        ##   w in ans_width
+        ans[[i]] <- new("ByPos_MIndex",
+                        width0=width(pdict),
+                        NAMES=names(pdict),
+                        ends=ans_start)
+      }
+
+      names(ans) <- names(subject)
+      ans <- as(ans, "MIndexList")
+      return(ans)
+    }
+    warning("Failed to post-process output of .vmatch(pdict, subject)")
     return(ans)
 }
 
@@ -771,8 +813,8 @@ setMethod("whichPDict", "MaskedXString",
 ###     'lapply(subject, function(s) whichPDict(pdict, s, ...))'.
 ###      The returned object is a list of length N.
 ###   o vcountPDict(): returns an M x N matrix of integers.
-###   o vmatchPDict(): not supported yet! (first we need a container to
-###     store the results)
+###   o vmatchPDict(): returns an MIndexList (CompressedList of MIndex objects)
+###      This contains N MIndex objects, each with up to M entries
 ###
 
 setGeneric("vmatchPDict", signature="subject",
@@ -787,7 +829,10 @@ setMethod("vmatchPDict", "ANY",
     function(pdict, subject,
              max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE,
              algorithm="auto", verbose=FALSE)
-        stop("vmatchPDict() is not ready yet, sorry")
+         .vmatchPDict(pdict, subject,
+                      max.mismatch, min.mismatch, with.indels, fixed,
+                      algorithm, collapse=FALSE, weight=1L,
+                      verbose, matches.as="MATCHES_AS_ENDS")
 )
 
 ### Dispatch on 'subject' (see signature of generic).

diff --git a/man/matchPDict-exact.Rd b/man/matchPDict-exact.Rd
@@ -55,9 +55,10 @@
   and \code{whichPDict} returns the "who" information i.e. which patterns
   in the input dictionary have at least one match.
 
-  \code{vcountPDict} and \code{vwhichPDict} are vectorized versions
-  of \code{countPDict} and \code{whichPDict}, respectively, that is,
-  they work on a set of reference sequences in a vectorized fashion.
+  \code{vcountPDict}, \code{vwhichPDict}, and \code{vmatchPDict} are vectorized
+  versions of \code{countPDict}, \code{whichPDict}, and \code{matchPDict}
+  respectively, that is, they work on a set of reference sequences in a
+  vectorized fashion.
 
   This man page shows how to use these functions (aka the \code{*PDict}
   functions) for exact matching of a constant width dictionary i.e.
@@ -86,6 +87,9 @@ vcountPDict(pdict, subject,
 vwhichPDict(pdict, subject,
             max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE,
             algorithm="auto", verbose=FALSE)
+vmatchPDict(pdict, subject,
+           max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE,
+           algorithm="auto", verbose=FALSE)
 }
 
 \arguments{
@@ -111,7 +115,7 @@ vwhichPDict(pdict, subject,
     \code{whichPDict}.
 
     An \link{XStringSet} object containing the subject sequences
-    for \code{vcountPDict} and \code{vwhichPDict}.
+    for \code{vmatchPDict}, \code{vcountPDict}, \and code{vwhichPDict}.
 
     If \code{pdict} is a \link{PDict} object (i.e. a preprocessed
     dictionary), then \code{subject} must be of base class \link{DNAString}.
@@ -210,6 +214,10 @@ vwhichPDict(pdict, subject,
   (see above for the details).
 
   \code{vwhichPDict} returns a list of \code{N} integer vectors.
+
+  \code{vmatchPDict} returns an \link{MIndexList}, which is essentially a list
+  of length \code{N} where each entry is an \link{MIndex} object (as would be
+  obtained from running \code{matchPDict} on each of the sequences).
 }
 
 \author{H. Pagès}
@@ -513,7 +521,7 @@ if (interactive()) {
   pdict2 <- PDict(probes2)
 
   ## Get the mapping from probes1 to probes2 (based on exact matching):
-  map1to2 <- vwhichPDict(pdict2, probes1) 
+  map1to2 <- vwhichPDict(pdict2, probes1)
 
   ## The following helper function uses the probe level mapping to induce
   ## the mapping at the probe set IDs level (from hgu95av2 to hgu133a).

diff --git a/src/match_pdict.c b/src/match_pdict.c
@@ -480,6 +480,28 @@ static SEXP vcount_XStringSet_XStringSet(SEXP pattern,
 	return ans;
 }
 
+SEXP vmatch_PDict3Parts_XStringSet_helper(SEXP pptb, HeadTail *headtail, MatchPDictBuf *mpdb,
+																					SEXP xstringset,
+																					SEXP max_mismatch, SEXP min_mismatch,
+																					SEXP fixed){
+	XStringSet_holder subject;
+	int subj_length;
+	Chars_holder subj_elt;
+
+	subject = _hold_XStringSet(xstringset);
+	subj_length = _get_length_from_XStringSet_holder(&subject);
+
+	SEXP RETURN_LIST = PROTECT(NEW_LIST(subj_length));
+	for(int i=0; i<subj_length; i++){
+		subj_elt = _get_elt_from_XStringSet_holder(&subject, i);
+		match_pdict(pptb, headtail, &subj_elt, max_mismatch, min_mismatch, fixed, mpdb);
+		SET_ELEMENT(RETURN_LIST, i, _MatchBuf_as_SEXP(&(mpdb->matches), R_NilValue));
+		_MatchPDictBuf_flush(mpdb);
+	}
+	UNPROTECT(1);
+	return RETURN_LIST;
+}
+
 /* --- .Call ENTRY POINT --- */
 SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail,
 		SEXP subject,
@@ -495,10 +517,12 @@ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail,
 	matchpdict_buf = new_MatchPDictBuf_from_PDict3Parts(matches_as,
 				pptb, pdict_head, pdict_tail);
 	switch (matchpdict_buf.matches.ms_code) {
-	    case MATCHES_AS_NULL:
-		error("vmatch_PDict3Parts_XStringSet() does not support "
-		      "'matches_as=\"%s\"' yet, sorry",
-		      CHAR(STRING_ELT(matches_as, 0)));
+			case MATCHES_AS_RANGES:
+			case MATCHES_AS_ENDS:
+	  return vmatch_PDict3Parts_XStringSet_helper(pptb, &headtail,
+	    																					&matchpdict_buf, subject,
+																								max_mismatch, min_mismatch,
+																								fixed);
 	    case MATCHES_AS_WHICH:
 		return vwhich_PDict3Parts_XStringSet(pptb, &headtail,
 				subject,
@@ -510,8 +534,11 @@ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail,
 				max_mismatch, min_mismatch, fixed,
 				collapse, weight,
 				&matchpdict_buf);
+			default:
+		error("vmatch_PDict3Parts_XStringSet() does not support "
+		     "'matches_as=\"%s\"' yet, sorry",
+		      CHAR(STRING_ELT(matches_as, 0)));
 	}
-	error("vmatchPDict() is not supported yet, sorry");
 	return R_NilValue;
 }
 
@@ -545,7 +572,8 @@ SEXP vmatch_XStringSet_XStringSet(SEXP pattern,
 				with_indels, fixed,
 				algorithm, collapse, weight);
 	}
-	error("vmatchPDict() is not supported yet, sorry");
+	error("vmatchPDict() on two XStringSets is not supported yet. "
+        "Please use a PDict object instead (see `?PDict` for more info).");
 	return R_NilValue;
 }
 
diff --git a/tests/testthat/test-MIndexListClass.R b/tests/testthat/test-MIndexListClass.R