Skip to content
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ Collate: utils.R
misc.R
SparseList-class.R
MIndex-class.R
MIndexList-class.R
lowlevel-matching.R
match-utils.R
matchPattern.R
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ exportMethods(

exportClasses(
#SparseList,
MIndex, ByPos_MIndex,
MIndex, ByPos_MIndex, MIndexList,
PreprocessedTB, Twobit, ACtree2,
PDict3Parts,
PDict, TB_PDict, MTB_PDict, Expanded_TB_PDict
Expand Down
76 changes: 76 additions & 0 deletions R/MIndexList-class.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
### =========================================================================
### MIndexList objects
### -------------------------------------------------------------------------
###

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The "MIndexList" class.
###
### This class serves as a base class to hold collections of MIndex objects.
###
### As with MIndex, in normal operations, the user should never need to create
### MIndex objects directly or to modify existing ones. Those objects are
### typically returned by a sequence matching/alignment function like
### vmatchPDict(). For this reason, coercion methods are relatively barebones
### and will throw errors rather than try to correct
###

setClass("MIndexList",
contains="CompressedList",
representation(
# "VIRTUAL",
unlistData="MIndex"
),
prototype(
elementType="MIndex"
)
)


.from_list_to_MIndexList <- function(from)
{
## keeping ans_class variable for potential future extensions
ans_class <- "MIndexList"

## ensure elements are all MIndex objects
if(!all(vapply(from, inherits, logical(1L), what="MIndex")))
stop("Only lists of MIndex objects can be coerced to MIndexList")
IRanges:::new_CompressedList_from_list(ans_class, from)
}

## barebones, but users shouldn't be calling this directly
## expecting to be used internally, list coercion provided for QoL
setAs("list", "MIndexList", function(from) .from_list_to_MIndexList(from))
setAs("List", "MIndexList", function(from) .from_list_to_MIndexList(as.list(from)))

.from_MIndexList_to_compact_char <- function(object){
lapply(object, \(m_ind){
paste(length(m_ind), "patterns,",
sum(lengths(m_ind)), "matches")
})
}

setMethod("show", "MIndexList",
function(object) {
lo <- length(object)
k <- min(5, length(object))
diffK <- lo - 5
cat(classNameForDisplay(object), " of length ", lo,
"\n", sep = "")

repr <- .from_MIndexList_to_compact_char(head(object, k))
IRanges:::.showAtomicList(CharacterList(repr), minLines=10L)
if (diffK > 0)
cat("...\n<", diffK,
ifelse(diffK == 1,
" more element>\n", " more elements>\n"),
sep="")
}
)

setMethod("showAsCell", "MIndexList",
function(object){
repr <- .from_MIndexList_to_compact_char(object)
showAsCell(CharacterList(repr))
}
)
69 changes: 57 additions & 12 deletions R/matchPDict.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
### > library(BSgenome.Hsapiens.UCSC.hg18)
### > chr1 <- Hsapiens$chr1
### > system.time(end_index <- endIndex(matchPDict(pdict, chr1)))
### user system elapsed
### user system elapsed
### 50.663 0.000 50.763
### > count_index <- sapply(end_index, length)
### > table(count_index)
Expand Down Expand Up @@ -64,8 +64,8 @@
### 1M 36 2.5 sec 717M 106 sec 103 sec 0
### 10M 36 56 sec 6724M 351 sec 200 sec 0
### 10M 12 7.5 sec 340M 227 sec 216 sec 100M
### 30M 12 27 sec 523M 491 sec ?
###
### 30M 12 27 sec 523M 491 sec ?
###
### III. Inexact matching
### ---------------------
### pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2)
Expand Down Expand Up @@ -561,10 +561,8 @@
if (!identical(weight, 1L))
warning("'weight' is ignored when 'collapse=FALSE'")
}
} else {
## vmatchPDict()
stop("vmatchPDict() is not supported yet, sorry")
}

## We are doing our own dispatch here, based on the type of 'pdict'.
## TODO: Revisit this. Would probably be a better design to use a
## generic/methods approach and rely on the standard dispatch mechanism.
Expand All @@ -584,7 +582,8 @@
max.mismatch, min.mismatch, with.indels, fixed,
algorithm, collapse, weight,
verbose, matches.as)
if (is.null(which_pp_excluded))
if (is.null(which_pp_excluded) && matches.as %in%
c("MATCHES_AS_WHICH", "MATCHES_AS_COUNTS"))
return(ans)
if (matches.as == "MATCHES_AS_WHICH") {
## vwhichPDict()
Expand All @@ -599,8 +598,51 @@
}
return(ans)
}
## vmatchPDict()
stop("vmatchPDict() is not supported yet, sorry")
if(matches.as == "MATCHES_AS_ENDS") {
## vmatchPDict() by ends
## converting all the results to MIndex objects and then wrapping the result
for(i in seq_along(ans)) {
ans[[i]] <- new("ByPos_MIndex",
width0=width(pdict),
NAMES=names(pdict),
ends=ans[[i]])
}

names(ans) <- names(subject)
ans <- as(ans, "MIndexList")
return(ans)
}
if(matches.as == "MATCHES_AS_RANGES") {
## vmatchPDict() by ranges
## This is the same as MATCHES_AS_ENDS, except ans[[i]] holds two lists
## - The first list has the start position for each range
## - The second list has the width of the range (including the start)

for(i in seq_along(ans)){
## First we have to move from starts to ends to align with the
## Pos_MIndex constructor
ans_start <- ans[[i]][[1]]
ans_width <- ans[[i]][[2]]
for(j in seq_along(ans_start)){
if(!is.null(ans_start[[j]])){
## Note that the width includes the start, so we subtract 1
ans_start[[j]] <- ans_start[[j]] + ans_width[[j]] - 1L
}
}
## Now ans_start holds the **end** position
## width(pdict) will be equivalent to ifelse(is.null(w), 0, w[1]) for
## w in ans_width
ans[[i]] <- new("ByPos_MIndex",
width0=width(pdict),
NAMES=names(pdict),
ends=ans_start)
}

names(ans) <- names(subject)
ans <- as(ans, "MIndexList")
return(ans)
}
warning("Failed to post-process output of .vmatch(pdict, subject)")
return(ans)
}

Expand Down Expand Up @@ -771,8 +813,8 @@ setMethod("whichPDict", "MaskedXString",
### 'lapply(subject, function(s) whichPDict(pdict, s, ...))'.
### The returned object is a list of length N.
### o vcountPDict(): returns an M x N matrix of integers.
### o vmatchPDict(): not supported yet! (first we need a container to
### store the results)
### o vmatchPDict(): returns an MIndexList (CompressedList of MIndex objects)
### This contains N MIndex objects, each with up to M entries
###

setGeneric("vmatchPDict", signature="subject",
Expand All @@ -787,7 +829,10 @@ setMethod("vmatchPDict", "ANY",
function(pdict, subject,
max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE,
algorithm="auto", verbose=FALSE)
stop("vmatchPDict() is not ready yet, sorry")
.vmatchPDict(pdict, subject,
max.mismatch, min.mismatch, with.indels, fixed,
algorithm, collapse=FALSE, weight=1L,
verbose, matches.as="MATCHES_AS_ENDS")
)

### Dispatch on 'subject' (see signature of generic).
Expand Down
18 changes: 13 additions & 5 deletions man/matchPDict-exact.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@
and \code{whichPDict} returns the "who" information i.e. which patterns
in the input dictionary have at least one match.

\code{vcountPDict} and \code{vwhichPDict} are vectorized versions
of \code{countPDict} and \code{whichPDict}, respectively, that is,
they work on a set of reference sequences in a vectorized fashion.
\code{vcountPDict}, \code{vwhichPDict}, and \code{vmatchPDict} are vectorized
versions of \code{countPDict}, \code{whichPDict}, and \code{matchPDict}
respectively, that is, they work on a set of reference sequences in a
vectorized fashion.

This man page shows how to use these functions (aka the \code{*PDict}
functions) for exact matching of a constant width dictionary i.e.
Expand Down Expand Up @@ -86,6 +87,9 @@ vcountPDict(pdict, subject,
vwhichPDict(pdict, subject,
max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE,
algorithm="auto", verbose=FALSE)
vmatchPDict(pdict, subject,
max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE,
algorithm="auto", verbose=FALSE)
}

\arguments{
Expand All @@ -111,7 +115,7 @@ vwhichPDict(pdict, subject,
\code{whichPDict}.

An \link{XStringSet} object containing the subject sequences
for \code{vcountPDict} and \code{vwhichPDict}.
for \code{vmatchPDict}, \code{vcountPDict}, \and code{vwhichPDict}.

If \code{pdict} is a \link{PDict} object (i.e. a preprocessed
dictionary), then \code{subject} must be of base class \link{DNAString}.
Expand Down Expand Up @@ -210,6 +214,10 @@ vwhichPDict(pdict, subject,
(see above for the details).

\code{vwhichPDict} returns a list of \code{N} integer vectors.

\code{vmatchPDict} returns an \link{MIndexList}, which is essentially a list
of length \code{N} where each entry is an \link{MIndex} object (as would be
obtained from running \code{matchPDict} on each of the sequences).
}

\author{H. Pagès}
Expand Down Expand Up @@ -513,7 +521,7 @@ if (interactive()) {
pdict2 <- PDict(probes2)

## Get the mapping from probes1 to probes2 (based on exact matching):
map1to2 <- vwhichPDict(pdict2, probes1)
map1to2 <- vwhichPDict(pdict2, probes1)

## The following helper function uses the probe level mapping to induce
## the mapping at the probe set IDs level (from hgu95av2 to hgu133a).
Expand Down
40 changes: 34 additions & 6 deletions src/match_pdict.c
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,28 @@ static SEXP vcount_XStringSet_XStringSet(SEXP pattern,
return ans;
}

SEXP vmatch_PDict3Parts_XStringSet_helper(SEXP pptb, HeadTail *headtail, MatchPDictBuf *mpdb,
SEXP xstringset,
SEXP max_mismatch, SEXP min_mismatch,
SEXP fixed){
XStringSet_holder subject;
int subj_length;
Chars_holder subj_elt;

subject = _hold_XStringSet(xstringset);
subj_length = _get_length_from_XStringSet_holder(&subject);

SEXP RETURN_LIST = PROTECT(NEW_LIST(subj_length));
for(int i=0; i<subj_length; i++){
subj_elt = _get_elt_from_XStringSet_holder(&subject, i);
match_pdict(pptb, headtail, &subj_elt, max_mismatch, min_mismatch, fixed, mpdb);
SET_ELEMENT(RETURN_LIST, i, _MatchBuf_as_SEXP(&(mpdb->matches), R_NilValue));
_MatchPDictBuf_flush(mpdb);
}
UNPROTECT(1);
return RETURN_LIST;
}

/* --- .Call ENTRY POINT --- */
SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail,
SEXP subject,
Expand All @@ -495,10 +517,12 @@ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail,
matchpdict_buf = new_MatchPDictBuf_from_PDict3Parts(matches_as,
pptb, pdict_head, pdict_tail);
switch (matchpdict_buf.matches.ms_code) {
case MATCHES_AS_NULL:
error("vmatch_PDict3Parts_XStringSet() does not support "
"'matches_as=\"%s\"' yet, sorry",
CHAR(STRING_ELT(matches_as, 0)));
case MATCHES_AS_RANGES:
case MATCHES_AS_ENDS:
return vmatch_PDict3Parts_XStringSet_helper(pptb, &headtail,
&matchpdict_buf, subject,
max_mismatch, min_mismatch,
fixed);
case MATCHES_AS_WHICH:
return vwhich_PDict3Parts_XStringSet(pptb, &headtail,
subject,
Expand All @@ -510,8 +534,11 @@ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail,
max_mismatch, min_mismatch, fixed,
collapse, weight,
&matchpdict_buf);
default:
error("vmatch_PDict3Parts_XStringSet() does not support "
"'matches_as=\"%s\"' yet, sorry",
CHAR(STRING_ELT(matches_as, 0)));
}
error("vmatchPDict() is not supported yet, sorry");
return R_NilValue;
}

Expand Down Expand Up @@ -545,7 +572,8 @@ SEXP vmatch_XStringSet_XStringSet(SEXP pattern,
with_indels, fixed,
algorithm, collapse, weight);
}
error("vmatchPDict() is not supported yet, sorry");
error("vmatchPDict() on two XStringSets is not supported yet. "
"Please use a PDict object instead (see `?PDict` for more info).");
return R_NilValue;
}

Empty file.
Loading
Loading