From 316f992ab13bdb97f5ea76ff2de9c99282680f4d Mon Sep 17 00:00:00 2001 From: ahl27 Date: Fri, 7 Jun 2024 14:57:41 -0400 Subject: [PATCH 1/7] first-pass fix at supporting extended raw character sets --- R/XString-class.R | 21 +++++++++++++++++++++ R/zzz.R | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/R/XString-class.R b/R/XString-class.R index 5e6333cc..a70722f0 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -139,6 +139,27 @@ setMethod("extract_character_from_XString_by_ranges", "XString", } ) +### Extended Raw String method. +setMethod("extract_character_from_XString_by_ranges", "BString", + function(x, start, width, collapse=FALSE) + { + ## This is a little hacky -- ideally we'd have a codec object + ## not sure if that's possible with multi-byte characters + ## We also are going to need to support multiple substrings + if(getOption("Biostrings.showRaw")){ + ## Get strings + ss <- XVector:::extract_character_from_XRaw_by_ranges(x, start, width, + collapse=collapse, + lkup=1:256) + ## convert to unicode representation + vapply(ss, \(x) paste(parse(text=paste0("'\\U28",charToRaw(x),"'")), collapse=''), character(1L)) + } else { + ## Run XString Method + callNextMethod() + } + } +) + ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### make_XString_from_string() diff --git a/R/zzz.R b/R/zzz.R index 65e22402..e090fbf4 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -16,6 +16,10 @@ option_name <- "Biostrings.coloring" if (!(option_name %in% names(.Options))) options(setNames(list(TRUE), option_name)) + + option_name <- "Biostrings.showRaw" + if (!(option_name %in% names(.Options))) + options(setNames(list(FALSE), option_name)) } .onUnload <- function(libpath) From 48bc6c71d522211f7258e8b357378f5a6810488e Mon Sep 17 00:00:00 2001 From: ahl27 Date: Fri, 7 Jun 2024 17:56:34 -0400 Subject: [PATCH 2/7] fixes issue where only values 0:254 were supported --- R/XString-class.R | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/R/XString-class.R b/R/XString-class.R index a70722f0..f0fa61e7 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -143,16 +143,31 @@ setMethod("extract_character_from_XString_by_ranges", "XString", setMethod("extract_character_from_XString_by_ranges", "BString", function(x, start, width, collapse=FALSE) { - ## This is a little hacky -- ideally we'd have a codec object - ## not sure if that's possible with multi-byte characters - ## We also are going to need to support multiple substrings + ## This is a little hacky -- working with 0s is challenging + ## due to how extract_character_from_XRaw_by_ranges works + ## instead we double map to correctly place zeros if(getOption("Biostrings.showRaw")){ - ## Get strings + ## Get strings, map 0 -> 1 and all others to themselves ss <- XVector:::extract_character_from_XRaw_by_ranges(x, start, width, collapse=collapse, - lkup=1:256) - ## convert to unicode representation - vapply(ss, \(x) paste(parse(text=paste0("'\\U28",charToRaw(x),"'")), collapse=''), character(1L)) + lkup=c(1L,1:255)) + ## Run a second time, mapping 0 -> 0x0F and all others to 0xF0 + ss_onlyzero <- XVector:::extract_character_from_XRaw_by_ranges(x, start, width, + collapse=collapse, + lkup=c(0x0FL,rep(0xF0L, 255))) + + ## Convert to unicode representation + ## for loop because collapse=FALSE could return an object w length > 1 + bitmask <- as.raw(0x0FL) + for(i in seq_along(ss)){ + ## this makes ss_onlyzero map 0 -> 0, n -> 0xFF for n!=0 + ## so then ss & ss_onlyzero = 0 if (ss[i]) == 0 else ss[i] + tmp <- charToRaw(ss[i]) & xor(charToRaw(ss_onlyzero[i]), bitmask) + + ## then we convert the values to the corresponding unicode + ss[i] <- paste(parse(text=paste0("'\\U28", tmp, "'")), collapse='') + } + ss } else { ## Run XString Method callNextMethod() From 3690271a7595a632241a58e6550685917d0bee4c Mon Sep 17 00:00:00 2001 From: ahl27 Date: Sat, 8 Jun 2024 14:57:07 -0400 Subject: [PATCH 3/7] using a codec instead of the previous solution --- R/XString-class.R | 37 ------------------------------------- R/XStringCodec-class.R | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 37 deletions(-) diff --git a/R/XString-class.R b/R/XString-class.R index f0fa61e7..a29cfe93 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -139,43 +139,6 @@ setMethod("extract_character_from_XString_by_ranges", "XString", } ) -### Extended Raw String method. -setMethod("extract_character_from_XString_by_ranges", "BString", - function(x, start, width, collapse=FALSE) - { - ## This is a little hacky -- working with 0s is challenging - ## due to how extract_character_from_XRaw_by_ranges works - ## instead we double map to correctly place zeros - if(getOption("Biostrings.showRaw")){ - ## Get strings, map 0 -> 1 and all others to themselves - ss <- XVector:::extract_character_from_XRaw_by_ranges(x, start, width, - collapse=collapse, - lkup=c(1L,1:255)) - ## Run a second time, mapping 0 -> 0x0F and all others to 0xF0 - ss_onlyzero <- XVector:::extract_character_from_XRaw_by_ranges(x, start, width, - collapse=collapse, - lkup=c(0x0FL,rep(0xF0L, 255))) - - ## Convert to unicode representation - ## for loop because collapse=FALSE could return an object w length > 1 - bitmask <- as.raw(0x0FL) - for(i in seq_along(ss)){ - ## this makes ss_onlyzero map 0 -> 0, n -> 0xFF for n!=0 - ## so then ss & ss_onlyzero = 0 if (ss[i]) == 0 else ss[i] - tmp <- charToRaw(ss[i]) & xor(charToRaw(ss_onlyzero[i]), bitmask) - - ## then we convert the values to the corresponding unicode - ss[i] <- paste(parse(text=paste0("'\\U28", tmp, "'")), collapse='') - } - ss - } else { - ## Run XString Method - callNextMethod() - } - } -) - - ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### make_XString_from_string() ### diff --git a/R/XStringCodec-class.R b/R/XStringCodec-class.R index 18cf2da8..fb74c245 100644 --- a/R/XStringCodec-class.R +++ b/R/XStringCodec-class.R @@ -221,5 +221,40 @@ AA_CODES <- AAcodes(FALSE) AA_STRING_CODEC <- .XStringCodec.AA(AA_CODES) +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### RawString codec, for showing BStrings with values in 0-255 +### + +RAWcodes <- function(baseOnly){ + # baseOnly argument kept for consistency with other similar XCodes() functions + if (!isTRUEorFALSE(baseOnly)) + stop("'baseOnly' must be TRUE or FALSE") + letters <- 0:255 + letterMap <- 0:255 + + # Non-printable characters can map to space + # non-printable: 0-31, 127, 160, 173 + # unused: 129, 141, 143-144, 157 + #non_printable <- c(seq_len(32), 127, 129, 141, 143, 144, 160, 157, 173) + non_printable <- c(seq_len(32), seq(128,256)) + letterMap[non_printable] <- 32L + + setNames(letters, intToUtf8(letterMap, multiple=TRUE)) +} + +.XStringCodec.Raw <- function(codes){ + letters <- names(codes) + x <- new("XStringCodec", letters[33:127], codes[33:127]) + + # codec should only be used for show, not for encoding + x@enc_lkup <- 0:255 + x@dec_lkup <- c(x@dec_lkup, rep(NA, 129)) + x@dec_lkup[is.na(x@dec_lkup)] <- 32L + x +} + +RAW_CODES <- RAWcodes(FALSE) +RAW_STRING_CODEC <- .XStringCodec.Raw(RAW_CODES) + ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### Add extra codecs below... From 14205f1117621f5967fc75976000be54e3c169ad Mon Sep 17 00:00:00 2001 From: ahl27 Date: Sat, 8 Jun 2024 17:48:38 -0400 Subject: [PATCH 4/7] moving back away from codecs --- R/XString-class.R | 45 ++++++++++++++++++++++++++++++++++++++++++ R/XStringCodec-class.R | 36 --------------------------------- R/zzz.R | 18 +++++++++++++++++ 3 files changed, 63 insertions(+), 36 deletions(-) diff --git a/R/XString-class.R b/R/XString-class.R index a29cfe93..2b9e7e93 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -139,6 +139,51 @@ setMethod("extract_character_from_XString_by_ranges", "XString", } ) +### BString methods to support 0:255 input + +## BSTRING_RAW_LOOKUP is initialized in `zzz.R` +## this value is just a backup on the offchance `zzz.R:.onLoad` fails +BSTRING_RAW_LOOKUP <- rawToChar(as.raw(0:255), multiple=TRUE) +setMethod("extract_character_from_XString_by_ranges", "BString", + function(x, start, width, collapse=FALSE) + { + SHOW_RAW <- getOption("Biostrings.showRaw") + if(!SHOW_RAW) callNextMethod() + lkup <- xs_dec_lkup(x) + + ## need to remap null bytes, they have to be in 0:255 + ## so we have to overload some value + if(is.null(lkup)) lkup <- c(255L,1:255) + xs <- XVector:::extract_character_from_XRaw_by_ranges(x, start, width, + collapse=collapse, + lkup=lkup) + ## replace all undisplayable characters + for(i in seq_along(xs)) + xs[i] <- paste(BSTRING_RAW_LOOKUP[as.integer(charToRaw(xs[i]))+1L], collapse='') + xs + } +) + +setMethod("extract_character_from_XString_by_positions", "BString", + function(x, pos, collapse=FALSE) + { + SHOW_RAW <- getOption("Biostrings.showRaw") + if(!SHOW_RAW) callNextMethod() + lkup <- xs_dec_lkup(x) + + ## need to remap null bytes, they have to be in 0:255 + ## so we have to overload some value + if(is.null(lkup)) lkup <- c(255L,1:255) + xs <- XVector:::extract_character_from_XRaw_by_positions(x, pos, + collapse=collapse, + lkup=lkup) + ## replace all undisplayable characters + for(i in seq_along(xs)) + xs[i] <- paste(BSTRING_RAW_LOOKUP[as.integer(charToRaw(xs[i]))+1L], collapse='') + xs + } +) + ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### make_XString_from_string() ### diff --git a/R/XStringCodec-class.R b/R/XStringCodec-class.R index fb74c245..8be5cbdb 100644 --- a/R/XStringCodec-class.R +++ b/R/XStringCodec-class.R @@ -220,41 +220,5 @@ AAcodes <- function(baseOnly) AA_CODES <- AAcodes(FALSE) AA_STRING_CODEC <- .XStringCodec.AA(AA_CODES) - -### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### RawString codec, for showing BStrings with values in 0-255 -### - -RAWcodes <- function(baseOnly){ - # baseOnly argument kept for consistency with other similar XCodes() functions - if (!isTRUEorFALSE(baseOnly)) - stop("'baseOnly' must be TRUE or FALSE") - letters <- 0:255 - letterMap <- 0:255 - - # Non-printable characters can map to space - # non-printable: 0-31, 127, 160, 173 - # unused: 129, 141, 143-144, 157 - #non_printable <- c(seq_len(32), 127, 129, 141, 143, 144, 160, 157, 173) - non_printable <- c(seq_len(32), seq(128,256)) - letterMap[non_printable] <- 32L - - setNames(letters, intToUtf8(letterMap, multiple=TRUE)) -} - -.XStringCodec.Raw <- function(codes){ - letters <- names(codes) - x <- new("XStringCodec", letters[33:127], codes[33:127]) - - # codec should only be used for show, not for encoding - x@enc_lkup <- 0:255 - x@dec_lkup <- c(x@dec_lkup, rep(NA, 129)) - x@dec_lkup[is.na(x@dec_lkup)] <- 32L - x -} - -RAW_CODES <- RAWcodes(FALSE) -RAW_STRING_CODEC <- .XStringCodec.Raw(RAW_CODES) - ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### Add extra codecs below... diff --git a/R/zzz.R b/R/zzz.R index e090fbf4..afdc1593 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -20,6 +20,24 @@ option_name <- "Biostrings.showRaw" if (!(option_name %in% names(.Options))) options(setNames(list(FALSE), option_name)) + + ## BString lookup for raw strings + ## 256 char lookup table for 0:255 (note off by one) + ## characters 0-31 and 127-255 are not displayable + ## so positions 1-32 and 128-256 should be replaced + encoding_details <- l10n_info() + bstring_lookup <- rawToChar(as.raw(0:255), multiple=TRUE) + invalid_chars <- c(1:32,128:256) + if(encoding_details$`UTF-8`){ + # braille is nice if supported + # allows for char comparisons after as.character() comparisons + bstring_lookup[invalid_chars] <- as.character(parse(text=paste0("'\\U28", as.raw(95:255), "'"))) + } else if (encoding_details$MBCS){ + # use multibyte question mark if supported + compact_unknown <- rawToChar(as.raw(c(0xef, 0xbf, 0xbd))) + bstring_lookup[invalid_chars] <- compact_unknown + } + BSTRING_RAW_LOOKUP <<- bstring_lookup } .onUnload <- function(libpath) From ef2fca11fb4281965b2a01c3c9da99e8b0e5b255 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Sat, 8 Jun 2024 17:51:24 -0400 Subject: [PATCH 5/7] minor formatting fixes --- R/XString-class.R | 6 ++++-- R/XStringCodec-class.R | 1 + R/zzz.R | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/R/XString-class.R b/R/XString-class.R index 2b9e7e93..8386b196 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -159,7 +159,8 @@ setMethod("extract_character_from_XString_by_ranges", "BString", lkup=lkup) ## replace all undisplayable characters for(i in seq_along(xs)) - xs[i] <- paste(BSTRING_RAW_LOOKUP[as.integer(charToRaw(xs[i]))+1L], collapse='') + xs[i] <- paste(BSTRING_RAW_LOOKUP[as.integer(charToRaw(xs[i]))+1L], + collapse='') xs } ) @@ -179,7 +180,8 @@ setMethod("extract_character_from_XString_by_positions", "BString", lkup=lkup) ## replace all undisplayable characters for(i in seq_along(xs)) - xs[i] <- paste(BSTRING_RAW_LOOKUP[as.integer(charToRaw(xs[i]))+1L], collapse='') + xs[i] <- paste(BSTRING_RAW_LOOKUP[as.integer(charToRaw(xs[i]))+1L], + collapse='') xs } ) diff --git a/R/XStringCodec-class.R b/R/XStringCodec-class.R index 8be5cbdb..18cf2da8 100644 --- a/R/XStringCodec-class.R +++ b/R/XStringCodec-class.R @@ -220,5 +220,6 @@ AAcodes <- function(baseOnly) AA_CODES <- AAcodes(FALSE) AA_STRING_CODEC <- .XStringCodec.AA(AA_CODES) + ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### Add extra codecs below... diff --git a/R/zzz.R b/R/zzz.R index afdc1593..04bf4b8d 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -31,7 +31,8 @@ if(encoding_details$`UTF-8`){ # braille is nice if supported # allows for char comparisons after as.character() comparisons - bstring_lookup[invalid_chars] <- as.character(parse(text=paste0("'\\U28", as.raw(95:255), "'"))) + bstring_lookup[invalid_chars] <- + as.character(parse(text=paste0("'\\U28", as.raw(95:255), "'"))) } else if (encoding_details$MBCS){ # use multibyte question mark if supported compact_unknown <- rawToChar(as.raw(c(0xef, 0xbf, 0xbd))) From 734f6c4d1edf59070ba1da66cfbc57679189b8d9 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Mon, 6 Jan 2025 12:42:27 -0500 Subject: [PATCH 6/7] cleans up some methods, removes <<- assignment, disable braille --- R/XString-class.R | 21 +++++++++++++++++---- R/zzz.R | 21 ++++++++++++++------- tests/testthat/test-XString-class.R | 23 +++++++++++++++++++++++ 3 files changed, 54 insertions(+), 11 deletions(-) diff --git a/R/XString-class.R b/R/XString-class.R index 8386b196..0d30bfc2 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -142,13 +142,19 @@ setMethod("extract_character_from_XString_by_ranges", "XString", ### BString methods to support 0:255 input ## BSTRING_RAW_LOOKUP is initialized in `zzz.R` -## this value is just a backup on the offchance `zzz.R:.onLoad` fails -BSTRING_RAW_LOOKUP <- rawToChar(as.raw(0:255), multiple=TRUE) setMethod("extract_character_from_XString_by_ranges", "BString", function(x, start, width, collapse=FALSE) { SHOW_RAW <- getOption("Biostrings.showRaw") + if(!is.logical(SHOW_RAW)){ + warning("Invalid value for option 'Biostrings.showRaw', ", + "resetting to FALSE") + SHOW_RAW <- FALSE + options(Biostrings.showRaw=FALSE) + } if(!SHOW_RAW) callNextMethod() + + bstring_lookup <- get("BSTRING_RAW_LOOKUP", envir=.pkgenv) lkup <- xs_dec_lkup(x) ## need to remap null bytes, they have to be in 0:255 @@ -159,7 +165,7 @@ setMethod("extract_character_from_XString_by_ranges", "BString", lkup=lkup) ## replace all undisplayable characters for(i in seq_along(xs)) - xs[i] <- paste(BSTRING_RAW_LOOKUP[as.integer(charToRaw(xs[i]))+1L], + xs[i] <- paste(bstring_lookup[as.integer(charToRaw(xs[i]))+1L], collapse='') xs } @@ -169,8 +175,15 @@ setMethod("extract_character_from_XString_by_positions", "BString", function(x, pos, collapse=FALSE) { SHOW_RAW <- getOption("Biostrings.showRaw") + if(!is.logical(SHOW_RAW)){ + warning("Invalid value for option 'Biostrings.showRaw', ", + "resetting to FALSE") + SHOW_RAW <- FALSE + options(Biostrings.showRaw=FALSE) + } if(!SHOW_RAW) callNextMethod() lkup <- xs_dec_lkup(x) + bstring_lookup <- get("BSTRING_RAW_LOOKUP", envir=.pkgenv) ## need to remap null bytes, they have to be in 0:255 ## so we have to overload some value @@ -180,7 +193,7 @@ setMethod("extract_character_from_XString_by_positions", "BString", lkup=lkup) ## replace all undisplayable characters for(i in seq_along(xs)) - xs[i] <- paste(BSTRING_RAW_LOOKUP[as.integer(charToRaw(xs[i]))+1L], + xs[i] <- paste(bstring_lookup[as.integer(charToRaw(xs[i]))+1L], collapse='') xs } diff --git a/R/zzz.R b/R/zzz.R index 04bf4b8d..6b03cfec 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,5 +1,7 @@ ### +.pkgenv <- new.env(parent=emptyenv()) + .onLoad <- function(libname, pkgname) { .Call2("init_DNAlkups", @@ -28,17 +30,22 @@ encoding_details <- l10n_info() bstring_lookup <- rawToChar(as.raw(0:255), multiple=TRUE) invalid_chars <- c(1:32,128:256) - if(encoding_details$`UTF-8`){ - # braille is nice if supported - # allows for char comparisons after as.character() comparisons - bstring_lookup[invalid_chars] <- - as.character(parse(text=paste0("'\\U28", as.raw(95:255), "'"))) - } else if (encoding_details$MBCS){ + # if(encoding_details$`UTF-8`){ + # # braille is nice if supported + # # allows for char comparisons after as.character() comparisons + # # I think it's overkill, though...uncomment this section if we need it + # bstring_lookup[invalid_chars] <- + # as.character(parse(text=paste0("'\\U28", as.raw(95:255), "'"))) + # } else + if (encoding_details$MBCS){ # use multibyte question mark if supported compact_unknown <- rawToChar(as.raw(c(0xef, 0xbf, 0xbd))) bstring_lookup[invalid_chars] <- compact_unknown + } else { + # otherwise just use the regular '?' + bstring_lookup[invalid_chars] <- "?" } - BSTRING_RAW_LOOKUP <<- bstring_lookup + assign("BSTRING_RAW_LOOKUP", bstring_lookup, envir=.pkgenv) } .onUnload <- function(libpath) diff --git a/tests/testthat/test-XString-class.R b/tests/testthat/test-XString-class.R index 6253f758..de946148 100644 --- a/tests/testthat/test-XString-class.R +++ b/tests/testthat/test-XString-class.R @@ -250,6 +250,29 @@ test_that("reverse, complement, reverseComplement work correctly", { expect_equal(as.character(reverseComplement(mrna)), .revString(mr_comp)) }) +test_that("BStrings display correctly with full 0-255 value range", { + orig_setting <- getOption("Biostrings.showRaw") + full_bstring <- as(as(as.raw(0:255),"XRaw"),"BString") + options(Biostrings.showRaw=FALSE) + expect_error(extract_character_from_XString_by_ranges(full_bstring, 1L, 256L), + "embedded nul in string") + + ## can't really test MBCS vs. non-MBCS because we can't guarantee + ## the test suites will run on a platform with(out) MBCS + options(Biostrings.showRaw=TRUE) + expect_is(extract_character_from_XString_by_ranges(full_bstring, 1L, 256L), + "character") + + options(Biostrings.showRaw=10) + expect_warning(extract_character_from_XString_by_ranges(BString("ABCD"), 1L, 4L), + "Invalid value for option 'Biostrings.showRaw'") + expect_false(getOption("Biostrings.showRaw")) + + ## make sure we leave the system as we found it + options(Biostrings.showRaw=orig_setting) + +}) + ## Porting RUnit tests test_that("alphabet finds the correct values", { expect_equal(alphabet(DNAString(dnastr)), strsplit(dnastr, "")[[1]]) From 7e86bf25f0416209f74abc49e4925b1daed7de19 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Mon, 6 Jan 2025 13:06:41 -0500 Subject: [PATCH 7/7] fixes bug where Biostrings.showRaw couldn't be turned off, adds note to documentation --- R/XString-class.R | 67 ++++++++++++++++++++++++-------------------- man/XString-class.Rd | 22 ++++++++++++++- 2 files changed, 57 insertions(+), 32 deletions(-) diff --git a/R/XString-class.R b/R/XString-class.R index 0d30bfc2..fbb85294 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -152,22 +152,24 @@ setMethod("extract_character_from_XString_by_ranges", "BString", SHOW_RAW <- FALSE options(Biostrings.showRaw=FALSE) } - if(!SHOW_RAW) callNextMethod() - - bstring_lookup <- get("BSTRING_RAW_LOOKUP", envir=.pkgenv) - lkup <- xs_dec_lkup(x) - - ## need to remap null bytes, they have to be in 0:255 - ## so we have to overload some value - if(is.null(lkup)) lkup <- c(255L,1:255) - xs <- XVector:::extract_character_from_XRaw_by_ranges(x, start, width, - collapse=collapse, - lkup=lkup) - ## replace all undisplayable characters - for(i in seq_along(xs)) - xs[i] <- paste(bstring_lookup[as.integer(charToRaw(xs[i]))+1L], - collapse='') - xs + if(!SHOW_RAW){ + callNextMethod() + } else { + bstring_lookup <- get("BSTRING_RAW_LOOKUP", envir=.pkgenv) + lkup <- xs_dec_lkup(x) + + ## need to remap null bytes, they have to be in 0:255 + ## so we have to overload some value + if(is.null(lkup)) lkup <- c(255L,1:255) + xs <- XVector:::extract_character_from_XRaw_by_ranges(x, start, width, + collapse=collapse, + lkup=lkup) + ## replace all undisplayable characters + for(i in seq_along(xs)) + xs[i] <- paste(bstring_lookup[as.integer(charToRaw(xs[i]))+1L], + collapse='') + xs + } } ) @@ -181,21 +183,24 @@ setMethod("extract_character_from_XString_by_positions", "BString", SHOW_RAW <- FALSE options(Biostrings.showRaw=FALSE) } - if(!SHOW_RAW) callNextMethod() - lkup <- xs_dec_lkup(x) - bstring_lookup <- get("BSTRING_RAW_LOOKUP", envir=.pkgenv) - - ## need to remap null bytes, they have to be in 0:255 - ## so we have to overload some value - if(is.null(lkup)) lkup <- c(255L,1:255) - xs <- XVector:::extract_character_from_XRaw_by_positions(x, pos, - collapse=collapse, - lkup=lkup) - ## replace all undisplayable characters - for(i in seq_along(xs)) - xs[i] <- paste(bstring_lookup[as.integer(charToRaw(xs[i]))+1L], - collapse='') - xs + if(!SHOW_RAW){ + callNextMethod() + } else { + lkup <- xs_dec_lkup(x) + bstring_lookup <- get("BSTRING_RAW_LOOKUP", envir=.pkgenv) + + ## need to remap null bytes, they have to be in 0:255 + ## so we have to overload some value + if(is.null(lkup)) lkup <- c(255L,1:255) + xs <- XVector:::extract_character_from_XRaw_by_positions(x, pos, + collapse=collapse, + lkup=lkup) + ## replace all undisplayable characters + for(i in seq_along(xs)) + xs[i] <- paste(bstring_lookup[as.integer(charToRaw(xs[i]))+1L], + collapse='') + xs + } } ) diff --git a/man/XString-class.Rd b/man/XString-class.Rd index f1f8f3b9..96b67c2a 100644 --- a/man/XString-class.Rd +++ b/man/XString-class.Rd @@ -105,7 +105,7 @@ \describe{ \item{\code{alphabet(x)}:}{ \code{NULL} for a \code{BString} object. - See the corresponding man pages when \code{x} is a + See the corresponding man pages when \code{x} is a \link{DNAString}, \link{RNAString} or \link{AAString} object. } \item{\code{length(x)}:}{ or \code{nchar(x)}: @@ -167,6 +167,14 @@ } } +\note{ +BString objects can technically hold any valid ASCII code, which includes all values in the range 0:255. However, non-displayable characters may cause the BString object to display in a weird format (see Examples for one such case). Even worse, if a 0 value is included in the BString, attempting to display it will throw a cryptic error that looks like this: + +\code{Error in XVector:::extract_character_from_XRaw_by_ranges: embedded nul in string: ...} + +BString objects are intended to hold displayable characters, so this shouldn't be an issue for most cases. However, if you need BString objects to hold non-displayable values, you can set \code{options(Biostrings.showRaw=TRUE)} to fix the formatting of non-displayable characters. Note that this adds some overhead to methods that show BString objects. +} + \author{H. Pagès} \seealso{ @@ -214,6 +222,18 @@ identical(b, 1:length(b)) # FALSE ## Compacting. As a particular type of XVector objects, XString ## objects can optionally be compacted. Compacting is done typically ## before serialization. See ?compact for more information. + +## Non-displayable characters in BStrings +## BString objects support any value, though this isn't encouraged: +b_bad <- as(as(as.raw(c(10,65,10,66,10,67,68)), "XRaw"), "BString") +b_bad ## formatting is all messed up because 10 = \n + +## if you really need to display characters like this, set this option: +options(Biostrings.showRaw=TRUE) +b_bad ## now all 7 characters "display" + +## reseting to default +options(Biostrings.showRaw=FALSE) } \keyword{methods}