diff --git a/DESCRIPTION b/DESCRIPTION index 4c4f60b6..a18e60bd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,8 @@ Suggests: tinytest, covr, knitr, - rmarkdown + rmarkdown, + arrow Collate: 'clean_ProteinProspector.R' 'clean_Metamorpheus.R' diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index e972eb88..97272d0e 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -1,62 +1,183 @@ #' Clean raw Diann files #' @param msstats_object an object of class `MSstatsDIANNFiles`. #' @param MBR True if analysis was done with match between runs -#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities. 'FragmentQuantRaw' can be used instead. +#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. +#' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity. #' @return data.table #' @importFrom stats na.omit #' @keywords internal -.cleanRawDIANN = function(msstats_object, MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected") { - dn_input = getInputFile(msstats_object, "input") - dn_input = data.table::as.data.table(dn_input) - - if (!is.element("PrecursorMz", colnames(dn_input))) { - dn_input[, PrecursorMz := NA] - } - if (!is.element('FragmentInfo', colnames(dn_input))) { - dn_input[, FragmentInfo := NA] - } - req_cols = c('ProteinNames', 'StrippedSequence', - 'ModifiedSequence', 'PrecursorCharge', - quantificationColumn, 'QValue', - 'PrecursorMz', 'FragmentInfo', 'Run') - if (MBR) { - req_cols = c(req_cols, c('LibQValue', 'LibPGQValue')) - } else{ - req_cols = c(req_cols, c('GlobalQValue', 'GlobalPGQValue')) - } - dn_input = dn_input[, req_cols, with = FALSE] - dn_input = dn_input[, lapply(.SD, function(x) unlist(tstrsplit(x, ";"))), - .SDcols = c(quantificationColumn, "FragmentInfo"), - by = setdiff(colnames(dn_input), c("FragmentInfo", quantificationColumn))] - if (all(is.na(dn_input[["FragmentInfo"]]))) { - dn_input[, FragmentInfo := paste0("Frag", 1:.N), - by = c("ProteinNames", "ModifiedSequence", "PrecursorCharge", "Run")] - } - dn_input[, (quantificationColumn) := lapply(.SD, as.numeric), .SDcols = quantificationColumn] - dn_input[, FragmentIon := sub('\\^\\.\\*', '', FragmentInfo)] - if (any(grepl("/", dn_input$FragmentInfo))) { - dn_input[, ProductCharge := unlist(strsplit(FragmentInfo, split = "/"))[[1]], by = FragmentInfo] - dn_input[, ProductCharge := strtoi(sub("\\.\\*\\^", "", ProductCharge))] - } else { - dn_input[, ProductCharge := 1] - } - dn_input = dn_input[!grepl("NH3", FragmentIon), ] - dn_input = dn_input[!grepl("H2O", FragmentIon), ] - dn_input = na.omit(dn_input, cols = quantificationColumn) - data.table::setnames(dn_input, old = c('ProteinNames', 'StrippedSequence', - 'ModifiedSequence','PrecursorCharge', - quantificationColumn, 'QValue', - 'PrecursorMz', 'FragmentIon','Run', - 'ProductCharge'), - new = c('ProteinName', 'PeptideSequence', - 'PeptideModifiedSequence','PrecursorCharge', - 'Intensity', 'DetectionQValue', - 'PrecursorMz', 'FragmentIon','Run', - 'ProductCharge'), - skip_absent = TRUE) - dn_input[, PeptideSequence := NULL] - setnames(dn_input, "PeptideModifiedSequence", "PeptideSequence") - .logSuccess("DIANN", "clean") - dn_input +.cleanRawDIANN <- function(msstats_object, MBR = TRUE, + quantificationColumn = "FragmentQuantCorrected") { + dn_input <- getInputFile(msstats_object, "input") + dn_input <- data.table::as.data.table(dn_input) + + # Process quantification columns + quantificationColumn <- .cleanDIANNProcessQuantificationColumns(dn_input, quantificationColumn) + + # Add missing columns + dn_input <- .cleanDIANNAddMissingColumns(dn_input) + + # Select required columns + dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR) + + # Split concatenated values + dn_input <- .cleanDIANNSplitConcatenatedValues(dn_input, quantificationColumn) + + # Process fragment information + dn_input <- .cleanDIANNProcessFragmentInfo(dn_input, quantificationColumn) + + # Clean and filter data + dn_input <- .cleanDIANNCleanAndFilterData(dn_input, quantificationColumn) + + # Rename columns + dn_input <- .cleanDIANNRenameColumns(dn_input, quantificationColumn) + + .logSuccess("DIANN", "clean") + dn_input } + +#' Process quantification columns for DIANN 2.0 format +#' @param dn_input data.table input +#' @param quantificationColumn quantification column name +#' @return updated quantification column name +#' @noRd +.cleanDIANNProcessQuantificationColumns <- function(dn_input, quantificationColumn) { + if (quantificationColumn == "auto") { + fragment_columns <- grep("^Fr[0-9]+Quantity$", names(dn_input), value = TRUE) + if (length(fragment_columns) == 0) { + stop("No fragment quantification columns found. Please check your input.") + } + dn_input[, FragmentQuantCorrected := do.call(paste, c(.SD, sep = ";")), + .SDcols = fragment_columns] + quantificationColumn <- "FragmentQuantCorrected" + } + return(quantificationColumn) +} + +#' Add missing required columns +#' @param dn_input data.table input +#' @return data.table with missing columns added +#' @noRd +.cleanDIANNAddMissingColumns <- function(dn_input) { + if (!is.element("PrecursorMz", colnames(dn_input))) { + dn_input[, PrecursorMz := NA] + } + if (!is.element('FragmentInfo', colnames(dn_input))) { + dn_input[, FragmentInfo := NA] + } + return(dn_input) +} + +#' Select required columns based on MBR setting +#' @param dn_input data.table input +#' @param quantificationColumn quantification column name +#' @param MBR logical indicating if match between runs was used +#' @return data.table with selected columns +#' @noRd +.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR) { + base_cols <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', + 'PrecursorCharge', quantificationColumn, 'QValue', + 'PrecursorMz', 'FragmentInfo', 'Run') + + mbr_cols <- if (MBR) { + c('LibQValue', 'LibPGQValue') + } else { + c('GlobalQValue', 'GlobalPGQValue') + } + + req_cols <- c(base_cols, mbr_cols) + return(dn_input[, req_cols, with = FALSE]) +} + +#' Split concatenated values in quantification and fragment info columns +#' @param dn_input data.table input +#' @param quantificationColumn quantification column name +#' @return data.table with split values +#' @noRd +.cleanDIANNSplitConcatenatedValues <- function(dn_input, quantificationColumn) { + split_cols <- c(quantificationColumn, "FragmentInfo") + by_cols <- setdiff(colnames(dn_input), split_cols) + + dn_input <- dn_input[, lapply(.SD, function(x) unlist(tstrsplit(x, ";"))), + .SDcols = split_cols, + by = by_cols] + return(dn_input) +} + +#' Process fragment information and add derived columns +#' @param dn_input data.table input +#' @param quantificationColumn quantification column name +#' @return data.table with processed fragment info +#' @noRd +.cleanDIANNProcessFragmentInfo <- function(dn_input, quantificationColumn) { + # Generate fragment info if missing + if (all(is.na(dn_input[["FragmentInfo"]]))) { + dn_input[, FragmentInfo := paste0("Frag", 1:.N), + by = c("ProteinNames", "ModifiedSequence", "PrecursorCharge", "Run")] + } + + # Convert quantification column to numeric + dn_input[, (quantificationColumn) := lapply(.SD, as.numeric), + .SDcols = quantificationColumn] + + # Process fragment ion information + dn_input[, FragmentIon := sub('\\^\\.\\*', '', FragmentInfo)] + + # Extract product charge + if (any(grepl("/", dn_input$FragmentInfo))) { + dn_input[, ProductCharge := .cleanDIANNExtractProductCharge(FragmentInfo), by = FragmentInfo] + } else { + dn_input[, ProductCharge := 1] + } + + return(dn_input) +} + +#' Extract product charge from fragment info +#' @param fragment_info fragment information string +#' @return numeric product charge +#' @noRd +.cleanDIANNExtractProductCharge <- function(fragment_info) { + charge_part <- unlist(strsplit(fragment_info, split = "/"))[[1]] + return(strtoi(sub("\\.\\*\\^", "", charge_part))) +} + +#' Clean and filter data by removing unwanted fragments and NA values +#' @param dn_input data.table input +#' @param quantificationColumn quantification column name +#' @return cleaned data.table +#' @noRd +.cleanDIANNCleanAndFilterData <- function(dn_input, quantificationColumn) { + # Remove NH3 and H2O loss fragments + dn_input <- dn_input[!grepl("NH3", FragmentIon)] + dn_input <- dn_input[!grepl("H2O", FragmentIon)] + + # Remove rows with NA in quantification column + dn_input <- na.omit(dn_input, cols = quantificationColumn) + + return(dn_input) +} + +#' Rename columns to standardized names +#' @param dn_input data.table input +#' @param quantificationColumn quantification column name +#' @return data.table with renamed columns +#' @noRd +.cleanDIANNRenameColumns <- function(dn_input, quantificationColumn) { + old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', + 'PrecursorCharge', quantificationColumn, 'QValue', + 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') + + new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence', + 'PrecursorCharge', 'Intensity', 'DetectionQValue', + 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') + + data.table::setnames(dn_input, old = old_names, new = new_names, skip_absent = TRUE) + + # Clean up peptide sequence columns + dn_input[, PeptideSequence := NULL] + setnames(dn_input, "PeptideModifiedSequence", "PeptideSequence") + + return(dn_input) +} \ No newline at end of file diff --git a/R/converters_DIANNtoMSstatsFormat.R b/R/converters_DIANNtoMSstatsFormat.R index df3c4bd7..56d1bf0d 100644 --- a/R/converters_DIANNtoMSstatsFormat.R +++ b/R/converters_DIANNtoMSstatsFormat.R @@ -20,7 +20,9 @@ #' @param removeFewMeasurements should proteins with few measurements be removed #' @param removeOxidationMpeptides should peptides with oxidation be removed #' @param removeProtein_with1Feature should proteins with a single feature be removed -#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities. 'FragmentQuantRaw' can be used instead. +#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. +#' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity. #' @param ... additional parameters to `data.table::fread`. #' #' @return data.frame in the MSstats required format. @@ -30,7 +32,6 @@ #' @export #' #' @examples -#' # See https://github.com/vdemichev/DiaNN/discussions/1525 for workaround for DIANN 2.0 #' input_file_path = system.file("tinytest/raw_data/DIANN/diann_input.tsv", #' package="MSstatsConvert") #' annotation_file_path = system.file("tinytest/raw_data/DIANN/annotation.csv", @@ -40,6 +41,17 @@ #' output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, #' use_log_file = FALSE) #' head(output) +#' +#' # For DIANN 2.0, set quantificationColumn = 'auto' +#' input_file_path_2_0 = system.file("tinytest/raw_data/DIANN/diann_2.0.parquet", +#' package="MSstatsConvert") +#' annotation_file_path_2_0 = system.file("tinytest/raw_data/DIANN/annotation_diann_2.0.csv", +#' package = "MSstatsConvert") +#' input_2_0 = arrow::read_parquet(input_file_path_2_0) +#' annot_2_0 = data.table::fread(annotation_file_path_2_0) +#' output_2_0 = DIANNtoMSstatsFormat(input_2_0, annotation = annot_2_0, MBR = FALSE, +#' use_log_file = FALSE, quantificationColumn = 'auto') +#' head(output_2_0) DIANNtoMSstatsFormat = function(input, annotation = NULL, global_qvalue_cutoff = 0.01, qvalue_cutoff = 0.01, diff --git a/inst/tinytest/raw_data/DIANN/annotation_diann_2.0.csv b/inst/tinytest/raw_data/DIANN/annotation_diann_2.0.csv new file mode 100644 index 00000000..d876c3bd --- /dev/null +++ b/inst/tinytest/raw_data/DIANN/annotation_diann_2.0.csv @@ -0,0 +1,9 @@ +Run,BioReplicate,Condition +Run1,1,Control +Run2,2,Control +Run3,3,Control +Run4,4,Control +Run5,5,Treatment +Run6,6,Treatment +Run7,7,Treatment +Run8,8,Treatment diff --git a/inst/tinytest/raw_data/DIANN/diann_2.0.parquet b/inst/tinytest/raw_data/DIANN/diann_2.0.parquet new file mode 100644 index 00000000..7423eddd Binary files /dev/null and b/inst/tinytest/raw_data/DIANN/diann_2.0.parquet differ diff --git a/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R b/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R index d54011ff..94e3b617 100644 --- a/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R +++ b/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R @@ -16,4 +16,24 @@ expect_true("ProductCharge" %in% colnames(output)) expect_true("IsotopeLabelType" %in% colnames(output)) expect_true("Condition" %in% colnames(output)) expect_true("BioReplicate" %in% colnames(output)) +expect_true("Fraction" %in% colnames(output)) + +# Test DIANNtoMSstatsFormat DIANN 2.0 ------------------------ +input_file_path = system.file("tinytest/raw_data/DIANN/diann_2.0.parquet", package="MSstatsConvert") +annotation_file_path = system.file("tinytest/raw_data/DIANN/annotation_diann_2.0.csv", package = "MSstatsConvert") +input = arrow::read_parquet(input_file_path) +annot = data.table::fread(annotation_file_path) +output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, use_log_file = FALSE, quantificationColumn = 'auto') +expect_equal(ncol(output), 11) +expect_equal(nrow(output), 180) +expect_true("Run" %in% colnames(output)) +expect_true("ProteinName" %in% colnames(output)) +expect_true("PeptideSequence" %in% colnames(output)) +expect_true("PrecursorCharge" %in% colnames(output)) +expect_true("Intensity" %in% colnames(output)) +expect_true("FragmentIon" %in% colnames(output)) +expect_true("ProductCharge" %in% colnames(output)) +expect_true("IsotopeLabelType" %in% colnames(output)) +expect_true("Condition" %in% colnames(output)) +expect_true("BioReplicate" %in% colnames(output)) expect_true("Fraction" %in% colnames(output)) \ No newline at end of file diff --git a/man/DIANNtoMSstatsFormat.Rd b/man/DIANNtoMSstatsFormat.Rd index f7c2a68c..269db0a3 100644 --- a/man/DIANNtoMSstatsFormat.Rd +++ b/man/DIANNtoMSstatsFormat.Rd @@ -67,7 +67,9 @@ If \code{append = TRUE}, has to be a valid path to a file.} \item{MBR}{True if analysis was done with match between runs} -\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities. 'FragmentQuantRaw' can be used instead.} +\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. +Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.} \item{...}{additional parameters to \code{data.table::fread}.} } @@ -78,7 +80,6 @@ data.frame in the MSstats required format. Import Diann files } \examples{ -# See https://github.com/vdemichev/DiaNN/discussions/1525 for workaround for DIANN 2.0 input_file_path = system.file("tinytest/raw_data/DIANN/diann_input.tsv", package="MSstatsConvert") annotation_file_path = system.file("tinytest/raw_data/DIANN/annotation.csv", @@ -88,6 +89,17 @@ annot = data.table::fread(annotation_file_path) output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, use_log_file = FALSE) head(output) + +# For DIANN 2.0, set quantificationColumn = 'auto' +input_file_path_2_0 = system.file("tinytest/raw_data/DIANN/diann_2.0.parquet", + package="MSstatsConvert") +annotation_file_path_2_0 = system.file("tinytest/raw_data/DIANN/annotation_diann_2.0.csv", + package = "MSstatsConvert") +input_2_0 = arrow::read_parquet(input_file_path_2_0) +annot_2_0 = data.table::fread(annotation_file_path_2_0) +output_2_0 = DIANNtoMSstatsFormat(input_2_0, annotation = annot_2_0, MBR = FALSE, + use_log_file = FALSE, quantificationColumn = 'auto') +head(output_2_0) } \author{ Elijah Willie diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd index 93ce567e..dbe04478 100644 --- a/man/MSstatsClean.Rd +++ b/man/MSstatsClean.Rd @@ -116,7 +116,9 @@ removed based on the IsUnique column from Philosopher output} \item{MBR}{True if analysis was done with match between runs} -\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities. 'FragmentQuantRaw' can be used instead.} +\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. +Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.} \item{qvalue_cutoff}{The q-value cutoff for filtering peaks detected by MBR} } diff --git a/man/dot-cleanRawDIANN.Rd b/man/dot-cleanRawDIANN.Rd index 109d3de9..118195e7 100644 --- a/man/dot-cleanRawDIANN.Rd +++ b/man/dot-cleanRawDIANN.Rd @@ -15,7 +15,9 @@ \item{MBR}{True if analysis was done with match between runs} -\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities. 'FragmentQuantRaw' can be used instead.} +\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. +Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.} } \value{ data.table