Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ Suggests:
tinytest,
covr,
knitr,
rmarkdown
rmarkdown,
arrow
Collate:
'clean_ProteinProspector.R'
'clean_Metamorpheus.R'
Expand Down
231 changes: 176 additions & 55 deletions R/clean_DIANN.R
Original file line number Diff line number Diff line change
@@ -1,62 +1,183 @@
#' Clean raw Diann files
#' @param msstats_object an object of class `MSstatsDIANNFiles`.
#' @param MBR True if analysis was done with match between runs
#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities. 'FragmentQuantRaw' can be used instead.
#' @param quantificationColumn Name of the column holding the quantified fragment intensities.
#' Use 'FragmentQuantCorrected' (default) for DIANN 1.8.x and 'FragmentQuantRaw' for DIANN 1.9.x.
#' Use 'auto' for DIANN 2.x, where each fragment intensity is stored in a separate column, e.g. Fr0Quantity.
#' @return data.table
#' @importFrom stats na.omit
#' @keywords internal
.cleanRawDIANN = function(msstats_object, MBR = TRUE,
quantificationColumn = "FragmentQuantCorrected") {
dn_input = getInputFile(msstats_object, "input")
dn_input = data.table::as.data.table(dn_input)

if (!is.element("PrecursorMz", colnames(dn_input))) {
dn_input[, PrecursorMz := NA]
}
if (!is.element('FragmentInfo', colnames(dn_input))) {
dn_input[, FragmentInfo := NA]
}
req_cols = c('ProteinNames', 'StrippedSequence',
'ModifiedSequence', 'PrecursorCharge',
quantificationColumn, 'QValue',
'PrecursorMz', 'FragmentInfo', 'Run')
if (MBR) {
req_cols = c(req_cols, c('LibQValue', 'LibPGQValue'))
} else{
req_cols = c(req_cols, c('GlobalQValue', 'GlobalPGQValue'))
}
dn_input = dn_input[, req_cols, with = FALSE]
dn_input = dn_input[, lapply(.SD, function(x) unlist(tstrsplit(x, ";"))),
.SDcols = c(quantificationColumn, "FragmentInfo"),
by = setdiff(colnames(dn_input), c("FragmentInfo", quantificationColumn))]
if (all(is.na(dn_input[["FragmentInfo"]]))) {
dn_input[, FragmentInfo := paste0("Frag", 1:.N),
by = c("ProteinNames", "ModifiedSequence", "PrecursorCharge", "Run")]
}
dn_input[, (quantificationColumn) := lapply(.SD, as.numeric), .SDcols = quantificationColumn]
dn_input[, FragmentIon := sub('\\^\\.\\*', '', FragmentInfo)]
if (any(grepl("/", dn_input$FragmentInfo))) {
dn_input[, ProductCharge := unlist(strsplit(FragmentInfo, split = "/"))[[1]], by = FragmentInfo]
dn_input[, ProductCharge := strtoi(sub("\\.\\*\\^", "", ProductCharge))]
} else {
dn_input[, ProductCharge := 1]
}
dn_input = dn_input[!grepl("NH3", FragmentIon), ]
dn_input = dn_input[!grepl("H2O", FragmentIon), ]
dn_input = na.omit(dn_input, cols = quantificationColumn)
data.table::setnames(dn_input, old = c('ProteinNames', 'StrippedSequence',
'ModifiedSequence','PrecursorCharge',
quantificationColumn, 'QValue',
'PrecursorMz', 'FragmentIon','Run',
'ProductCharge'),
new = c('ProteinName', 'PeptideSequence',
'PeptideModifiedSequence','PrecursorCharge',
'Intensity', 'DetectionQValue',
'PrecursorMz', 'FragmentIon','Run',
'ProductCharge'),
skip_absent = TRUE)
dn_input[, PeptideSequence := NULL]
setnames(dn_input, "PeptideModifiedSequence", "PeptideSequence")
.logSuccess("DIANN", "clean")
dn_input
.cleanRawDIANN <- function(msstats_object, MBR = TRUE,
                           quantificationColumn = "FragmentQuantCorrected") {
  # Fetch the "input" table from the MSstats object as a data.table.
  diann_dt <- data.table::as.data.table(getInputFile(msstats_object, "input"))

  # Resolve the intensity column first: every later step is keyed on the
  # column name this helper returns.
  quantificationColumn <- .cleanDIANNProcessQuantificationColumns(
    diann_dt, quantificationColumn
  )

  # Each step consumes the table produced by the previous one:
  # add placeholder columns -> keep required columns -> split ";" lists ->
  # derive fragment columns -> filter rows -> rename to standard names.
  diann_dt <- .cleanDIANNAddMissingColumns(diann_dt)
  diann_dt <- .cleanDIANNSelectRequiredColumns(
    diann_dt, quantificationColumn, MBR
  )
  diann_dt <- .cleanDIANNSplitConcatenatedValues(diann_dt, quantificationColumn)
  diann_dt <- .cleanDIANNProcessFragmentInfo(diann_dt, quantificationColumn)
  diann_dt <- .cleanDIANNCleanAndFilterData(diann_dt, quantificationColumn)
  diann_dt <- .cleanDIANNRenameColumns(diann_dt, quantificationColumn)

  .logSuccess("DIANN", "clean")
  diann_dt
}

#' Resolve the quantification column, building it when "auto" is requested
#'
#' When `quantificationColumn` is "auto", every column whose name matches
#' `^Fr[0-9]+Quantity$` is pasted together row-wise with ";" into a
#' `FragmentQuantCorrected` column that is added to `dn_input` by reference.
#' Any other value is returned as-is and `dn_input` is left untouched.
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name, or "auto"
#' @return name of the column that holds the quantification values
#' @noRd
.cleanDIANNProcessQuantificationColumns <- function(dn_input, quantificationColumn) {
  # Nothing to build unless the caller asked for automatic detection.
  if (quantificationColumn != "auto") {
    return(quantificationColumn)
  }

  is_fragment_col <- grepl("^Fr[0-9]+Quantity$", names(dn_input))
  if (!any(is_fragment_col)) {
    stop("No fragment quantification columns found. Please check your input.")
  }

  # Collapse the per-fragment columns into one ";"-separated string per row.
  fragment_cols <- names(dn_input)[is_fragment_col]
  dn_input[, FragmentQuantCorrected := do.call(paste, c(.SD, sep = ";")),
           .SDcols = fragment_cols]
  "FragmentQuantCorrected"
}

#' Add placeholder columns that later steps expect
#'
#' `PrecursorMz` and `FragmentInfo` are created (filled with `NA`) when they
#' are not already present; existing columns are never modified.
#' @param dn_input data.table input
#' @return `dn_input`, with any absent placeholder column added by reference
#' @noRd
.cleanDIANNAddMissingColumns <- function(dn_input) {
  optional_cols <- c("PrecursorMz", "FragmentInfo")
  for (col in setdiff(optional_cols, colnames(dn_input))) {
    dn_input[, (col) := NA]
  }
  dn_input
}

#' Keep only the columns needed by the later cleaning steps
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @param MBR logical; `TRUE` keeps `LibQValue`/`LibPGQValue`, `FALSE` keeps
#'   `GlobalQValue`/`GlobalPGQValue`
#' @return data.table restricted to the required columns, in the order listed
#' @noRd
.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR) {
  # The q-value pair depends on whether match-between-runs was used.
  qvalue_cols <- c("GlobalQValue", "GlobalPGQValue")
  if (MBR) {
    qvalue_cols <- c("LibQValue", "LibPGQValue")
  }

  keep <- c(
    "ProteinNames", "StrippedSequence", "ModifiedSequence",
    "PrecursorCharge", quantificationColumn, "QValue",
    "PrecursorMz", "FragmentInfo", "Run",
    qvalue_cols
  )
  dn_input[, keep, with = FALSE]
}

#' Expand ";"-separated quantification and fragment info values
#'
#' The quantification column and `FragmentInfo` are split on ";" within
#' groups defined by all remaining columns.
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return data.table with the split values
#' @noRd
.cleanDIANNSplitConcatenatedValues <- function(dn_input, quantificationColumn) {
  packed_cols <- c(quantificationColumn, "FragmentInfo")
  # Every other column identifies the group the split values belong to.
  group_cols <- setdiff(colnames(dn_input), packed_cols)

  dn_input[, lapply(.SD, function(values) unlist(tstrsplit(values, ";"))),
           by = group_cols,
           .SDcols = packed_cols]
}

#' Derive fragment-level columns from `FragmentInfo`
#'
#' Converts the quantification column to numeric, fills in placeholder
#' fragment labels when `FragmentInfo` is entirely `NA`, and adds the
#' `FragmentIon` and `ProductCharge` columns. All changes are made by
#' reference.
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return data.table with processed fragment info
#' @noRd
.cleanDIANNProcessFragmentInfo <- function(dn_input, quantificationColumn) {
  # Coerce the quantification column to numeric in place.
  dn_input[, (quantificationColumn) := lapply(.SD, as.numeric),
           .SDcols = quantificationColumn]

  # Without any fragment info, number the rows within each
  # protein / modified sequence / precursor charge / run group instead.
  if (all(is.na(dn_input[["FragmentInfo"]]))) {
    precursor_run_key <- c("ProteinNames", "ModifiedSequence",
                           "PrecursorCharge", "Run")
    dn_input[, FragmentInfo := paste0("Frag", 1:.N), by = precursor_run_key]
  }

  # NOTE(review): every metacharacter in this pattern is escaped, so it only
  # removes the literal text "^.*" -- confirm that this is intended.
  dn_input[, FragmentIon := sub("\\^\\.\\*", "", FragmentInfo)]

  # The charge is parsed only when some label contains a "/"; otherwise
  # every fragment is given charge 1.
  has_slash <- any(grepl("/", dn_input[["FragmentInfo"]]))
  if (!has_slash) {
    dn_input[, ProductCharge := 1]
  } else {
    dn_input[, ProductCharge := .cleanDIANNExtractProductCharge(FragmentInfo),
             by = FragmentInfo]
  }

  dn_input
}

#' Extract product charge from fragment info
#'
#' Takes the text before the first "/" of the first element of
#' `fragment_info`, drops everything up to and including the last "^", and
#' parses what is left as a base-10 integer. For example, a value shaped like
#' `"y7^2/832.4"` yields `2L`.
#' @param fragment_info character vector of fragment information strings;
#'   only its first element is used
#' @return a single integer product charge, or `NA_integer_` when the input is
#'   empty or no integer follows the caret
#' @noRd
.cleanDIANNExtractProductCharge <- function(fragment_info) {
  pieces <- unlist(strsplit(fragment_info, split = "/", fixed = TRUE))
  if (length(pieces) == 0L) {
    # Empty input (or an empty string) carries no charge information.
    return(NA_integer_)
  }

  # Only the caret is escaped: ".*" must stay a wildcard so that the ion
  # label in front of the charge is removed. Escaping "." and "*" as well
  # would match only the literal text ".*^" and leave the charge unparsed.
  strtoi(sub(".*\\^", "", pieces[[1]]), base = 10L)
}

#' Drop unwanted fragments and rows without a quantification value
#'
#' Rows whose `FragmentIon` contains "NH3" or "H2O" are removed, followed by
#' rows that are `NA` in the quantification column.
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return filtered data.table
#' @noRd
.cleanDIANNCleanAndFilterData <- function(dn_input, quantificationColumn) {
  # A fragment is kept only if its label mentions neither NH3 nor H2O.
  kept <- dn_input[!(grepl("NH3", FragmentIon) | grepl("H2O", FragmentIon))]

  # Missing values in other columns do not cause a row to be dropped.
  na.omit(kept, cols = quantificationColumn)
}

#' Rename columns to the standardized names
#'
#' Columns absent from `dn_input` are skipped. After renaming, the stripped
#' sequence column is dropped and the modified sequence becomes
#' `PeptideSequence`.
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name (renamed to
#'   `Intensity`)
#' @return data.table with renamed columns
#' @noRd
.cleanDIANNRenameColumns <- function(dn_input, quantificationColumn) {
  # One row per rename: first entry is the current name, second the new one.
  rename_map <- rbind(
    c("ProteinNames", "ProteinName"),
    c("StrippedSequence", "PeptideSequence"),
    c("ModifiedSequence", "PeptideModifiedSequence"),
    c("PrecursorCharge", "PrecursorCharge"),
    c(quantificationColumn, "Intensity"),
    c("QValue", "DetectionQValue"),
    c("PrecursorMz", "PrecursorMz"),
    c("FragmentIon", "FragmentIon"),
    c("Run", "Run"),
    c("ProductCharge", "ProductCharge")
  )
  data.table::setnames(dn_input,
                       old = rename_map[, 1],
                       new = rename_map[, 2],
                       skip_absent = TRUE)

  # Discard the stripped sequence so the modified sequence can take its name.
  dn_input[, PeptideSequence := NULL]
  data.table::setnames(dn_input, "PeptideModifiedSequence", "PeptideSequence")

  dn_input
}
16 changes: 14 additions & 2 deletions R/converters_DIANNtoMSstatsFormat.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
#' @param removeFewMeasurements should proteins with few measurements be removed
#' @param removeOxidationMpeptides should peptides with oxidation be removed
#' @param removeProtein_with1Feature should proteins with a single feature be removed
#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities. 'FragmentQuantRaw' can be used instead.
#' @param quantificationColumn Name of the column holding the quantified fragment intensities.
#' Use 'FragmentQuantCorrected' (default) for DIANN 1.8.x and 'FragmentQuantRaw' for DIANN 1.9.x.
#' Use 'auto' for DIANN 2.x, where each fragment intensity is stored in a separate column, e.g. Fr0Quantity.
#' @param ... additional parameters to `data.table::fread`.
#'
#' @return data.frame in the MSstats required format.
Expand All @@ -30,7 +32,6 @@
#' @export
#'
#' @examples
#' # See https://github.com/vdemichev/DiaNN/discussions/1525 for workaround for DIANN 2.0
#' input_file_path = system.file("tinytest/raw_data/DIANN/diann_input.tsv",
#' package="MSstatsConvert")
#' annotation_file_path = system.file("tinytest/raw_data/DIANN/annotation.csv",
Expand All @@ -40,6 +41,17 @@
#' output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE,
#' use_log_file = FALSE)
#' head(output)
#'
#' # For DIANN 2.0, set quantificationColumn = 'auto'
#' input_file_path_2_0 = system.file("tinytest/raw_data/DIANN/diann_2.0.parquet",
#' package="MSstatsConvert")
#' annotation_file_path_2_0 = system.file("tinytest/raw_data/DIANN/annotation_diann_2.0.csv",
#' package = "MSstatsConvert")
#' input_2_0 = arrow::read_parquet(input_file_path_2_0)
#' annot_2_0 = data.table::fread(annotation_file_path_2_0)
#' output_2_0 = DIANNtoMSstatsFormat(input_2_0, annotation = annot_2_0, MBR = FALSE,
#' use_log_file = FALSE, quantificationColumn = 'auto')
#' head(output_2_0)
DIANNtoMSstatsFormat = function(input, annotation = NULL,
global_qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
Expand Down
9 changes: 9 additions & 0 deletions inst/tinytest/raw_data/DIANN/annotation_diann_2.0.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Run,BioReplicate,Condition
Run1,1,Control
Run2,2,Control
Run3,3,Control
Run4,4,Control
Run5,5,Treatment
Run6,6,Treatment
Run7,7,Treatment
Run8,8,Treatment
Binary file added inst/tinytest/raw_data/DIANN/diann_2.0.parquet
Binary file not shown.
20 changes: 20 additions & 0 deletions inst/tinytest/test_converters_DIANNtoMSstatsFormat.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,24 @@ expect_true("ProductCharge" %in% colnames(output))
expect_true("IsotopeLabelType" %in% colnames(output))
expect_true("Condition" %in% colnames(output))
expect_true("BioReplicate" %in% colnames(output))
expect_true("Fraction" %in% colnames(output))

# Test DIANNtoMSstatsFormat DIANN 2.0 ------------------------
# The parquet fixture is read with arrow, which is only a suggested
# dependency, so this section runs only when arrow is installed instead of
# erroring on machines without it.
if (requireNamespace("arrow", quietly = TRUE)) {
  input_file_path = system.file("tinytest/raw_data/DIANN/diann_2.0.parquet", package="MSstatsConvert")
  annotation_file_path = system.file("tinytest/raw_data/DIANN/annotation_diann_2.0.csv", package = "MSstatsConvert")
  input = arrow::read_parquet(input_file_path)
  annot = data.table::fread(annotation_file_path)
  output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, use_log_file = FALSE, quantificationColumn = 'auto')
  expect_equal(ncol(output), 11)
  expect_equal(nrow(output), 180)
  expect_true("Run" %in% colnames(output))
  expect_true("ProteinName" %in% colnames(output))
  expect_true("PeptideSequence" %in% colnames(output))
  expect_true("PrecursorCharge" %in% colnames(output))
  expect_true("Intensity" %in% colnames(output))
  expect_true("FragmentIon" %in% colnames(output))
  expect_true("ProductCharge" %in% colnames(output))
  expect_true("IsotopeLabelType" %in% colnames(output))
  expect_true("Condition" %in% colnames(output))
  expect_true("BioReplicate" %in% colnames(output))
  expect_true("Fraction" %in% colnames(output))
}
16 changes: 14 additions & 2 deletions man/DIANNtoMSstatsFormat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/MSstatsClean.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/dot-cleanRawDIANN.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading