From d0e71eb4e9268d996e6170f4b2a5786c7f7da476 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 25 Aug 2025 13:09:09 +0200 Subject: [PATCH 01/28] Added argparse to AssignToBins --- DIMS/AssignToBins.R | 22 ++++++++++++++-------- DIMS/AssignToBins.nf | 6 ++++-- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index d1c17ba7..88801264 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -1,19 +1,25 @@ # load required packages +library("argparse") suppressPackageStartupMessages(library("xcms")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "AssignToBins") -mzml_filepath <- cmd_args[1] -breaks_filepath <- cmd_args[2] -resol <- as.numeric(cmd_args[3]) +parser$add_argument("--mzML_filepath", dest = "mzml_filepath", + help = "File path for the mzML file", required = TRUE) +parser$add_argument("--breaks_filepath", dest = "breaks_filepath", + help = "File path for the breaks RData file", required = TRUE) + +args <- parser$parse_args() + +mzml_filepath <- args$mzml_filepath +breaks_filepath <- args$breaks_filepath # load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, # trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos -load(breaks_filepath) +load(args$breaks_filepath) # get sample name -techrep_name <- sub("\\..*$", "", basename(mzml_filepath)) +techrep_name <- sub("\\..*$", "", basename(args$mzml_filepath)) options(digits = 16) @@ -25,7 +31,7 @@ pos_bins <- bins neg_bins <- bins # read in the data for 1 sample -raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath)) +raw_data <- suppressMessages(xcms::xcmsRaw(args$mzml_filepath)) # Generate a matrix with retention times and intensities raw_data_matrix <- xcms::rawMat(raw_data) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index d0a7098b..df3b089a 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -1,7 +1,7 @@ process AssignToBins { tag "DIMS AssignToBins 
${file_id}" label 'AssignToBins' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -13,7 +13,9 @@ process AssignToBins { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R \\ + --mzML_filepath $mzML_file \\ + --breaks_filepath $breaks_file """ } From d5bac97e77696a51a36991b78fee3a35272280ed Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 25 Aug 2025 13:12:57 +0200 Subject: [PATCH 02/28] Removed unused arguments --- DIMS/AssignToBins.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 88801264..66cc05ee 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -11,9 +11,6 @@ parser$add_argument("--breaks_filepath", dest = "breaks_filepath", args <- parser$parse_args() -mzml_filepath <- args$mzml_filepath -breaks_filepath <- args$breaks_filepath - # load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, # trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos load(args$breaks_filepath) From 97d5626fab955e63f988bfacd12eea5bf6ba73e8 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 15:49:24 +0200 Subject: [PATCH 03/28] Fixed linting issues --- DIMS/AssignToBins.R | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 66cc05ee..824bc0ce 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -43,13 +43,13 @@ neg_times_trimmed <- neg_times[neg_times > trim_left_neg & neg_times < trim_righ # get TIC intensities for areas between trim_left and trim_right tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic) colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity") -tic_intensity_pos <- tic_intensity_persample[tic_intensity_persample[ , 
"retention_time"] > min(pos_times_trimmed) & - tic_intensity_persample[ , "retention_time"] < max(pos_times_trimmed), ] -tic_intensity_neg <- tic_intensity_persample[tic_intensity_persample[ , "retention_time"] > min(neg_times_trimmed) & - tic_intensity_persample[ , "retention_time"] < max(neg_times_trimmed), ] +tic_intensity_pos <- tic_intensity_persample[tic_intensity_persample[, "retention_time"] > min(pos_times_trimmed) & + tic_intensity_persample[, "retention_time"] < max(pos_times_trimmed), ] +tic_intensity_neg <- tic_intensity_persample[tic_intensity_persample[, "retention_time"] > min(neg_times_trimmed) & + tic_intensity_persample[, "retention_time"] < max(neg_times_trimmed), ] # calculate weighted mean of intensities for pos and neg separately -mean_pos <- weighted.mean(tic_intensity_pos[ , "tic_intensity"], tic_intensity_pos[ , "tic_intensity"]) -mean_neg <- weighted.mean(tic_intensity_neg[ , "tic_intensity"], tic_intensity_neg[ , "tic_intensity"]) +mean_pos <- weighted.mean(tic_intensity_pos[, "tic_intensity"], tic_intensity_pos[, "tic_intensity"]) +mean_neg <- weighted.mean(tic_intensity_neg[, "tic_intensity"], tic_intensity_neg[, "tic_intensity"]) # intensity per scan should be at least 80% of weighted mean dims_thresh_pos <- 0.8 * mean_pos dims_thresh_neg <- 0.8 * mean_neg @@ -69,17 +69,17 @@ neg_raw_data_matrix <- raw_data_matrix[neg_index, ] # Get index for binning intensity values bin_indices_pos <- cut( - pos_raw_data_matrix[, "mz"], + pos_raw_data_matrix[, "mz"], breaks_fwhm, - include.lowest = TRUE, - right = TRUE, + include.lowest = TRUE, + right = TRUE, labels = FALSE ) bin_indices_neg <- cut( - neg_raw_data_matrix[, "mz"], - breaks_fwhm, - include.lowest = TRUE, - right = TRUE, + neg_raw_data_matrix[, "mz"], + breaks_fwhm, + include.lowest = TRUE, + right = TRUE, labels = FALSE ) @@ -90,8 +90,8 @@ if (nrow(pos_raw_data_matrix) > 0) { pos_raw_data_matrix[is.na(pos_raw_data_matrix[, "intensity"]), "intensity"] <- 0 # aggregate intensities, 
calculate mean, use only values above dims_thresh_pos aggr_int_pos <- stats::aggregate(pos_raw_data_matrix[, "intensity"], - list(bin_indices_pos), - FUN = function(x) { mean(x[which(x > dims_thresh_pos)]) }) + list(bin_indices_pos), + FUN = function(x) {mean(x[which(x > dims_thresh_pos)])}) # set NA to zero in second column aggr_int_pos[is.na(aggr_int_pos[, 2]), 2] <- 0 pos_bins[aggr_int_pos[, 1]] <- aggr_int_pos[, 2] @@ -101,8 +101,8 @@ if (nrow(neg_raw_data_matrix) > 0) { neg_raw_data_matrix[is.na(neg_raw_data_matrix[, "intensity"]), "intensity"] <- 0 # aggregate intensities, calculate mean, use only values above dims_thresh_neg aggr_int_neg <- stats::aggregate(neg_raw_data_matrix[, "intensity"], - list(bin_indices_neg), - FUN = function(x) { mean(x[which(x > dims_thresh_neg)]) }) + list(bin_indices_neg), + FUN = function(x) {mean(x[which(x > dims_thresh_neg)])}) # set NA to zero in second column aggr_int_neg[is.na(aggr_int_neg[, 2]), 2] <- 0 neg_bins[aggr_int_neg[, 1]] <- aggr_int_neg[, 2] From 8a250379adffdeef04f9925354434949478d44c0 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 15:52:13 +0200 Subject: [PATCH 04/28] Added argparse to AverageTechReplicates and fixed linting issues --- DIMS/AverageTechReplicates.R | 83 ++++++++++++++++++++--------------- DIMS/AverageTechReplicates.nf | 15 ++++--- 2 files changed, 55 insertions(+), 43 deletions(-) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 8e13e634..bfacf70e 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -1,28 +1,37 @@ # adapted from 3-AverageTechReplicates.R # load packages +library("argparse") library("ggplot2") library("gridExtra") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "AverageTechReplicates") -init_file <- cmd_args[1] -nr_replicates <- as.numeric(cmd_args[2]) -run_name <- cmd_args[3] -dims_matrix <- cmd_args[4] -highest_mz_file <- cmd_args[5] -highest_mz <- 
get(load(highest_mz_file)) -breaks_filepath <- cmd_args[6] +parser$add_argument("--init_filepath", dest = "init_file", + help = "File path for the init RData file", required = TRUE) +parser$add_argument("-n", "--nr_replicates", dest = "nr_replicates", type = "integer", + help = "Number of replicates", required = TRUE) +parser$add_argument("--run_name", dest = "run_name", + help = "The run name/analysis ID", required = TRUE) +parser$add_argument("--matrix", dest = "dims_matrix", + help = "The matrix used, e.g. Plasma, Research, ...") +parser$add_argument("--highest_mz_file", dest = "highest_mz_file", + help = "File path for the highest Mz RData file", required = TRUE) +parser$add_argument("--breaks_filepath", dest = "breaks_filepath", + help = "File path for the breaks RData file", required = TRUE) + +args <- parser$parse_args() + +highest_mz <- get(load(args$highest_mz_file)) thresh2remove <- 1000000000 remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { # collect list of samples to remove from replication pattern remove_from_group <- NULL - for (sample_nr in 1:length(repl_pattern)){ + for (sample_nr in seq_along(repl_pattern)){ repl_pattern_1sample <- repl_pattern[[sample_nr]] remove <- NULL - for (file_nr in 1:length(repl_pattern_1sample)) { + for (file_nr in seq_along(repl_pattern_1sample)) { if (repl_pattern_1sample[file_nr] %in% bad_samples) { remove <- c(remove, file_nr) } @@ -41,14 +50,14 @@ remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { } # load init_file: contains repl_pattern -load(init_file) +load(args$init_file) # load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, # trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos -load(breaks_filepath) +load(args$breaks_filepath) # lower the threshold below which a sample will be removed for DBS and for high m/z -if (dims_matrix == "DBS") { +if (args$dims_matrix == "DBS") { thresh2remove <- 500000000 } if (highest_mz > 700) { @@ -60,7 
+69,7 @@ if (highest_mz > 700) { remove_neg <- NULL remove_pos <- NULL cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") -for (sample_nr in 1:length(repl_pattern)) { +for (sample_nr in seq_along(repl_pattern)) { tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) tech_reps_array_pos <- NULL tech_reps_array_neg <- NULL @@ -68,7 +77,7 @@ for (sample_nr in 1:length(repl_pattern)) { sum_pos <- 0 nr_pos <- 0 nr_neg <- 0 - for (file_nr in 1:length(tech_reps)) { + for (file_nr in seq_along(tech_reps)) { load(paste(tech_reps[file_nr], ".RData", sep = "")) cat("\n\nParsing", tech_reps[file_nr]) # negative scanmode @@ -92,7 +101,7 @@ for (sample_nr in 1:length(repl_pattern)) { } tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos) } - # save to file + # save to file if (nr_neg != 0) { sum_neg[, 1] <- sum_neg[, 1] / nr_neg colnames(sum_neg) <- names(repl_pattern)[sample_nr] @@ -105,25 +114,25 @@ for (sample_nr in 1:length(repl_pattern)) { } } -pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) +pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, args$nr_replicates) repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "negative_repl_pattern.RData") write.table( - remove_neg, - file = "miss_infusions_negative.txt", - row.names = FALSE, - col.names = FALSE, + remove_neg, + file = "miss_infusions_negative.txt", + row.names = FALSE, + col.names = FALSE, sep = "\t" ) -pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) +pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, args$nr_replicates) repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "positive_repl_pattern.RData") write.table( - remove_pos, - file = "miss_infusions_positive.txt", - row.names = FALSE, - col.names = FALSE, + remove_pos, + file = "miss_infusions_positive.txt", + row.names = FALSE, + col.names = FALSE, sep = "\t" ) @@ 
-146,10 +155,10 @@ for (file in tic_files) { # create a list with information for all TIC plots tic_plot_list <- list() plot_nr <- 0 -for (sample_nr in c(1:length(repl_pattern))) { +for (sample_nr in seq_along(repl_pattern)) { tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) sample_name <- names(repl_pattern)[sample_nr] - for (file_nr in 1:length(tech_reps)) { + for (file_nr in seq_along(tech_reps)) { plot_nr <- plot_nr + 1 # read file with retention time, intensity and dims_threshold values repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt")) @@ -159,7 +168,7 @@ for (sample_nr in c(1:length(repl_pattern))) { # for replicates with bad TIC, determine what color the border of the plot should be bad_color_pos <- tech_reps[file_nr] %in% remove_pos bad_color_neg <- tech_reps[file_nr] %in% remove_neg - if (bad_color_neg & bad_color_pos) { + if (bad_color_neg && bad_color_pos) { plot_color <- "#F8766D" } else if (bad_color_pos) { plot_color <- "#ED8141" @@ -175,8 +184,10 @@ for (sample_nr in c(1:length(repl_pattern))) { geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) + geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) + geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) + - geom_segment(aes(x = trim_left_pos, y = dims_thresh_pos, xend = trim_right_pos, yend = dims_thresh_pos), colour = "green", lty = 2) + - geom_segment(aes(x = trim_left_neg, y = dims_thresh_neg, xend = trim_right_neg, yend = dims_thresh_neg), colour = "blue", lty = 2) + + geom_segment(aes(x = trim_left_pos, y = dims_thresh_pos, xend = trim_right_pos, yend = dims_thresh_pos), + colour = "green", lty = 2) + + geom_segment(aes(x = trim_left_neg, y = dims_thresh_neg, xend = trim_right_neg, yend = dims_thresh_neg), + colour = "blue", lty = 2) + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], " || ", sample_name)) + theme(plot.background = element_rect(fill = 
plot_color), axis.text = element_text(size = 4), @@ -187,19 +198,19 @@ for (sample_nr in c(1:length(repl_pattern))) { } # create a layout matrix dependent on number of replicates -layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) +layout <- matrix(1:(10 * args$nr_replicates), 10, args$nr_replicates, TRUE) # put TIC plots in matrix tic_plot_pdf <- marrangeGrob( grobs = tic_plot_list, - nrow = 10, ncol = nr_replicates, + nrow = 10, ncol = args$nr_replicates, layout_matrix = layout, top = quote(paste( - "TICs of run", run_name, + "TICs of run", args$run_name, " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", g, "/", npages )) ) # save to file -ggsave(filename = paste0(run_name, "_TICplots.pdf"), +ggsave(filename = paste0(args$run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm") diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index b521a19f..12606cf7 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -1,7 +1,7 @@ process AverageTechReplicates { tag "DIMS AverageTechReplicates" label 'AverageTechReplicates' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -23,12 +23,13 @@ process AverageTechReplicates { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \ - $params.nr_replicates \ - $analysis_id \ - $matrix \ - $highest_mz_file \ - $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R \\ + --init_filepath $init_file \\ + --nr_replicates $params.nr_replicates \\ + --run_name $analysis_id \\ + --matrix $matrix \\ + --highest_mz_file $highest_mz_file \\ + --breaks_filepath $breaks_file """ } From 4fc8385d81fe9c20dac6f0376a6e1741d92aef96 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 15:52:31 +0200 Subject: [PATCH 05/28] Added argparse to 
CollectSumAdducts and fixed linting issues --- DIMS/CollectSumAdducts.R | 11 +++++++---- DIMS/CollectSumAdducts.nf | 5 +++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R index 28b5bf0d..10752add 100755 --- a/DIMS/CollectSumAdducts.R +++ b/DIMS/CollectSumAdducts.R @@ -1,13 +1,16 @@ ## Combining all AdductSums part files for each scanmode and # combine intensities if present in both scanmodes +library("argparse") suppressMessages(library("dplyr")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "CollectSumAdducts") -preprocessing_scripts_dir <- cmd_args[1] +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) -source(paste0(preprocessing_scripts_dir, "collect_sum_adducts_functions.R")) +args <- parser$parse_args() + +source(paste0(args$preprocessing_scripts_dir, "collect_sum_adducts_functions.R")) # collect all AdductSums part files for each scanmode and save to RData file outlist_tot_pos <- combine_sum_adduct_parts("positive") diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf index 279cd0e2..06aa82c8 100644 --- a/DIMS/CollectSumAdducts.nf +++ b/DIMS/CollectSumAdducts.nf @@ -1,7 +1,7 @@ process CollectSumAdducts { tag "DIMS CollectSumAdducts" label 'CollectSumAdducts' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -13,6 +13,7 @@ process CollectSumAdducts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R $params.preprocessing_scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R \\ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir """ } From f956c9d4820c34ef9e7284981ca7c34d6565c4d4 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 
15:53:16 +0200 Subject: [PATCH 06/28] Added argparse to GenerateExcel and fixed linting issues --- DIMS/GenerateExcel.R | 35 +++++++++++++++++++++-------------- DIMS/GenerateExcel.nf | 7 ++++++- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index c17f5b03..12ee7cea 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -1,4 +1,5 @@ # load required packages +library("argparse") library("ggplot2") library("reshape2") library("openxlsx") @@ -6,17 +7,23 @@ suppressMessages(library("tidyr")) suppressMessages(library("dplyr")) suppressMessages(library("stringr")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "GenerateExcel") -project <- cmd_args[1] -hmdb_rlvc_file <- cmd_args[2] -z_score <- as.numeric(cmd_args[3]) -export_scripts_dir <- cmd_args[4] -path_metabolite_groups <- cmd_args[5] +parser$add_argument("--project", dest = "project", + help = "Project name", required = TRUE) +parser$add_argument("--hmdb_rlvc_file", dest = "hmdb_rlvc_file", + help = "File path to the HMDB relevance file", required = TRUE) +parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", + help = "Numeric value, z-score = 1", required = TRUE) +parser$add_argument("--export_scripts_dir", dest = "export_scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) +parser$add_argument("--path_metabolite_groups", dest = "path_metabolite_groups", + help = "File path to the directory containing the metabolite groups", required = TRUE) + +args <- parser$parse_args() # load in function scripts -source(paste0(export_scripts_dir, "generate_excel_functions.R")) +source(paste0(args$export_scripts_dir, "generate_excel_functions.R")) # set the number of digits for floats options(digits = 16) @@ -35,7 +42,7 @@ perc <- 5 outlier_threshold <- 2 # load HMDB rlvnc table -load(hmdb_rlvc_file) 
+load(args$hmdb_rlvc_file) # load outlist object load("AdductSums_combined.RData") @@ -63,7 +70,7 @@ wb_intensities <- openxlsx::createWorkbook("SinglePatient") openxlsx::addWorksheet(wb_intensities, sheetname) # Add Z-scores and create plots -if (z_score == 1) { +if (args$z_score == 1) { dir.create(paste0(outdir, "/plots"), showWarnings = FALSE) wb_helix_intensities <- openxlsx::createWorkbook("SinglePatient") openxlsx::addWorksheet(wb_helix_intensities, sheetname) @@ -116,13 +123,13 @@ if (z_score == 1) { # get Helix IDs for extra Excel file metabolite_files <- list.files( - path = paste(path_metabolite_groups, "Diagnostics", sep = "/"), + path = paste(args$path_metabolite_groups, "Diagnostics", sep = "/"), pattern = "*.txt", full.names = FALSE, recursive = FALSE ) metab_df_helix <- NULL for (file_index in seq_along(metabolite_files)) { infile <- metabolite_files[file_index] - metab_list <- read.table(paste(path_metabolite_groups, "Diagnostics", infile, sep = "/"), + metab_list <- read.table(paste(args$path_metabolite_groups, "Diagnostics", infile, sep = "/"), sep = "\t", header = TRUE, quote = "" ) metab_df_helix <- rbind(metab_df_helix, metab_list) @@ -238,7 +245,7 @@ if (z_score == 1) { plots_present = TRUE ) openxlsx::writeData(wb_helix_intensities, sheet = 1, outlist_helix, startCol = 1) - openxlsx::saveWorkbook(wb_helix_intensities, paste0(outdir, "/Helix_", project, ".xlsx"), overwrite = TRUE) + openxlsx::saveWorkbook(wb_helix_intensities, paste0(outdir, "/Helix_", args$project, ".xlsx"), overwrite = TRUE) rm(wb_helix_intensities) # reorder outlist for Excel file @@ -262,6 +269,6 @@ if (z_score == 1) { # write Excel file openxlsx::writeData(wb_intensities, sheet = 1, outlist, startCol = 1) -openxlsx::saveWorkbook(wb_intensities, paste0(outdir, "/", project, ".xlsx"), overwrite = TRUE) +openxlsx::saveWorkbook(wb_intensities, paste0(outdir, "/", args$project, ".xlsx"), overwrite = TRUE) rm(wb_intensities) unlink("plots", recursive = TRUE) diff --git 
a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index 552a8ee9..bab44eb1 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -18,6 +18,11 @@ process GenerateExcel { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $analysis_id $relevance_file $params.zscore $params.export_scripts_dir $params.path_metabolite_groups + Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R \\ + --project $analysis_id \\ + --hmdb_rlvc_file $relevance_file \\ + --z_score $params.zscore \\ + --export_scripts_dir $params.export_scripts_dir \\ + --path_metabolite_groups $params.path_metabolite_groups """ } From 47539c10503dd619059f696c1d5bb51fbfee2b65 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 15:53:30 +0200 Subject: [PATCH 07/28] Added argparse to GenerateQCOutput and fixed linting issues --- DIMS/GenerateQCOutput.R | 56 +++++++++++++++++++++++----------------- DIMS/GenerateQCOutput.nf | 13 +++++----- 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/DIMS/GenerateQCOutput.R b/DIMS/GenerateQCOutput.R index 321c6ec0..fa1131e3 100644 --- a/DIMS/GenerateQCOutput.R +++ b/DIMS/GenerateQCOutput.R @@ -1,3 +1,4 @@ +library("argparse") library("ggplot2") library("reshape2") library("openxlsx") @@ -6,23 +7,30 @@ suppressMessages(library("dplyr")) # set the number of digits for floats options(digits = 16) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "GenerateQCOutput") -init_file <- cmd_args[1] -project <- cmd_args[2] -dims_matrix <- cmd_args[3] -z_score <- cmd_args[4] -sst_components_file <- cmd_args[5] -export_scripts_dir <- cmd_args[6] +parser$add_argument("--init_file_path", dest = "init_file", + help = "File path for the init RData file", required = TRUE) +parser$add_argument("--project", dest = "project", + help = "Project name", required = TRUE) +parser$add_argument("--matrix", dest = "dims_matrix", + help = "The matrix used, e.g. 
Plasma, Research, ...", required = TRUE) +parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", + help = "Numeric value, z-score = 1", required = TRUE) +parser$add_argument("--sst_components_file", dest = "sst_components_file", + help = "File path to the SST components file", required = TRUE) +parser$add_argument("--export_scripts_dir", dest = "export_scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) + +args <- parser$parse_args() outdir <- "./" # load in function scripts -source(paste0(export_scripts_dir, "generate_qc_output_functions.R")) +source(paste0(args$export_scripts_dir, "generate_qc_output_functions.R")) # load init files -load(init_file) +load(args$init_file) # load outlist from GenerateExcel load("outlist.RData") # load combined adducts for each scanmodus @@ -38,7 +46,7 @@ dir.create(paste0(outdir, "/plots"), showWarnings = FALSE) control_label <- "C" #### CHECK NUMBER OF CONTROLS #### -if (z_score == 1) { +if (args$z_score == 1) { file_name <- "Check_number_of_controls.txt" min_num_controls <- 25 check_number_of_controls(outlist, min_num_controls, file_name) @@ -67,14 +75,14 @@ outlist_tot_neg <- outlist_tot_neg[, samples_both_modes] outlist_tot_pos <- outlist_tot_pos[, samples_both_modes] # Retrieve IS summed adducts -is_summed <- get_internal_standards(is_list, "summed", repl_pattern, dims_matrix, rundate, project) +is_summed <- get_internal_standards(is_list, "summed", repl_pattern, args$dims_matrix, rundate, args$project) # Retrieve IS positive mode -is_pos <- get_internal_standards(is_list, "pos", outlist_tot_pos, dims_matrix, rundate, project) +is_pos <- get_internal_standards(is_list, "pos", outlist_tot_pos, args$dims_matrix, rundate, args$project) # Retrieve IS negative mode -is_neg <- get_internal_standards(is_list, "neg", outlist_tot_neg, dims_matrix, rundate, project) +is_neg <- get_internal_standards(is_list, "neg", outlist_tot_neg, args$dims_matrix, rundate, 
args$project) # Save results -save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) +save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", args$project, "_IS_results.RData")) # number of samples, for plotting length and width sample_count <- length(repl_pattern) @@ -138,7 +146,7 @@ is_sum_selection <- c( ) # add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum) -if (dims_matrix == "DBS") { +if (args$dims_matrix == "DBS") { add_min_intens_lines <- TRUE hline_data_neg <- data.frame( @@ -155,7 +163,7 @@ if (dims_matrix == "DBS") { int_line = c(1300000, 2500000, 500000, 1800000, 1400000), HMDB_name = is_sum_selection ) -} else if (dims_matrix == "Plasma") { +} else if (args$dims_matrix == "Plasma") { add_min_intens_lines <- TRUE hline_data_neg <- data.frame( @@ -239,7 +247,7 @@ patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\ positive_controls_index <- grepl(pattern = paste(patterns, collapse = "|"), column_list) positive_control_list <- column_list[positive_controls_index] -if (z_score == 1) { +if (args$z_score == 1) { # find if one or more positive control samples are missing pos_contr_warning <- c() if (all(sapply(c("^P1002", "^P1003", "^P1005"), @@ -280,16 +288,16 @@ if (z_score == 1) { positive_control$Zscore <- as.numeric(positive_control$Zscore) # extra information added to excel for future reference. 
made in beginning of this script - positive_control$Matrix <- dims_matrix + positive_control$Matrix <- args$dims_matrix positive_control$Rundate <- rundate - positive_control$Project <- project + positive_control$Project <- args$project # Save results - save(positive_control, file = paste0(outdir, "/", project, "_positive_control.RData")) + save(positive_control, file = paste0(outdir, "/", args$project, "_positive_control.RData")) # round the Z-scores to 2 digits positive_control$Zscore <- round_df(positive_control$Zscore, 2) write.xlsx(positive_control, - file = paste0(outdir, "/", project, "_positive_control.xlsx"), + file = paste0(outdir, "/", args$project, "_positive_control.xlsx"), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE ) } @@ -315,7 +323,7 @@ is_neg_intensities <- get_is_intensities(outlist_tot_neg, is_codes = is_codes) is_pos_intensities <- get_is_intensities(outlist_tot_pos, is_codes = is_codes) # SST components. -sst_comp <- read.csv(sst_components_file, header = TRUE, sep = "\t") +sst_comp <- read.csv(args$sst_components_file, header = TRUE, sep = "\t") sst_list <- outlist %>% filter(HMDB_code %in% sst_comp$HMDB_ID) sst_colnrs <- grep("P1001", colnames(sst_list)) @@ -352,7 +360,7 @@ setColWidths(wb, 3, cols = 1, widths = 24) addWorksheet(wb, "SST components") openxlsx::writeData(wb, sheet = 4, sst_list_intensities) setColWidths(wb, 4, cols = 1:3, widths = 24) -xlsx_name <- paste0(outdir, "/", project, "_IS_SST.xlsx") +xlsx_name <- paste0(outdir, "/", args$project, "_IS_SST.xlsx") openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) rm(wb) diff --git a/DIMS/GenerateQCOutput.nf b/DIMS/GenerateQCOutput.nf index eb17301f..802e4f3d 100644 --- a/DIMS/GenerateQCOutput.nf +++ b/DIMS/GenerateQCOutput.nf @@ -20,11 +20,12 @@ process GenerateQCOutput { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateQCOutput.R $init_file \ - $analysis_id \ - $params.matrix \ - $params.zscore \ - $params.sst_components_file \ - 
$params.export_scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/GenerateQCOutput.R \\ + --init_file_path $init_file \\ + --project $analysis_id \\ + --matrix $params.matrix \\ + --z_score $params.zscore \\ + --sst_components_file $params.sst_components_file \\ + --export_scripts_dir $params.export_scripts_dir """ } From c94c011a94ba7d16460d305d075bfb70effe0085 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 16:26:07 +0200 Subject: [PATCH 08/28] Changed container to dims:1.4 --- DIMS/CollectFilled.nf | 2 +- DIMS/FillMissing.nf | 2 +- DIMS/GenerateBreaks.nf | 2 +- DIMS/GenerateExcel.nf | 2 +- DIMS/GenerateQCOutput.nf | 2 +- DIMS/GenerateViolinPlots.nf | 2 +- DIMS/HMDBparts.nf | 2 +- DIMS/HMDBparts_main.nf | 2 +- DIMS/MakeInit.nf | 2 +- DIMS/PeakFinding.nf | 2 +- DIMS/PeakGrouping.nf | 2 +- DIMS/SpectrumPeakFinding.nf | 2 +- DIMS/SumAdducts.nf | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index 53b19676..f556a8b4 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -1,7 +1,7 @@ process CollectFilled { tag "DIMS CollectFilled" label 'CollectFilled' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf index b0d18505..7678c95e 100644 --- a/DIMS/FillMissing.nf +++ b/DIMS/FillMissing.nf @@ -1,7 +1,7 @@ process FillMissing { tag "DIMS FillMissing ${peakgrouplist_file}" label 'FillMissing' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index af617a8d..290c4be1 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -1,7 +1,7 @@ process GenerateBreaks { tag "DIMS GenerateBreaks" label 'GenerateBreaks' - container = 
'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index bab44eb1..21fefd4a 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -1,7 +1,7 @@ process GenerateExcel { tag "DIMS GenerateExcel" label 'GenerateExcel' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/GenerateQCOutput.nf b/DIMS/GenerateQCOutput.nf index 802e4f3d..6e4c9940 100644 --- a/DIMS/GenerateQCOutput.nf +++ b/DIMS/GenerateQCOutput.nf @@ -1,7 +1,7 @@ process GenerateQCOutput { tag "DIMS GenerateQCOutput" label 'GenerateQCOutput' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf index 1c4b532d..6136ec3a 100755 --- a/DIMS/GenerateViolinPlots.nf +++ b/DIMS/GenerateViolinPlots.nf @@ -1,7 +1,7 @@ process GenerateViolinPlots { tag "DIMS GenerateViolinPlots" label 'GenerateViolinPlots' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf index 5f19f750..4b1fa5a6 100644 --- a/DIMS/HMDBparts.nf +++ b/DIMS/HMDBparts.nf @@ -1,7 +1,7 @@ process HMDBparts { tag "DIMS HMDBparts" label 'HMDBparts' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf index b38bac08..658baeb9 100644 --- a/DIMS/HMDBparts_main.nf +++ b/DIMS/HMDBparts_main.nf @@ -1,7 +1,7 @@ process HMDBparts_main { tag "DIMS HMDBparts_main" label 'HMDBparts_main' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', 
'-euo', 'pipefail'] input: diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index 7aae0e46..d323315e 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -1,7 +1,7 @@ process MakeInit { tag "DIMS MakeInit" label 'MakeInit' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index 0097d128..0e25893c 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -1,7 +1,7 @@ process PeakFinding { tag "DIMS PeakFinding ${rdata_file}" label 'PeakFinding' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index 0ecb18c1..7f3f17e4 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -1,7 +1,7 @@ process PeakGrouping { tag "DIMS PeakGrouping ${hmdbpart_file}" label 'PeakGrouping' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf index 35902658..a8ae5a88 100644 --- a/DIMS/SpectrumPeakFinding.nf +++ b/DIMS/SpectrumPeakFinding.nf @@ -1,7 +1,7 @@ process SpectrumPeakFinding { tag "DIMS SpectrumPeakFinding" label 'SpectrumPeakFinding' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf index 514567d4..524ef4a8 100644 --- a/DIMS/SumAdducts.nf +++ b/DIMS/SumAdducts.nf @@ -1,7 +1,7 @@ process SumAdducts { tag "DIMS SumAdducts" label 'SumAdducts' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: From 553da7083c850c6b339ada04d7049543d9b7b30b Mon Sep 17 00:00:00 2001 From: 
ALuesink Date: Mon, 24 Nov 2025 13:14:15 +0100 Subject: [PATCH 09/28] Added argparse to MakeInit --- DIMS/MakeInit.R | 24 ++++++++++++++---------- DIMS/MakeInit.nf | 4 +++- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 44d49962..d5f05702 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,14 +1,18 @@ -## adapted from makeInit in old pipeline +# load required packages +library("argparse") -# define parameters -args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "MakeInit") -sample_sheet <- read.csv(args[1], sep = "\t") -nr_replicates <- as.numeric(args[2]) +parser$add_argument("--sample_sheet", dest = "sample_sheet", + help = "Samplesheet txt file", required = TRUE) +parser$add_argument("--nr_replicates", dest = "nr_replicates", type = "integer", + help = "Number of replicates, numeric value", required = TRUE) -sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) -nr_sample_groups <- length(sample_names) / nr_replicates -group_names <- trimws(as.vector(unlist(sample_sheet[2]))) +args <- parser$parse_args() + +sample_names <- trimws(as.vector(unlist(args$sample_sheet[1]))) +nr_sample_groups <- length(sample_names) / args$nr_replicates +group_names <- trimws(as.vector(unlist(args$sample_sheet[2]))) group_names <- gsub("[^-.[:alnum:]]", "_", group_names) group_names_unique <- unique(group_names) @@ -16,8 +20,8 @@ group_names_unique <- unique(group_names) repl_pattern <- c() for (sample_group in 1:nr_sample_groups) { tmp <- c() - for (repl in nr_replicates:1) { - index <- ((sample_group * nr_replicates) - repl) + 1 + for (repl in args$nr_replicates:1) { + index <- ((sample_group * args$nr_replicates) - repl) + 1 tmp <- c(tmp, sample_names[index]) } repl_pattern <- c(repl_pattern, list(tmp)) diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index d323315e..793d8f0b 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -13,6 +13,8 @@ process MakeInit { script: """ - Rscript 
${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates + Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R \\ + --samplesheet $samplesheet \\ + --nr_replicates $nr_replicates """ } From dc3e80f83d6b3c8e0ebd29146965381333938776 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 24 Nov 2025 13:19:09 +0100 Subject: [PATCH 10/28] Added argparse to CollectFilled and fixed linting issues --- DIMS/CollectFilled.R | 25 ++++++++++++++----------- DIMS/CollectFilled.nf | 4 +++- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index 4bd408e0..265db5d3 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -1,14 +1,17 @@ -## adapted from 10-collectSamplesFilled.R +# load package +library("argparse") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "CollectFilled") -scripts_dir <- cmd_args[1] -ppm <- as.numeric(cmd_args[2]) -z_score <- as.numeric(cmd_args[3]) +parser$add_argument("--scripts_dir", dest = "scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) +parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", + help = "Numeric value, z-score = 1", required = TRUE) -source(paste0(scripts_dir, "merge_duplicate_rows.R")) -source(paste0(scripts_dir, "calculate_zscores.R")) +args <- parser$parse_args() + +source(paste0(args$scripts_dir, "merge_duplicate_rows.R")) +source(paste0(args$scripts_dir, "calculate_zscores.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") @@ -17,7 +20,7 @@ for (scanmode in scanmodes) { filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled")) # load files and combine into one object outlist_total <- NULL - for (file_nr in 1:length(filled_files)) { + for (file_nr in seq_along(filled_files)) { peakgrouplist_filled <- get(load(filled_files[file_nr])) 
outlist_total <- rbind(outlist_total, peakgrouplist_filled) } @@ -29,7 +32,7 @@ for (scanmode in scanmodes) { pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) # calculate Z-scores - if (z_score == 1) { + if (args$z_score == 1) { outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE) nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) @@ -47,7 +50,7 @@ for (scanmode in scanmodes) { outlist_stats_more <- cbind(outlist_stats_more, tmp) outlist_total <- outlist_stats_more } - + # make a copy of the outlist outlist_ident <- outlist_total # take care of NAs in theormz_noise diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index f556a8b4..1f174448 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -14,6 +14,8 @@ process CollectFilled { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R \\ + --scripts_dir $params.scripts_dir \\ + --z_score $params.zscore """ } From 768517c76a8ee3f67858ebc848be7d2a112d62d1 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 24 Nov 2025 13:19:48 +0100 Subject: [PATCH 11/28] Added argparse to GenerateBreaks and fixed linting issues --- DIMS/GenerateBreaks.R | 34 ++++++++++++++++++---------------- DIMS/GenerateBreaks.nf | 5 ++++- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index d07fd203..0943c327 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -1,15 +1,17 @@ -## adapted from 1-generateBreaksFwhm.HPC.R ## - -# load required package +# load required packages +load("argparse") suppressPackageStartupMessages(library("xcms")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "GenerateBreaks") 
+ +parser$add_argument("--raw_file", dest = "raw_filepath", + help = "File path to a raw file", required = TRUE) +parser$add_argument("--trim", dest = "trim", type = "integer", + help = "Trim precentage, numeric value", required = TRUE) +parser$add_argument("--resolution", dest = "resolution", type = "integer", + help = "Resolution of the MS machine, numeric value", required = TRUE) -filepath <- cmd_args[1] -outdir <- cmd_args[2] -trim <- as.numeric(cmd_args[3]) -resol <- as.numeric(cmd_args[4]) +args <- parser$parse_args() # initialize trim_left_pos <- NULL @@ -21,19 +23,19 @@ breaks_fwhm_avg <- NULL bins <- NULL # read in mzML file -raw_data <- suppressMessages(xcms::xcmsRaw(filepath)) +raw_data <- suppressMessages(xcms::xcmsRaw(args$raw_filepath)) # Get time values for positive and negative scans pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] # trim (remove) scans at the start and end for positive -trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% aan het begin -trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% aan het eind +trim_left_pos <- round(pos_times[length(pos_times) * (args$trim * 1.5)]) # 15% aan het begin +trim_right_pos <- round(pos_times[length(pos_times) * (1 - (args$trim * 0.5))]) # 5% aan het eind # trim (remove) scans at the start and end for negative -trim_left_neg <- round(neg_times[length(neg_times) * trim]) -trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) +trim_left_neg <- round(neg_times[length(neg_times) * args$trim]) +trim_right_neg <- round(neg_times[length(neg_times) * (1 - args$trim)]) # Mass range m/z low_mz <- raw_data@mzrange[1] @@ -46,8 +48,8 @@ segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) # determine start and end of each bin. 
for (i in 1:nr_segments) { start_segment <- segment[i] - end_segment <- segment[i+1] - resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) + end_segment <- segment[i + 1] + resol_mz <- args$resolution * (1 / sqrt(2) ^ (log2(start_segment / 200))) fwhm_segment <- start_segment / resol_mz breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) # average the m/z instead of start value diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index 290c4be1..ac0c2847 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -14,6 +14,9 @@ process GenerateBreaks { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R \\ + --raw_file $mzML_file \\ + --trim $params.trim \\ + --resolution $params.resolution """ } From a29c1f9e54439177851855f065644c5dafcb83f3 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 24 Nov 2025 13:23:18 +0100 Subject: [PATCH 12/28] Added argparse to HMDBparts_main and fixed linting issues --- DIMS/HMDBparts_main.R | 36 ++++++++++++++++++++---------------- DIMS/HMDBparts_main.nf | 4 +++- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index 14335bf1..c1bb2185 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -1,34 +1,38 @@ -## adapted from hmdb_part_adductSums.R +# load package +load("argparse") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "HMDBparts_main") -db_file <- cmd_args[1] -breaks_file <- cmd_args[2] +parser$add_argument("--hmdb_db_file", dest = "hmdb_db_file", + help = "File path the HMDB databse file", required = TRUE) +parser$add_argument("--breaks_file", dest = "breaks_file", + help = "File containing all the breaks/bins", required = TRUE) -load(db_file) -load(breaks_file) +args <- 
parser$parse_args() -# Cut up HMDB minus adducts minus isotopes into small parts +load(args$db_file) +load(args$breaks_file) + +# Cut up HMDB minus adducts minus isotopes into small parts scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { if (scanmode == "negative") { column_label <- "MNeg" - HMDB_add_iso <- HMDB_add_iso.Neg + hmdb_add_iso <- HMDB_add_iso.Neg } else if (scanmode == "positive") { column_label <- "Mpos" - HMDB_add_iso <- HMDB_add_iso.Pos + hmdb_add_iso <- HMDB_add_iso.Pos } # filter mass range measured - outlist <- HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] & - HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + outlist <- hmdb_add_iso[which(hmdb_add_iso[, column_label] >= breaks_fwhm[1] & + hmdb_add_iso[, column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] # remove adducts and isotopes, put internal standard at the beginning outlist <- outlist[grep("HMDB", rownames(outlist), fixed = TRUE), ] outlist <- outlist[-grep("_", rownames(outlist), fixed = TRUE), ] # sort on m/z value - outlist <- outlist[order(outlist[ , column_label]), ] + outlist <- outlist[order(outlist[, column_label]), ] nr_rows <- dim(outlist)[1] # size of hmdb parts in lines: @@ -37,12 +41,12 @@ for (scanmode in scanmodes) { check <- 0 # generate hmdb parts - if (nr_rows >= sub & (floor(nr_rows / sub)) >= 2) { + if (nr_rows >= sub && (floor(nr_rows / sub)) >= 2) { for (i in 1:floor(nr_rows / sub)) { start <- -(sub - 1) + i * sub end <- i * sub outlist_part <- outlist[c(start:end), ] - save(outlist_part, file=paste0(scanmode, "_hmdb_main.", i, ".RData")) + save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i, ".RData")) } } @@ -53,4 +57,4 @@ for (scanmode in scanmodes) { outlist_part <- outlist[c(start:end), ] save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i + 1, ".RData")) -} \ No newline at end of file +} diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf index 658baeb9..2ae3457d 100644 --- 
a/DIMS/HMDBparts_main.nf +++ b/DIMS/HMDBparts_main.nf @@ -14,6 +14,8 @@ process HMDBparts_main { script: """ - Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R $hmdb_db_file $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R \\ + --hmdb_db_file $hmdb_db_file \\ + --breaks_file $breaks_file """ } From d86f644d625089c8b13cb5466f4abc88a10d4660 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 3 Mar 2026 17:19:16 +0100 Subject: [PATCH 13/28] added argument parsing to DIMS/MakeInit step --- DIMS/MakeInit.R | 20 +++++++++++--------- DIMS/MakeInit.nf | 6 +++--- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index d5f05702..f896e9e6 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -3,28 +3,30 @@ library("argparse") parser <- ArgumentParser(description = "MakeInit") -parser$add_argument("--sample_sheet", dest = "sample_sheet", +parser$add_argument("--samplesheet", dest = "sample_sheet", help = "Samplesheet txt file", required = TRUE) parser$add_argument("--nr_replicates", dest = "nr_replicates", type = "integer", help = "Number of replicates, numeric value", required = TRUE) args <- parser$parse_args() -sample_names <- trimws(as.vector(unlist(args$sample_sheet[1]))) -nr_sample_groups <- length(sample_names) / args$nr_replicates -group_names <- trimws(as.vector(unlist(args$sample_sheet[2]))) +sample_sheet <- read.csv(args$sample_sheet, sep = "\t") +sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) +nr_replicates <- args$nr_replicates +nr_sample_groups <- length(sample_names) / nr_replicates +group_names <- trimws(as.vector(unlist(sample_sheet[2]))) group_names <- gsub("[^-.[:alnum:]]", "_", group_names) group_names_unique <- unique(group_names) # generate the replication pattern repl_pattern <- c() for (sample_group in 1:nr_sample_groups) { - tmp <- c() - for (repl in args$nr_replicates:1) { - index <- ((sample_group * args$nr_replicates) - repl) + 1 - tmp <- c(tmp, 
sample_names[index]) + replicates_persample <- c() + for (repl in nr_replicates:1) { + index <- ((sample_group * nr_replicates) - repl) + 1 + replicates_persample <- c(replicates_persample, sample_names[index]) } - repl_pattern <- c(repl_pattern, list(tmp)) + repl_pattern <- c(repl_pattern, list(replicates_persample)) } names(repl_pattern) <- group_names_unique diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index 793d8f0b..6c8b3f18 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -13,8 +13,8 @@ process MakeInit { script: """ - Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R \\ - --samplesheet $samplesheet \\ - --nr_replicates $nr_replicates + Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R \ + --samplesheet $samplesheet \ + --nr_replicates $nr_replicates """ } From 16b5e83d03966ce3d9e622763580a0597fb6ae07 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 3 Mar 2026 18:00:07 +0100 Subject: [PATCH 14/28] added argument parsing to DIMS/GenerateBreaks step --- DIMS/GenerateBreaks.R | 43 ++++++++++++++++++++++++++---------------- DIMS/GenerateBreaks.nf | 9 +++++---- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 0943c327..9e3838da 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -1,18 +1,24 @@ -# load required packages -load("argparse") +# load required package suppressPackageStartupMessages(library("xcms")) +library("argparse") +# define parameters parser <- ArgumentParser(description = "GenerateBreaks") -parser$add_argument("--raw_file", dest = "raw_filepath", - help = "File path to a raw file", required = TRUE) -parser$add_argument("--trim", dest = "trim", type = "integer", - help = "Trim precentage, numeric value", required = TRUE) -parser$add_argument("--resolution", dest = "resolution", type = "integer", - help = "Resolution of the MS machine, numeric value", required = TRUE) +parser$add_argument("--mzML_filepath", dest = "mzML_filepath", + help = "File path to 
the mzML file used", required = TRUE) +parser$add_argument("--trim_param", dest = "trim_param", + help = "Initial value of trim parameter (typically 0.1)", required = TRUE) +parser$add_argument("--resolution", dest = "resolution", + help = "Value for resolution (typically 140000)", required = TRUE) args <- parser$parse_args() +trim <- as.numeric(args$trim_param) +resol <- as.numeric(args$resolution) +print(trim) +print(resol) + # initialize trim_left_pos <- NULL trim_right_pos <- NULL @@ -23,23 +29,26 @@ breaks_fwhm_avg <- NULL bins <- NULL # read in mzML file -raw_data <- suppressMessages(xcms::xcmsRaw(args$raw_filepath)) +print(args$mzML_filepath) +raw_data <- suppressMessages(xcms::xcmsRaw(args$mzML_filepath)) # Get time values for positive and negative scans pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] +paint(pos_times) # trim (remove) scans at the start and end for positive -trim_left_pos <- round(pos_times[length(pos_times) * (args$trim * 1.5)]) # 15% aan het begin -trim_right_pos <- round(pos_times[length(pos_times) * (1 - (args$trim * 0.5))]) # 5% aan het eind +trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% at the beginning +trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% at the end # trim (remove) scans at the start and end for negative -trim_left_neg <- round(neg_times[length(neg_times) * args$trim]) -trim_right_neg <- round(neg_times[length(neg_times) * (1 - args$trim)]) +trim_left_neg <- round(neg_times[length(neg_times) * trim]) +trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) # Mass range m/z low_mz <- raw_data@mzrange[1] high_mz <- raw_data@mzrange[2] +print(high_mz) # determine number of segments (bins) nr_segments <- 2 * (high_mz - low_mz) @@ -47,9 +56,10 @@ segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) # determine start and end of each bin. 
for (i in 1:nr_segments) { + print(i) start_segment <- segment[i] - end_segment <- segment[i + 1] - resol_mz <- args$resolution * (1 / sqrt(2) ^ (log2(start_segment / 200))) + end_segment <- segment[i+1] + resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) fwhm_segment <- start_segment / resol_mz breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) # average the m/z instead of start value @@ -59,5 +69,6 @@ for (i in 1:nr_segments) { } # generate output file -save(breaks_fwhm, breaks_fwhm_avg, trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "breaks.fwhm.RData") +save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData") +save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData") save(high_mz, file = "highest_mz.RData") diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index ac0c2847..72eb6af4 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -10,13 +10,14 @@ process GenerateBreaks { output: path('breaks.fwhm.RData'), emit: breaks + path('trim_params.RData'), emit: trim_params path('highest_mz.RData'), emit: highest_mz script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R \\ - --raw_file $mzML_file \\ - --trim $params.trim \\ - --resolution $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R \ + --mzML_filepath $mzML_file \ + --trim_param $params.trim \ + --resolution $params.resolution """ } From 26cb22257b76eedea156cb21af715d42e1dd7e17 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 3 Mar 2026 18:22:41 +0100 Subject: [PATCH 15/28] removed print statements --- DIMS/GenerateBreaks.R | 6 ------ 1 file changed, 6 deletions(-) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 9e3838da..95fc30c2 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -16,8 +16,6 @@ args <- parser$parse_args() trim <- as.numeric(args$trim_param) resol 
<- as.numeric(args$resolution) -print(trim) -print(resol) # initialize trim_left_pos <- NULL @@ -29,13 +27,11 @@ breaks_fwhm_avg <- NULL bins <- NULL # read in mzML file -print(args$mzML_filepath) raw_data <- suppressMessages(xcms::xcmsRaw(args$mzML_filepath)) # Get time values for positive and negative scans pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] -paint(pos_times) # trim (remove) scans at the start and end for positive trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% at the beginning @@ -48,7 +44,6 @@ trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) # Mass range m/z low_mz <- raw_data@mzrange[1] high_mz <- raw_data@mzrange[2] -print(high_mz) # determine number of segments (bins) nr_segments <- 2 * (high_mz - low_mz) @@ -56,7 +51,6 @@ segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) # determine start and end of each bin. for (i in 1:nr_segments) { - print(i) start_segment <- segment[i] end_segment <- segment[i+1] resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) From e8681e66cfdc96a3e158bc079eb636c1a04fdf7a Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 3 Mar 2026 18:55:18 +0100 Subject: [PATCH 16/28] added argument parsing for DIMS/HMDBparts_main step --- DIMS/HMDBparts_main.R | 4 ++-- DIMS/HMDBparts_main.nf | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index c1bb2185..67e3b2fd 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -1,5 +1,5 @@ # load package -load("argparse") +library("argparse") parser <- ArgumentParser(description = "HMDBparts_main") @@ -10,7 +10,7 @@ parser$add_argument("--breaks_file", dest = "breaks_file", args <- parser$parse_args() -load(args$db_file) +load(args$hmdb_db_file) load(args$breaks_file) # Cut up HMDB minus adducts minus isotopes into small parts diff --git 
a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf index 2ae3457d..a079753b 100644 --- a/DIMS/HMDBparts_main.nf +++ b/DIMS/HMDBparts_main.nf @@ -14,8 +14,8 @@ process HMDBparts_main { script: """ - Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R \\ - --hmdb_db_file $hmdb_db_file \\ - --breaks_file $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R \ + --hmdb_db_file $hmdb_db_file \ + --breaks_file $breaks_file """ } From c763e04e30786e3450ad7dd886ecab460fff4939 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 3 Mar 2026 19:09:53 +0100 Subject: [PATCH 17/28] added argument parsing for DIMS/HMDBparts step --- DIMS/HMDBparts.R | 125 +++++++++++++++++----------------------------- DIMS/HMDBparts.nf | 6 ++- 2 files changed, 52 insertions(+), 79 deletions(-) diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R index 232028e6..be9088f6 100755 --- a/DIMS/HMDBparts.R +++ b/DIMS/HMDBparts.R @@ -1,32 +1,54 @@ -# adapted from hmdb_parts.R +# load package +library("argparse") # define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "HMDBparts") -db_file <- cmd_args[1] -breaks_file <- cmd_args[2] -standard_run <- cmd_args[4] +parser$add_argument("--hmdb_db_file", dest = "hmdb_db_file", + help = "File path for the HMDB databse file", required = TRUE) +parser$add_argument("--breaks_file", dest = "breaks_file", + help = "File containing all the breaks for binning intensities", required = TRUE) +parser$add_argument("--standard_run", dest = "standard_run", + help = "Parameter indicating whether the dataset has m/z between 70 and 610", required = TRUE) +parser$add_argument("--hmdb_parts_path", dest = "hmdb_parts_path", + help = "Path to pre-generated HMDB files for standard run", required = TRUE) + +args <- parser$parse_args() # load file with binning breaks -load(breaks_file) +load(args$breaks_file) min_mz <- round(breaks_fwhm[1]) max_mz <- round(breaks_fwhm[length(breaks_fwhm)]) # In case of a standard run 
use external HMDB parts # m/z is approximately 70 to 600: set limits between 68-71 for min and 599-610 for max -if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) { - # skip generating HMDB parts - hmdb_parts_path <- cmd_args[3] - # find all files containing hmdb in file name - hmdb_parts <- list.files(hmdb_parts_path, pattern = "hmdb") +if (args$standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) { + # skip generating HMDB parts; copy all files containing hmdb in file name from hmdb_parts_path + hmdb_parts <- list.files(args$hmdb_parts_path, pattern = "hmdb") for (hmdb_file in hmdb_parts) { - file.copy(paste(hmdb_parts_path, hmdb_file, sep = "/"), "./", recursive = TRUE) + file.copy(paste(args$hmdb_parts_path, hmdb_file, sep = "/"), "./", recursive = TRUE) } } else { # generate HMDB parts in case of non-standard mz range - load(db_file) - ppm <- as.numeric(cmd_args[5]) - + load(args$hmdb_db_file) + + # determine segments of m/z for HMDB parts; smaller parts for m/z < 100 + mz_segments <- c() + segment_start <- min_mz + segment_end <- min_mz + 5 + while (segment_end < max_mz) { + if (segment_start < 100) { + segment_size = 5 + } else { + segment_size = 10 + } + mz_segments <- c(mz_segments, segment_start) + segment_start <- segment_start + segment_size + segment_end <- segment_end + segment_size + } + #last segment + mz_segments <- c(mz_segments, max_mz) + scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { if (scanmode == "negative") { @@ -38,73 +60,20 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < } # filter mass range meassured - hmdb_add_iso = hmdb_add_iso[which(hmdb_add_iso[ , column_label] >= breaks_fwhm[1] & - hmdb_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + hmdb_add_iso = hmdb_add_iso[which(hmdb_add_iso[ , column_label] >= min_mz & + hmdb_add_iso[ , column_label] <= max_mz), ] # sort on mass - outlist <- 
hmdb_add_iso[order(as.numeric(hmdb_add_iso[ , column_label])),] - nr_rows <- dim(outlist)[1] - - # maximum number of rows per file - sub <- 20000 - end <- 0 - last_line <- sub - check <- 0 - outlist_part <- NULL + sorted_hmdb_add_iso <- hmdb_add_iso[order(as.numeric(hmdb_add_iso[ , column_label])),] + nr_rows <- dim(sorted_hmdb_add_iso)[1] # create parts and save to file - if (nr_rows < sub) { - outlist_part <- outlist - save(outlist_part, file = paste0(scanmode, "_hmdb.1.RData")) - } else if (nr_rows >= sub & (floor(nr_rows / sub) - 1) >= 2) { - for (i in 2:floor(nr_rows / sub) - 1) { - start <- -(sub - 1) + i * sub - end <- i * sub - - if (i > 1){ - outlist_i = outlist[c(start:end),] - nr_moved = 0 - # Use ppm to replace border to avoid cut within peak group - while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / - as.numeric(outlist_i[1, column_label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1, ]) - outlist_i <- outlist_i[-1, ] - nr_moved <- nr_moved + 1 - } - - save(outlist_part, file = paste(scanmode, "_", paste("hmdb", i-1, "RData", sep = "."), sep = "")) - check <- check + dim(outlist_part)[1] - - outlist_part <- outlist_i - last_line <- dim(outlist_part)[1] - - } else { - outlist_part <- outlist[c(start:end),] - } - } - - start <- end + 1 - end <- nr_rows - outlist_i <- outlist[c(start:end), ] - nr_moved <- 0 - - if (!is.null(outlist_part)) { - # Calculate ppm and replace border, avoid cut within peak group - while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / - as.numeric(outlist_i[1, column_label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1, ]) - outlist_i <- outlist_i[-1, ] - nr_moved <- nr_moved + 1 - } - - save(outlist_part, file = paste0(scanmode, "_hmdb.", i, ".RData")) - check <- check + dim(outlist_part)[1] - } - - outlist_part <- outlist_i - save(outlist_part, file = paste0(scanmode, "_hmdb.", i + 1, 
".RData")) - check <- check + dim(outlist_part)[1] + for (mz_part_index in 1:(length(mz_segments) - 1)) { + mz_start <- mz_segments[mz_part_index] + mz_end <- mz_segments[mz_part_index + 1] + outlist_part <- sorted_hmdb_add_iso[sorted_hmdb_add_iso[ , column_label] > mz_start & + sorted_hmdb_add_iso[ , column_label] <= mz_end, ] + save(outlist_part, file = paste0(scanmode, "_hmdb.", mz_part_index, ".RData")) } } } - diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf index 4b1fa5a6..6c214915 100644 --- a/DIMS/HMDBparts.nf +++ b/DIMS/HMDBparts.nf @@ -13,6 +13,10 @@ process HMDBparts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R \ + --hmdb_db_file $hmdb_db_file \ + --breaks_file $breaks_file \ + --standard_run $params.standard_run \ + --hmdb_parts_path $params.hmdb_parts_files """ } From fed4e3b2b8908d8240ddb4dcf64bfda27e93be43 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 5 Mar 2026 12:27:19 +0100 Subject: [PATCH 18/28] fixed loading trim parameters from separate file --- DIMS/AssignToBins.R | 7 +++++-- DIMS/AssignToBins.nf | 9 +++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 7028b495..e02b8ceb 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -8,12 +8,15 @@ parser$add_argument("--mzML_filepath", dest = "mzml_filepath", help = "File path for the mzML file", required = TRUE) parser$add_argument("--breaks_filepath", dest = "breaks_filepath", help = "File path for the breaks RData file", required = TRUE) +parser$add_argument("--trim_params_filepath", dest = "trim_params_filepath", + help = "File path for the trim parameters", required = TRUE) args <- parser$parse_args() -# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, -# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos +# load breaks_file: contains 
breaks_fwhm and breaks_fwhm_avg load(args$breaks_filepath) +# load trim parameters trim_left_neg, trim_left_pos, trim_right_neg and trim_right_pos +load(args$trim_params_filepath) # get sample name techrep_name <- sub("\\..*$", "", basename(args$mzml_filepath)) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index df3b089a..bcf2510e 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -5,7 +5,7 @@ process AssignToBins { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(val(file_id), path(mzML_file), path(breaks_file)) + tuple(val(file_id), path(mzML_file), path(breaks_file), path(trim_params_file)) output: path("${file_id}.RData"), emit: rdata_file @@ -13,9 +13,10 @@ process AssignToBins { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R \\ - --mzML_filepath $mzML_file \\ - --breaks_filepath $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R \ + --mzML_filepath $mzML_file \ + --breaks_filepath $breaks_file \ + --trim_params_filepath $trim_params_file """ } From ed7b5954359e6aa93feb8b5f7e3115dbc3bb5492 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 5 Mar 2026 13:27:01 +0100 Subject: [PATCH 19/28] added argument parsing to DIMS/EvaluateTics step --- DIMS/EvaluateTics.R | 158 +++++++++++++++++++ DIMS/EvaluateTics.nf | 34 ++++ DIMS/preprocessing/evaluate_tics_functions.R | 101 ++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 DIMS/EvaluateTics.R create mode 100644 DIMS/EvaluateTics.nf create mode 100644 DIMS/preprocessing/evaluate_tics_functions.R diff --git a/DIMS/EvaluateTics.R b/DIMS/EvaluateTics.R new file mode 100644 index 00000000..9ed8cc78 --- /dev/null +++ b/DIMS/EvaluateTics.R @@ -0,0 +1,158 @@ +# load packages +library("ggplot2") +library("gridExtra") +library("argparse") + +parser <- ArgumentParser(description = "EvaluateTics") + +parser$add_argument("--samplesheet_file", dest = "samplesheet_file", + help = "Parsed samplesheet RData file", required = TRUE) 
+parser$add_argument("--nr_replicates", dest = "nr_replicates", + help = "Number of technical replicates per biological sample", required = TRUE) +parser$add_argument("--analysis_id", dest = "analysis_id", + help = "Name of the dataset", required = TRUE) +parser$add_argument("--matrix", dest = "matrix", + help = "Matrix used (e.g. Plasma, DBS, Research)", required = TRUE) +parser$add_argument("--highest_mz_file", dest = "highest_mz_file", + help = "File containing highest m/z value in dataset", required = TRUE) +parser$add_argument("--trim_params_file", dest = "trim_params_file", + help = "File containing values of trim parameters", required = TRUE) +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used", required = TRUE) + +args <- parser$parse_args() + +nr_replicates <- as.numeric(args$nr_replicates) +run_name <- args$analysis_id +dims_matrix <- args$matrix + +# load functions +source(paste0(args$preprocessing_scripts_dir, "evaluate_tics_functions.R")) + +# get the value for the highest m/z in the dataset +highest_mz <- get(load(args$highest_mz_file)) + +# load init_file: contains repl_pattern +load(args$samplesheet_file) + +# load trim_params_file: contains trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos +load(args$trim_params_file) + +# set the threshold for accepting or rejecting technical replicates +thresh2remove <- 1000000000 +# lower the threshold for non Plasma matrices +if (dims_matrix != "Plasma") { + thresh2remove <- 1000000000 +} +# lower the threshold for DBS or high m/z specific +if (dims_matrix == "DBS") { + thresh2remove <- 500000000 +} +if (highest_mz > 700) { + thresh2remove <- 1000000 +} + +# find out which technical replicates are below the threshold +remove_tech_reps <- find_bad_replicates(repl_pattern, thresh2remove) +print(remove_tech_reps) + +# negative scan mode +remove_neg <- remove_tech_reps$neg +repl_pattern_filtered <- 
remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) +save(repl_pattern_filtered, file = "negative_repl_pattern.RData") + +# positive scan mode +remove_pos <- remove_tech_reps$pos +repl_pattern_filtered <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) +save(repl_pattern_filtered, file = "positive_repl_pattern.RData") + +# get an overview of suitable technical replicates for both scan modes +allsamples_techreps_neg <- get_overview_tech_reps(repl_pattern_filtered, "negative") +allsamples_techreps_pos <- get_overview_tech_reps(repl_pattern_filtered, "positive") +allsamples_techreps_both_scanmodes <- rbind(allsamples_techreps_pos, allsamples_techreps_neg) +write.table(allsamples_techreps_both_scanmodes, + file = "replicates_per_sample.txt", + col.names = FALSE, + row.names = FALSE, + sep = "," +) + + +## generate TIC plots +# get all txt files +tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt") +all_samps <- sub("_TIC\\..*$", "", basename(tic_files)) + +# determine maximum intensity +highest_tic_max <- 0 +for (file in tic_files) { + tic <- read.table(file) + this_tic_max <- max(tic$tic_intensity) + if (this_tic_max > highest_tic_max) { + highest_tic_max <- this_tic_max + max_sample <- sub("_TIC\\..*$", "", basename(file)) + } +} + +# create a list with information for all TIC plots +tic_plot_list <- list() +plot_nr <- 0 +for (sample_nr in c(1:length(repl_pattern))) { + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) + sample_name <- names(repl_pattern)[sample_nr] + for (file_nr in 1:length(tech_reps)) { + plot_nr <- plot_nr + 1 + # read file with retention time, intensity and dims_threshold values + repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt")) + # get threshold values per technical replicate + dims_thresh_pos <- repl1_nr[1, "threshold"] + dims_thresh_neg <- repl1_nr[nrow(repl1_nr), "threshold"] + # for replicates with bad TIC, determine what color the border of the plot should be + 
bad_color_pos <- tech_reps[file_nr] %in% remove_pos + bad_color_neg <- tech_reps[file_nr] %in% remove_neg + if (bad_color_neg & bad_color_pos) { + plot_color <- "#F8766D" + } else if (bad_color_pos) { + plot_color <- "#ED8141" + } else if (bad_color_neg) { + plot_color <- "#BF80FF" + } else { + plot_color <- "white" + } + tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) + + geom_line(linewidth = 0.3) + + geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_left_pos, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) + + geom_segment(x = trim_left_pos, y = dims_thresh_pos, xend = trim_right_pos, yend = dims_thresh_pos, colour = "green", lty = 2) + + geom_segment(x = trim_left_neg, y = dims_thresh_neg, xend = trim_right_neg, yend = dims_thresh_neg, colour = "blue", lty = 2) + + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], " || ", sample_name)) + + theme(plot.background = element_rect(fill = plot_color), + axis.text = element_text(size = 4), + axis.title = element_text(size = 4), + plot.title = element_text(size = 6)) + tic_plot_list[[plot_nr]] <- tic_plot + } +} + +# create a layout matrix dependent on number of replicates +layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) +# put TIC plots in matrix +tic_plot_pdf <- marrangeGrob( + grobs = tic_plot_list, + nrow = 10, ncol = nr_replicates, + layout_matrix = layout, + top = quote(paste( + "TICs of run", run_name, + " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", + g, "/", npages + )) +) + +# save to file +ggsave(filename = paste0(run_name, "_TICplots.pdf"), + tic_plot_pdf, width 
= 21, height = 29.7, units = "cm") + + diff --git a/DIMS/EvaluateTics.nf b/DIMS/EvaluateTics.nf new file mode 100644 index 00000000..3f91a13e --- /dev/null +++ b/DIMS/EvaluateTics.nf @@ -0,0 +1,34 @@ +process EvaluateTics { + tag "DIMS EvaluateTics" + label 'EvaluateTics' + container = 'docker://umcugenbioinf/dims:1.4' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(rdata_file) + path(tic_txt_files) + path(init_file) + val(analysis_id) + path(highest_mz_file) + path(trim_params_file) + + output: + path('*_repl_pattern.RData'), emit: pattern_files + path('replicates_per_sample.txt'), emit: sample_techreps + path('miss_infusions_negative.txt') + path('miss_infusions_positive.txt') + path('*_TICplots.pdf'), emit: tic_plots_pdf + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/EvaluateTics.R \ + --samplesheet_file $init_file \ + --nr_replicates $params.nr_replicates \ + --analysis_id $analysis_id \ + --matrix $params.matrix \ + --highest_mz_file $highest_mz_file \ + --trim_params_file $trim_params_file \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir + """ +} + diff --git a/DIMS/preprocessing/evaluate_tics_functions.R b/DIMS/preprocessing/evaluate_tics_functions.R new file mode 100644 index 00000000..df0dbcf4 --- /dev/null +++ b/DIMS/preprocessing/evaluate_tics_functions.R @@ -0,0 +1,101 @@ +# EvaluateTics functions +find_bad_replicates <- function(repl_pattern, thresh2remove) { + #' Find technical replicates with a total intensity below a threshold + #' + #' @param repl_pattern: List of samples with corresponding technical replicates (strings) + #' @param thresh2remove: Threshold value for acceptance or rejection of total intensity (integer) + #' + #' @return remove_tech_reps: Array of rejected technical replicates (strings) + + remove_pos <- NULL + remove_neg <- NULL + cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") + for (sample_nr in 1:length(repl_pattern)) { + tech_reps <- 
as.vector(unlist(repl_pattern[sample_nr])) + for (file_nr in 1:length(tech_reps)) { + load(paste0(tech_reps[file_nr], ".RData")) + cat("\n\nParsing", tech_reps[file_nr]) + # positive scan mode: determine whether sum of intensities is above threshold + cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) + if (sum(peak_list$pos[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_pos <- c(remove_pos, tech_reps[file_nr]) + } + # negative scan mode: determine whether sum of intensities is above threshold + cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) + if (sum(peak_list$neg[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_neg <- c(remove_neg, tech_reps[file_nr]) + } + } + } + cat("\n") + # write information on miss_infusions for both scan modes + write.table(remove_pos, + file = paste0("miss_infusions_positive.txt"), + row.names = FALSE, + col.names = FALSE, + sep = "\t" + ) + write.table(remove_neg, + file = paste0("miss_infusions_negative.txt"), + row.names = FALSE, + col.names = FALSE, + sep = "\t" + ) + + # combine removed technical replicates from pos and neg + remove_tech_reps <- list(pos = remove_pos, neg = remove_neg) + return(remove_tech_reps) +} + +remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { + #' Remove technical replicates with insufficient quality from a biological sample + #' + #' @param bad_samples: Array of technical replicates of insufficient quality (strings) + #' @param repl_pattern: List of samples with corresponding technical replicates (strings) + #' @param nr_replicates: Number of technical replicates per biological sample (integer) + #' + #' @return repl_pattern_filtered: list of technical replicates of sufficient quality (strings) + + # collect list of samples to remove from replication pattern + remove_from_group <- NULL + for (sample_nr in 1:length(repl_pattern)){ + repl_pattern_1sample <- repl_pattern[[sample_nr]] + remove <- NULL + for (file_nr in 
1:length(repl_pattern_1sample)) { + if (repl_pattern_1sample[file_nr] %in% bad_samples) { + remove <- c(remove, file_nr) + } + } + if (length(remove) == nr_replicates) { + remove_from_group <- c(remove_from_group, sample_nr) + } + if (!is.null(remove)) { + repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove] + } + } + if (length(remove_from_group) != 0) { + repl_pattern_filtered <- repl_pattern[-remove_from_group] + } else { + repl_pattern_filtered <- repl_pattern + } + return(repl_pattern_filtered) +} + +get_overview_tech_reps <- function(repl_pattern_filtered, scanmode) { + #' Create an overview of technical replicates with sufficient quality from a biological sample + #' + #' @param repl_pattern_filtered: List of samples with corresponding technical replicates (strings) + #' @param scanmode: Scan mode "positive" or "negative" (string) + #' + #' @return allsamples_techreps_scanmode: Matrix of technical replicates of sufficient quality (strings) + + allsamples_techreps_scanmode <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered)) + for (sample_nr in 1:length(repl_pattern_filtered)) { + allsamples_techreps_scanmode[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr] + allsamples_techreps_scanmode[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";") + } + allsamples_techreps_scanmode[, 3] <- scanmode + return(allsamples_techreps_scanmode) +} From f7ef463e5b3fc2ba79f3275e1fce78700fff2b83 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 5 Mar 2026 16:25:14 +0100 Subject: [PATCH 20/28] added argument parsing for new PeakFinding step --- DIMS/PeakFinding.R | 108 +++++---- DIMS/PeakFinding.nf | 12 +- DIMS/preprocessing/peak_finding_functions.R | 245 ++++++++++++++++++++ 3 files changed, 316 insertions(+), 49 deletions(-) create mode 100644 DIMS/preprocessing/peak_finding_functions.R diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index 697f8f7e..99bfafc5 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R 
@@ -1,53 +1,69 @@
-## adapted from 4-peakFinding.R
+# load packages
+library(dplyr)
+library("argparse")
+
+parser <- ArgumentParser(description = "PeakFinding")
+
+parser$add_argument("--rdata_file", dest = "rdata_file",
+ help = "RData file containing binned intensity per technical replicate", required = TRUE)
+parser$add_argument("--samplesheet", dest = "techreps_scanmode",
+ help = "Samplesheet containing information on technical replicates and biological samples", required = TRUE)
+parser$add_argument("--resolution", dest = "resol",
+ help = "Value for resolution (typically 140000)", required = TRUE)
+parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir",
+ help = "File path to the directory containing functions used", required = TRUE)
+
+args <- parser$parse_args()
 
 # define parameters
-cmd_args <- commandArgs(trailingOnly = TRUE)
-
-sample_file <- cmd_args[1]
-breaks_file <- cmd_args[2]
-resol <- as.numeric(cmd_args[3])
-scripts_dir <- cmd_args[4]
-thresh <- 2000
-outdir <- "./"
-
-# load in function scripts
-source(paste0(scripts_dir, "do_peakfinding.R"))
-source(paste0(scripts_dir, "check_overlap.R"))
-source(paste0(scripts_dir, "search_mzrange.R"))
-source(paste0(scripts_dir, "fit_optim.R"))
-source(paste0(scripts_dir, "fit_gaussian.R"))
-source(paste0(scripts_dir, "fit_init.R"))
-source(paste0(scripts_dir, "get_fwhm.R"))
-source(paste0(scripts_dir, "get_stdev.R"))
-source(paste0(scripts_dir, "optimize_gaussfit.R"))
-source(paste0(scripts_dir, "fit_peaks.R"))
-source(paste0(scripts_dir, "fit_gaussians.R"))
-source(paste0(scripts_dir, "estimate_area.R"))
-source(paste0(scripts_dir, "get_fit_quality.R"))
-source(paste0(scripts_dir, "check_overlap.R"))
-source(paste0(scripts_dir, "sum_curves.R"))
-source(paste0(scripts_dir, "within_ppm.R"))
-
-load(breaks_file)
-
-# Load output of AverageTechReplicates for a sample
-sample_avgtechrepl <- get(load(sample_file))
-if (grepl("_pos", sample_file)) {
-  scanmode <- "positive"
-} else 
if (grepl("_neg", sample_file)) {
-  scanmode <- "negative"
-}
+resol <- as.numeric(args$resolution)
+# use fixed threshold between noise and signal for peak
+peak_thresh <- 2000
+
+# source functions script
+source(paste0(args$preprocessing_scripts_dir, "peak_finding_functions.R"))
+
+# Load output of AssignToBins (peak_list) for a technical replicate
+load(args$rdata_file)
+techrepl_name <- colnames(peak_list$pos)[1]
+
+# load list of technical replicates per sample that passed threshold filter
+techreps_passed <- read.table("replicates_per_sample.txt", sep = ",")
 
 # Initialize
 options(digits = 16)
-# Number used to calculate area under Gaussian curve
-int_factor <- 1 * 10^5
-# Initial value used to estimate scaling parameter
-scale <- 2
-width <- 1024
-height <- 768
-# run the findPeaks function
+# do peak finding
+scanmodes <- c("positive", "negative")
+for (scanmode in scanmodes) {
+  # get intensities for scan mode
+  if (scanmode == "positive") {
+    ints_perscanmode <- peak_list$pos
+  } else if (scanmode == "negative") {
+    ints_perscanmode <- peak_list$neg
+  }
+
+  # check whether technical replicate has passed threshold filter for this scanmode
+  techreps_scanmode <- techreps_passed[grep(scanmode, techreps_passed[, 3]), ]
+  # if techrep is ok, it will be found. If not, skip this techrep. 
+ if (length(grep(techrepl_name, techreps_scanmode)) == 0) { + break + } -# do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) -do_peakfinding(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) + # put mz and intensities into dataframe + ints_fullrange <- as.data.frame(cbind(mz = as.numeric(rownames(ints_perscanmode)), + int = as.numeric(ints_perscanmode))) + + # look for m/z range for all peaks + regions_of_interest <- search_regions_of_interest(ints_fullrange) + + # fit Gaussian curve and calculate integrated area under curve + integrated_peak_df <- integrate_peaks(ints_fullrange, regions_of_interest, resol, peak_thresh) + + # add sample name to dataframe + integrated_peak_df <- as.data.frame(cbind(samplenr = techrepl_name, integrated_peak_df)) + + # save output to file + save(integrated_peak_df, file = paste0(techrepl_name, "_", scanmode, ".RData")) + write.table(integrated_peak_df, file = paste0(techrepl_name, "_", scanmode, ".txt"), sep = "\t", row.names = FALSE) +} diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index 0e25893c..9eb283f1 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -5,13 +5,19 @@ process PeakFinding { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(path(rdata_file), path(breaks_file)) + path(rdata_file) + each path(sample_techreps) output: - path '*tive.RData' + path '*tive.RData', emit: peaklist_rdata, optional: true + path '*tive.txt', optional: true script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R \ + --rdata_file $rdata_file \ + --samplesheet $sample_techreps \ + --resolution $params.resolution \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir """ } diff --git a/DIMS/preprocessing/peak_finding_functions.R 
b/DIMS/preprocessing/peak_finding_functions.R new file mode 100644 index 00000000..6ad389a3 --- /dev/null +++ b/DIMS/preprocessing/peak_finding_functions.R @@ -0,0 +1,245 @@ +# functions for peak finding + +search_regions_of_interest <- function(ints_fullrange) { + #' Divide the full m/z range into regions of interest with indices + #' + #' @param ints_fullrange: Matrix with m/z values and intensities (float) + #' + #' @return regions_of_interest: matrix of m/z regions of interest (integer) + + # find indices where intensity is not equal to zero + nonzero_positions <- as.vector(which(ints_fullrange$int != 0)) + + # find regions of interest (look for consecutive numbers) + regions_of_interest_consec <- seqToIntervals(nonzero_positions) + # add length of regions of interest + regions_of_interest_diff <- regions_of_interest_consec[, 2] - regions_of_interest_consec[, 1] + 1 + regions_of_interest_length <- cbind(regions_of_interest_consec, length = regions_of_interest_diff) + # remove short lengths; a peak should have at least 3 data points + if (any(regions_of_interest_length[, "length"] < 3)) { + regions_of_interest_gte3 <- regions_of_interest_length[-which(regions_of_interest_length[, "length"] < 3), ] + } else { + regions_of_interest_gte3 <- regions_of_interest_length + } + # test for length of roi (region of interest). 
If length is greater than 11, break up into separate rois + remove_roi_index <- c() + new_rois_all <- regions_of_interest_gte3[0, ] + for (roi_nr in 1:nrow(regions_of_interest_gte3)) { + if (regions_of_interest_gte3[roi_nr, "length"] > 11) { + roi <- ints_fullrange[(regions_of_interest_gte3[roi_nr, "from"]:regions_of_interest_gte3[roi_nr, "to"]), ] + roi_intrange <- as.numeric(roi$int) + # look for local minima that separate the peaks + local_min_positions <- which(diff(sign(diff(roi_intrange))) == 2) + 1 + if (length(local_min_positions) > 0) { + remove_roi_index <- c(remove_roi_index, roi_nr) + # find new indices for rois after splitting + start_pos <- regions_of_interest_gte3[roi_nr, "from"] + new_rois <- data.frame(from = 0, to = 0, length = 0) + new_rois_splitroi <- regions_of_interest_gte3[0, ] + for (local_min_index in 1:length(local_min_positions)) { + new_rois[, 1] <- start_pos + new_rois[, 2] <- start_pos + local_min_positions[local_min_index] + new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1 + new_rois_splitroi <- rbind(new_rois_splitroi, new_rois) + start_pos <- new_rois[, 2] + } + # intensities after last local minimum + new_rois[, 1] <- start_pos + new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"] + new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1 + new_rois_splitroi <- rbind(new_rois_splitroi, new_rois) + # append + new_rois_all <- rbind(new_rois_all, new_rois_splitroi) + } else { + # if there are no local minima, all intensities belong to one peak. 
+ remove_roi_index <- c(remove_roi_index, roi_nr) + # look for maximum and take 5 intensities to the left and right + max_index <- which(roi_intrange == max(roi_intrange)) + max_pos <- as.numeric(rownames(roi)[max_index]) + new_roi <- data.frame(from = 0, to = 0, length = 0) + new_roi[, 1] <- max_pos - 5 + new_roi[, 2] <- max_pos + 5 + new_roi[, 3] <- new_roi[, 2] - new_roi[, 1] + 1 + # append + new_rois_all <- rbind(new_rois_all, new_roi) + } + } + } + + # remove rois that have been split into chunks or shortened + if (length(remove_roi_index) > 0) { + regions_of_interest_minus_short <- regions_of_interest_gte3[-remove_roi_index, ] + } else { + regions_of_interest_minus_short <- regions_of_interest_gte3 + } + # combine remaining rois with info on chunks + regions_of_interest_split <- rbind(regions_of_interest_minus_short, new_rois_all) + # remove regions of interest with short lengths again + if (any(regions_of_interest_split[, "length"] < 3)) { + regions_of_interest_final <- regions_of_interest_split[-which(regions_of_interest_split[, "length"] < 3), ] + } else { + regions_of_interest_final <- regions_of_interest_split + } + + # sort on first index + if (nrow(regions_of_interest_final) > 1){ + regions_of_interest_sorted <- regions_of_interest_final %>% as.data.frame %>% dplyr::arrange(from) + } else { + regions_of_interest_sorted <- regions_of_interest_final + } + + return(regions_of_interest_sorted) +} + +integrate_peaks <- function(ints_fullrange, regions_of_interest, resol, peak_thresh) { + #' Fit Gaussian peak for each region of interest and integrate area under the curve + #' + #' @param ints_fullrange: Named list of intensities (float) + #' @param regions_of_interest: Named list of intensities (float) + #' @param resol: Value for resolution (integer) + #' @param peak_thresh: Value for noise level threshold (integer) # NOT USED YET + #' + #' @return allpeaks_values: matrix of integrated peaks + + # initialize dataframe to store results for all peaks + 
allpeaks_values <- matrix(0, nrow = nrow(regions_of_interest), ncol = 5) + colnames(allpeaks_values) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + + for (roi_nr in 1:nrow(regions_of_interest)) { + # find m/z values and intensities corresponding to the region of interest + index_range <- regions_of_interest[roi_nr, "from"] : regions_of_interest[roi_nr, "to"] + roi_mzrange <- as.numeric(ints_fullrange$mz[index_range]) + roi_intrange <- as.numeric(ints_fullrange$int[index_range]) + # find m/z value for maximum of peak (mu in Gaussian function) + weighted_mzmean <- weighted.mean(roi_mzrange, roi_intrange) + # find expected peak width at this m/z value + fwhm_mzmed <- get_fwhm(weighted_mzmean, resol) + # calculate sigma for Gaussian curve (https://en.wikipedia.org/wiki/Full_width_at_half_maximum) + sigma_peak <- fwhm_mzmed / 2.355 + # find scale factor for Gaussian. Initial estimate is maximum intensity in roi + # divided by 2 for better correlation with intensities from old PeakFinding method + scale_factor <- max(roi_intrange) / 2 + # fit Gaussian peak. 
Find intensities according to normal distribution + normal_ints <- scale_factor * gaussfunc(roi_mzrange, weighted_mzmean, sigma_peak) + # sum intensities in roi + sum_ints <- sum(roi_intrange) + sum_gauss <- sum(normal_ints) + # calculate quality of fit + quality_fit <- 1 - sum(abs(normal_ints - roi_intrange)) / sum_ints + # put all values into dataframe + allpeaks_values[roi_nr, "mzmed.pkt"] <- weighted_mzmean + allpeaks_values[roi_nr, "fq"] <- quality_fit + allpeaks_values[roi_nr, "mzmax.pkt"] <- max(roi_mzrange) + allpeaks_values[roi_nr, "mzmin.pkt"] <- min(roi_mzrange) + allpeaks_values[roi_nr, "height.pkt"] <- sum_gauss + } + + # remove peaks with height = 0, look for NA or NaN + remove_na <- which(is.na(as.numeric(allpeaks_values[, "height.pkt"]))) + if (length(remove_na) > 0) { + allpeaks_values <- allpeaks_values[-remove_na, ] + } + + return(allpeaks_values) +} + +# in the next version of the docker image, package R.utils will be included +seqToIntervals <- function(idx) { + #' Find consecutive stretches of numbers + #' function seqToIntervals copied from R.utils library + #' see https://rdrr.io/cran/R.utils/src/R/seqToIntervals.R + #' + #' @param idx: Sequence of indices (integers) + #' + #' @return res: Matrix of start and end positions of consecutive numbers (matrix) + + # Clean up sequence + idx <- as.integer(idx) + idx <- unique(idx) + idx <- sort(idx) + + len_idx <- length(idx) + if (len_idx == 0L) { + res <- matrix(NA_integer_, nrow = 0L, ncol = 2L) + colnames(res) <- c("from", "to") + return(res) + } + + # Identify end points of intervals + diff_idx <- diff(idx) + diff_idx <- (diff_idx > 1) + diff_idx <- which(diff_idx) + nr_intervals <- length(diff_idx) + 1 + + # Allocate return matrix + res <- matrix(NA_integer_, nrow = nr_intervals, ncol = 2L) + colnames(res) <- c("from", "to") + + from_value <- idx[1] + to_value <- from_value - 1 + last_value <- from_value + + count <- 1 + for (running_index in seq_along(idx)) { + value <- idx[running_index] + 
if (value - last_value > 1) { + to_value <- last_value + res[count, ] <- c(from_value, to_value) + from_value <- value + count <- count + 1 + } + last_value <- value + } + + if (to_value < from_value) { + to_value <- last_value + res[count, ] <- c(from_value, to_value) + } + + return(res) +} + +get_fwhm <- function(query_mass, resol) { + #' Calculate fwhm (full width at half maximum intensity) for a peak + #' + #' @param query_mass: Value for mass (float) + #' @param resol: Value for resolution (integer) + #' + #' @return fwhm: Value for full width at half maximum (float) + + # set aberrant values of query_mass to default value of 200 + if (is.na(query_mass)) { + query_mass <- 200 + } + if (query_mass < 0) { + query_mass <- 200 + } + # calculate resolution at given m/z value + resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200))) + # calculate full width at half maximum + fwhm <- query_mass / resol_mz + + return(fwhm) +} + + +# from https://rdrr.io/cran/rvmethod/src/R/gaussfit.R +#' Gaussian Function from package rvmethod +#' +#' This function returns the unnormalized (height of 1.0) Gaussian curve with a +#' given center and spread. 
+#'
+#' @param x the vector of values at which to evaluate the Gaussian
+#' @param mu the center of the Gaussian
+#' @param sigma the spread of the Gaussian (must be greater than 0)
+#' @return vector of values of the Gaussian
+#' @examples x = seq(-4, 4, length.out = 100)
+#' y = gaussfunc(x, 0, 1)
+#' plot(x, y)
+#'
+#' @import stats
+#'
+#' @export
+gaussfunc <- function(x, mu, sigma) {
+  return(exp(-((x - mu) ^ 2) / (2 * (sigma ^ 2))))
+}

From 9915f55eee27d10928f4f8aed524cf0cb6ac8314 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves
Date: Thu, 5 Mar 2026 18:00:36 +0100
Subject: [PATCH 21/28] added argument parsing to DIMS/AveragePeaks step

---
 DIMS/AveragePeaks.R | 46 ++++++++++++++++++++
 DIMS/AveragePeaks.nf | 22 ++++++++++
 DIMS/preprocessing/average_peaks_functions.R | 41 +++++++++++++++++
 3 files changed, 109 insertions(+)
 create mode 100644 DIMS/AveragePeaks.R
 create mode 100644 DIMS/AveragePeaks.nf
 create mode 100644 DIMS/preprocessing/average_peaks_functions.R

diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R
new file mode 100644
index 00000000..bdfc6cde
--- /dev/null
+++ b/DIMS/AveragePeaks.R
@@ -0,0 +1,46 @@
+# load required packages
+library("dplyr")
+library("argparse")
+
+parser <- ArgumentParser(description = "AveragePeaks")
+
+parser$add_argument("--sample_id", dest = "sample_name",
+ help = "Name of a biological sample", required = TRUE)
+parser$add_argument("--tech_reps", dest = "tech_reps",
+ help = "Names of the technical replicates belonging to the biological sample", required = TRUE)
+parser$add_argument("--scanmode", dest = "scanmode",
+ help = "Scan mode (either positive or negative)", required = TRUE)
+parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir",
+ help = "File path to the directory containing functions used", required = TRUE)
+
+args <- parser$parse_args()
+
+# define parameters
+sample_name <- args$sample_id
+tech_reps <- strsplit(args$tech_reps, ";")[[1]]
+
+# load in function scripts 
+source(paste0(args$preprocessing_scripts_dir, "average_peaks_functions.R")) + +# Initialize per sample +peaklist_allrepl <- NULL +nr_repl_persample <- 0 +averaged_peaks <- matrix(0, nrow = 0, ncol = 6) +colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + +# load RData files of technical replicates belonging to biological sample +for (file_nr in 1:length(tech_reps)) { + tech_repl_file <- paste0(tech_reps[file_nr], "_", args$scanmode, ".RData") + tech_repl <- get(load(tech_repl_file)) + # combine data for all technical replicates + peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl) +} +# sort on mass +peaklist_allrepl_df <- as.data.frame(peaklist_allrepl) +peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) +peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) +peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt) + +# average over technical replicates +averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted, sample_name) +save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", args$scanmode, ".RData")) diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf new file mode 100644 index 00000000..65c136ec --- /dev/null +++ b/DIMS/AveragePeaks.nf @@ -0,0 +1,22 @@ +process AveragePeaks { + tag "DIMS AveragePeaks" + label 'AveragePeaks' + container = 'docker://umcugenbioinf/dims:1.4' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(rdata_files) + tuple val(sample_id), val(tech_reps), val(scanmode) + + output: + path 'AvgPeaks_*.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R \ + --sample_id $sample_id \ + --tech_reps $tech_reps \ + --scanmode $scanmode \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir + """ +} diff --git a/DIMS/preprocessing/average_peaks_functions.R b/DIMS/preprocessing/average_peaks_functions.R new file mode 100644 index 00000000..beed29bb --- /dev/null 
+++ b/DIMS/preprocessing/average_peaks_functions.R @@ -0,0 +1,41 @@ +average_peaks_per_sample <- function(peaklist_allrepl_sorted, sample_name) { + #' Average the intensity of peaks that occur in different technical replicates of a biological sample + #' + #' @param peaklist_allrepl_sorted: Dataframe with peaks sorted on median m/z (float) + #' + #' @return averaged_peaks: matrix of averaged peaks (float) + + # initialize + averaged_peaks <- peaklist_allrepl_sorted[0, ] + # set ppm as fixed value, not the same ppm as in peak grouping + ppm_peak <- 2 + + while (nrow(peaklist_allrepl_sorted) > 1) { + # store row numbers + peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted) + # find the peaks in the dataset with corresponding m/z plus or minus tolerance + reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1] + mz_tolerance <- (reference_mass * ppm_peak) / 10^6 + minmz_ref <- reference_mass - mz_tolerance + maxmz_ref <- reference_mass + mz_tolerance + select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref)) + select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ] + nrsamples <- length(select_peak_indices) + # put averaged intensities into a new row + averaged_1peak <- matrix(0, nrow = 1, ncol = 6) + colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + # calculate m/z values for peak group + averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt) + averaged_1peak[1, "fq"] <- nrsamples + averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt) + # remove rownr column and append to averaged_peaks + peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ] + averaged_peaks <- rbind(averaged_peaks, averaged_1peak) + } + # add sample name to first column + averaged_peaks[ , 
"samplenr"] <- sample_name + + return(averaged_peaks) +} From b8ea927def4749b3c06b0192e42e5128e236b826 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 13:58:54 +0100 Subject: [PATCH 22/28] fixed issue with sample_name in DIMS/AveragePeaks step --- DIMS/AveragePeaks.R | 7 +++---- DIMS/AveragePeaks.nf | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R index bdfc6cde..b6dbb69b 100644 --- a/DIMS/AveragePeaks.R +++ b/DIMS/AveragePeaks.R @@ -4,7 +4,7 @@ library("argparse") parser <- ArgumentParser(description = "AveragePeaks") -parser$add_argument("--sample_id", dest = "sample_name", +parser$add_argument("--sample_name", dest = "sample_name", help = "Name of a biological sample", required = TRUE) parser$add_argument("--tech_reps", dest = "tech_reps", help = "Names of the technical replicates belonging to the biological sample", required = TRUE) @@ -16,7 +16,6 @@ parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts args <- parser$parse_args() # define parameters -sample_name <- args$sample_id tech_reps <- strsplit(args$tech_reps, ";")[[1]] # load in function scripts @@ -42,5 +41,5 @@ peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt) # average over technical replicates -averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted, sample_name) -save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", args$scanmode, ".RData")) +averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted, args$sample_name) +save(averaged_peaks, file = paste0("AvgPeaks_", args$sample_name, "_", args$scanmode, ".RData")) diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf index 65c136ec..9a7e5824 100644 --- a/DIMS/AveragePeaks.nf +++ b/DIMS/AveragePeaks.nf @@ -14,7 +14,7 @@ process AveragePeaks { script: """ Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R \ - 
--sample_id $sample_id \ + --sample_name $sample_id \ --tech_reps $tech_reps \ --scanmode $scanmode \ --preprocessing_scripts_dir $params.preprocessing_scripts_dir From bfc997fe4129c74293a4c20deb60ddb69fde0f0d Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 16:02:51 +0100 Subject: [PATCH 23/28] added argument parsing for DIMS/PeakGrouping step --- DIMS/PeakGrouping.R | 38 +++++++++++++++++++++++--------------- DIMS/PeakGrouping.nf | 8 ++++++-- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index 4435b033..93400881 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -1,41 +1,49 @@ -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) - # load required packages library("dplyr") +library("argparse") + +parser <- ArgumentParser(description = "PeakGrouping") + +parser$add_argument("--hmdbpart_file", dest = "hmdbpart_file", + help = "RData file containing part of HMDB database (plus adducts and isotopes)", required = TRUE) +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used", required = TRUE) +parser$add_argument("--ppm", dest = "ppm", + help = "Value for ppm deviation allowed in peak group annotation", required = TRUE) + +args <- parser$parse_args() -hmdb_part_file <- cmd_args[1] -preprocessing_scripts_dir <- cmd_args[2] -ppm <- as.numeric(cmd_args[3]) +hmdbpart_file <- args$hmdbpart_file +ppm <- as.numeric(args$ppm) # load in function scripts -source(paste0(preprocessing_scripts_dir, "peak_grouping_functions.R")) +source(paste0(args$preprocessing_scripts_dir, "peak_grouping_functions.R")) options(digits = 16) # load part of the HMDB -hmdb_add_iso <- get(load(hmdb_part_file)) +hmdb_add_iso <- get(load(args$hmdbpart_file)) # make sure any internal standard entries are at the top of the dataframe internal_standard_indices <- grep("\\(IS\\)", hmdb_add_iso$CompoundName) if 
(length(internal_standard_indices) > 0) { hmdb_add_iso <- rbind(hmdb_add_iso[internal_standard_indices, ], hmdb_add_iso[-internal_standard_indices, ]) } -# determine appropriate scanmode based on hmdb_part_file -if (grepl("negative", basename(hmdb_part_file))) { +# determine appropriate scanmode based on hmdbpart_file +if (grepl("negative", basename(hmdbpart_file))) { scanmode <- "negative" column_label <- "MNeg" -} else if (grepl("positive", basename(hmdb_part_file))) { +} else if (grepl("positive", basename(hmdbpart_file))) { scanmode <- "positive" column_label <- "Mpos" } # determine batch number of HMDB part file -batch_number <- strsplit(basename(hmdb_part_file), ".", fixed = TRUE)[[1]][2] +batch_number <- strsplit(basename(hmdbpart_file), ".", fixed = TRUE)[[1]][2] -# load file with spectrum peaks -spec_peaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData") -load(spec_peaks_file) +# load file with averaged peaks +avg_peaks_file <- paste0("AvgPeaks_", scanmode, ".RData") +load(avg_peaks_file) outlist_df <- as.data.frame(outlist_total) outlist_df$mzmed.pkt <- as.numeric(outlist_df$mzmed.pkt) outlist_df$height.pkt <- as.numeric(outlist_df$height.pkt) diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index 7f3f17e4..d10407f5 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -6,13 +6,17 @@ process PeakGrouping { input: path(hmdbpart_file) - each path(spectrumpeak_file) + path(averagedpeaks_file) + each path(pattern_file) output: path '*_identified.RData', emit: grouped_identified script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $hmdbpart_file $params.preprocessing_scripts_dir $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R \ + --hmdbpart_file $hmdbpart_file \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir \ + --ppm $params.ppm """ } From 9ee9b1be90079b636ed014fe8965e75e5f3adcd8 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 16:16:27 +0100 Subject: [PATCH 24/28] 
added argument parsing for DIMS/CollectAveraged step --- DIMS/CollectAveraged.R | 13 +++++++++++++ DIMS/CollectAveraged.nf | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 DIMS/CollectAveraged.R create mode 100644 DIMS/CollectAveraged.nf diff --git a/DIMS/CollectAveraged.R b/DIMS/CollectAveraged.R new file mode 100644 index 00000000..68f3ab0c --- /dev/null +++ b/DIMS/CollectAveraged.R @@ -0,0 +1,13 @@ +# for each scan mode, collect all averaged peak lists per biological sample +scanmodes <- c("positive", "negative") +for (scanmode in scanmodes) { + # get list of files + filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, ".RData")) + # load files and combine into one object + outlist_total <- NULL + for (file_nr in 1:length(filled_files)) { + peaklist_averaged <- get(load(filled_files[file_nr])) + outlist_total <- rbind(outlist_total, peaklist_averaged) + } + save(outlist_total, file = paste0("AvgPeaks_", scanmode, ".RData")) +} diff --git a/DIMS/CollectAveraged.nf b/DIMS/CollectAveraged.nf new file mode 100644 index 00000000..2f3d1a70 --- /dev/null +++ b/DIMS/CollectAveraged.nf @@ -0,0 +1,17 @@ +process CollectAveraged { + tag "DIMS CollectAveraged" + label 'CollectAveraged' + container = 'docker://umcugenbioinf/dims:1.4' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(averaged_files) + + output: + path('AvgPeaks*.RData'), emit: averaged_peaks + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/CollectAveraged.R + """ +} From d546a7729abcf68c4c72047b3e88dd0d968c2d82 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 16:48:49 +0100 Subject: [PATCH 25/28] added argument parsing for DIMS/FillMissing step --- DIMS/FillMissing.R | 42 ++++++++++----------- DIMS/FillMissing.nf | 5 ++- DIMS/preprocessing/fill_missing_functions.R | 36 ++++++++++++++++++ 3 files changed, 60 insertions(+), 23 deletions(-) create mode 100644 DIMS/preprocessing/fill_missing_functions.R diff --git 
a/DIMS/FillMissing.R b/DIMS/FillMissing.R index a76c163b..e349fc0e 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -1,25 +1,22 @@ -## adapted from 9-runFillMissing.R - # define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +library("argparse") + +parser <- ArgumentParser(description = "FillMissing") + +parser$add_argument("--peakgrouplist_file", dest = "peakgrouplist_file", + help = "Peak group list file", required = TRUE) +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used", required = TRUE) +parser$add_argument("--thresh_noise", dest = "thresh_noise", + help = "Threshold value for noise on peak level (typically 2000)", required = TRUE) + +args <- parser$parse_args() -peakgrouplist_file <- cmd_args[1] -scripts_dir <- cmd_args[2] -thresh <- as.numeric(cmd_args[3]) -resol <- as.numeric(cmd_args[4]) -ppm <- as.numeric(cmd_args[5]) -outdir <- "./" +peakgrouplist_file <- args$peakgrouplist_file +thresh <- as.numeric(args$thresh_noise) # load in function scripts -source(paste0(scripts_dir, "replace_zeros.R")) -source(paste0(scripts_dir, "fit_optim.R")) -source(paste0(scripts_dir, "get_fwhm.R")) -source(paste0(scripts_dir, "get_stdev.R")) -source(paste0(scripts_dir, "estimate_area.R")) -source(paste0(scripts_dir, "optimize_gaussfit.R")) -source(paste0(scripts_dir, "identify_noisepeaks.R")) -source(paste0(scripts_dir, "get_element_info.R")) -source(paste0(scripts_dir, "atomic_info.R")) +source(paste0(args$preprocessing_scripts_dir, "fill_missing_functions.R")) # determine scan mode if (grepl("_pos", peakgrouplist_file)) { @@ -33,12 +30,13 @@ pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) # load peak group list and determine output file name -outpgrlist_identified <- get(load(peakgrouplist_file)) - -outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) +peakgroup_list <- 
get(load(peakgrouplist_file)) # replace missing values (zeros) with random noise -peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) +peakgrouplist_filled <- fill_missing_intensities(peakgroup_list, repl_pattern, thresh) + +# set name of output file +outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) # save output save(peakgrouplist_filled, file = outputfile_name) diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf index 7678c95e..d52c31ce 100644 --- a/DIMS/FillMissing.nf +++ b/DIMS/FillMissing.nf @@ -13,6 +13,9 @@ process FillMissing { script: """ - Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R \ + --peakgrouplist_file $peakgrouplist_file \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir \ + --thresh_noise $params.thresh """ } diff --git a/DIMS/preprocessing/fill_missing_functions.R b/DIMS/preprocessing/fill_missing_functions.R new file mode 100644 index 00000000..e1c455d5 --- /dev/null +++ b/DIMS/preprocessing/fill_missing_functions.R @@ -0,0 +1,36 @@ +fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, not_random = FALSE) { + #' Replace intensities that are zero with random value + #' + #' @param peakgroup_list: Peak groups (matrix) + #' @param repl_pattern: Replication pattern (list of strings) + #' @param thresh: Value for threshold between noise and signal (integer) + #' + #' @return final_outlist: peak groups with filled-in intensities (matrix) + + # for unit test, turn off randomness + if (not_random) { + set.seed(123) + } + + # replace missing intensities with random values around threshold + if (!is.null(peakgroup_list)) { + for (sample_index in seq_along(names(repl_pattern))) { + sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]] + zero_intensity <- which(sample_peaks <= 
0) + if (!length(zero_intensity)) { + next + } + for (zero_index in seq_along(zero_intensity)) { + peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, + mean = thresh, + sd = 100) + } + } + + # Add column with average intensity; find intensity columns first + int_cols <- which(colnames(peakgroup_list) %in% names(repl_pattern)) + peakgroup_list <- cbind(peakgroup_list, "avg.int" = apply(peakgroup_list[, int_cols], 1, mean)) + + return(peakgroup_list) + } +} From 4ed43c484b059c339af0f8e4961d9c41e81627a8 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 16:49:55 +0100 Subject: [PATCH 26/28] added argument parsing for DIMS/CollectFilled step --- DIMS/CollectFilled.R | 30 +++-- DIMS/CollectFilled.nf | 7 +- DIMS/preprocessing/collect_filled_functions.R | 118 ++++++++++++++++++ 3 files changed, 139 insertions(+), 16 deletions(-) create mode 100644 DIMS/preprocessing/collect_filled_functions.R diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index 265db5d3..a0e4136f 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -1,17 +1,21 @@ -# load package +# define parameters library("argparse") parser <- ArgumentParser(description = "CollectFilled") -parser$add_argument("--scripts_dir", dest = "scripts_dir", +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", help = "File path to the directory containing functions used in this script", required = TRUE) -parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", - help = "Numeric value, z-score = 1", required = TRUE) +parser$add_argument("--ppm_value", dest = "ppm_value", + help = "Value for allowed ppm deviation (e.g. 
5)", required = TRUE) +parser$add_argument("--z_score", dest = "z_score", + help = "Boolean indicating whether Z-scores should be calculated (1) or not (0)", required = TRUE) args <- parser$parse_args() -source(paste0(args$scripts_dir, "merge_duplicate_rows.R")) -source(paste0(args$scripts_dir, "calculate_zscores.R")) +ppm <- as.numeric(args$ppm_value) +z_score <- as.numeric(args$z_score) + +source(paste0(args$preprocessing_scripts_dir, "collect_filled_functions.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") @@ -32,8 +36,8 @@ for (scanmode in scanmodes) { pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) # calculate Z-scores - if (args$z_score == 1) { - outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE) + if (z_score == 1) { + outlist_stats <- calculate_zscores(outlist_total) nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) outlist_stats_more <- cbind( @@ -53,11 +57,11 @@ for (scanmode in scanmodes) { # make a copy of the outlist outlist_ident <- outlist_total - # take care of NAs in theormz_noise - outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0 - outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise) - outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0 - outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise) + # select identified peak groups if ppm deviation is within limits + if (z_score == 1) { + outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev) + outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ] + } # Extra output in Excel-readable format: remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index 
1f174448..2773c1f5 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -14,8 +14,9 @@ process CollectFilled { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R \\ - --scripts_dir $params.scripts_dir \\ - --z_score $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir \ + --ppm_value $params.ppm \ + --z_score $params.zscore """ } diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R new file mode 100644 index 00000000..979dd1d1 --- /dev/null +++ b/DIMS/preprocessing/collect_filled_functions.R @@ -0,0 +1,118 @@ +# CollectFilled functions + +collapse <- function(column_label, peakgroup_list, index_dup) { + #' Collapse identification info for peak groups with the same mass + #' + #' @param column_label: Name of column in peakgroup_list (string) + #' @param peakgroup_list: Peak group list (matrix) + #' @param index_dup: Index of duplicate peak group (integer) + #' + #' @return collapsed_items: Semicolon-separated list of info (string) + # get the item(s) that need to be collapsed + list_items <- as.vector(peakgroup_list[index_dup, column_label]) + # remove NA + if (length(which(is.na(list_items))) > 0) { + list_items <- list_items[-which(is.na(list_items))] + } + collapsed_items <- paste(list_items, collapse = ";") + return(collapsed_items) +} + +merge_duplicate_rows <- function(peakgroup_list) { + #' Merge identification info for peak groups with the same mass + #' + #' @param peakgroup_list: Peak group list (matrix) + #' + #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) + + options(digits = 16) + collect <- NULL + remove <- NULL + + # check for peak groups with identical mass + index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"])) + + while (length(index_dup) > 0) { + # get the index for the peak group which is double + peaklist_index <- which(peakgroup_list[, 
"mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"]) + single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE] + + # use function collapse to concatenate info + single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index) + single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index) + single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index) + single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index) + single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index) + if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA + + # keep track of deduplicated entries + collect <- rbind(collect, single_peakgroup) + remove <- c(remove, peaklist_index) + + # remove current entry from index + index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])] + } + + # remove duplicate entries + if (!is.null(remove)) { + peakgroup_list <- peakgroup_list[-remove, ] + } + # append deduplicated entries + peakgroup_list_dedup <- rbind(peakgroup_list, collect) + return(peakgroup_list_dedup) +} + +calculate_zscores <- function(peakgroup_list) { + #' Calculate Z-scores for peak groups based on average and standard deviation of controls + #' + #' @param peakgroup_list: Peak group list (matrix) + #' @param sort_col: Column to sort on (string) + #' @param adducts: Parameter indicating whether there are adducts in the list (boolean) + #' + #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) + + case_label <- "P" + control_label <- "C" + # get index for new column names + startcol <- ncol(peakgroup_list) + 3 + + # calculate mean and standard deviation for Control group + ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE) + case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE) 
+ int_cols <- c(ctrl_cols, case_cols) + # set all zeros to NA + peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA + ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE] + peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) + peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) + + # set new column names and calculate Z-scores + colnames_zscores <- NULL + for (col_index in int_cols) { + col_name <- colnames(peakgroup_list)[col_index] + colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore")) + zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) - + peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls + peakgroup_list <- cbind(peakgroup_list, zscores_1col) + } + + # apply new column names to columns at end plus avg and sd columns + colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores + + # add ppm deviation column + zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE) + # calculate ppm deviation + for (row_index in seq_len(nrow(peakgroup_list))) { + if (!is.na(peakgroup_list$theormz_HMDB[row_index]) && + !is.null(peakgroup_list$theormz_HMDB[row_index]) && + (peakgroup_list$theormz_HMDB[row_index] != "")) { + peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) - + as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) / + as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index])) + } else { + peakgroup_list$ppmdev[row_index] <- NA + } + } + + return(peakgroup_list) +} From 7b0d9b68cec9b26d1d6dcbb6aed7307142bbd0cf Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 17:19:03 +0100 Subject: [PATCH 27/28] added argument parsing for DIMS/SumAdducts step --- DIMS/SumAdducts.R | 23 +++++++++++++++++------ DIMS/SumAdducts.nf | 5 ++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/DIMS/SumAdducts.R 
b/DIMS/SumAdducts.R index 489ae179..bfd40161 100644 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -1,13 +1,24 @@ +# load libraries library("dplyr") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +library("argparse") + +parser <- ArgumentParser(description = "SumAdducts") + +parser$add_argument("--hmdbpart_main_file", dest = "hmdbpart_main_file", + help = "RData file containing part of HMDB database (only main metabolites)", required = TRUE) +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used", required = TRUE) +parser$add_argument("--z_score", dest = "z_score", + help = "Boolean indicating whether Z-scores should be calculated (1) or not (0)", required = TRUE) -hmdbpart_main_file <- cmd_args[1] -scripts_dir <- cmd_args[2] -z_score <- as.numeric(cmd_args[3]) +args <- parser$parse_args() + +# define parameters +hmdbpart_main_file <- args$hmdbpart_main_file +z_score <- as.numeric(args$z_score) # load in function scripts -source(paste0(scripts_dir, "sum_intensities_adducts.R")) +source(paste0(args$preprocessing_scripts_dir, "sum_intensities_adducts.R")) if (grepl("positive_hmdb", hmdbpart_main_file)) { scanmode <- "positive" diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf index a667e6e0..6b67c6a6 100644 --- a/DIMS/SumAdducts.nf +++ b/DIMS/SumAdducts.nf @@ -13,6 +13,9 @@ process SumAdducts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/SumAdducts.R $hmdbpart_main_file $params.preprocessing_scripts_dir $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/SumAdducts.R \ + --hmdbpart_main_file $hmdbpart_main_file \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir \ + --z_score $params.zscore """ } From d62a7025b77328b63379b3113885303db6e9b0ac Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 17:28:10 +0100 Subject: [PATCH 28/28] added argument parsing for DIMS/CollectSumAdducts step --- 
DIMS/CollectSumAdducts.R | 12 ++++++------ DIMS/CollectSumAdducts.nf | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R index 10752add..2bf4cf0b 100755 --- a/DIMS/CollectSumAdducts.R +++ b/DIMS/CollectSumAdducts.R @@ -1,12 +1,12 @@ -## Combining all AdductSums part files for each scanmode and -# combine intensities if present in both scanmodes -library("argparse") +# load libraries suppressMessages(library("dplyr")) +library("argparse") -parser <- ArgumentParser(description = "GenerateExcel") +# define parameters +parser <- ArgumentParser(description = "CollectSumAdducts") parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", - help = "File path to the directory containing functions used in this script", required = TRUE) + help = "File path to the directory containing functions used", required = TRUE) args <- parser$parse_args() @@ -18,6 +18,6 @@ save(outlist_tot_pos, file = "AdductSums_positive.RData") outlist_tot_neg <- combine_sum_adduct_parts("negative") save(outlist_tot_neg, file = "AdductSums_negative.RData") -# combine intensities of both scanmodi +# combine intensities of both scan modes outlist <- combine_scanmodes_intensities(outlist_tot_pos, outlist_tot_neg) save(outlist, file = "AdductSums_combined.RData") diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf index 06aa82c8..ee8ec229 100644 --- a/DIMS/CollectSumAdducts.nf +++ b/DIMS/CollectSumAdducts.nf @@ -13,7 +13,7 @@ process CollectSumAdducts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R \\ - --preprocessing_scripts_dir $params.preprocessing_scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir """ }