UMCUGenetics · ALuesink · Aug 25, 2025 · Aug 25, 2025 · Sep 1, 2025 · Sep 1, 2025
diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
@@ -1,19 +1,25 @@
 # load required packages
+library("argparse")
 suppressPackageStartupMessages(library("xcms"))
 
-# define parameters
-cmd_args <- commandArgs(trailingOnly = TRUE)
+parser <- ArgumentParser(description = "AssignToBins")
 
-mzml_filepath <- cmd_args[1]
-breaks_filepath <- cmd_args[2]
-resol <- as.numeric(cmd_args[3])
+parser$add_argument("--mzML_filepath", dest = "mzml_filepath",
+                    help = "File path for the mzML file", required = TRUE)
+parser$add_argument("--breaks_filepath", dest = "breaks_filepath",
+                    help = "File path for the breaks RData file", required = TRUE)
+parser$add_argument("--trim_params_filepath", dest = "trim_params_filepath",
+                    help = "File path for the trim parameters", required = TRUE)
 
-# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg,
-# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos
-load(breaks_filepath)
+args <- parser$parse_args()
+
+# load breaks_file: contains breaks_fwhm and breaks_fwhm_avg
+load(args$breaks_filepath)
+# load trim parameters trim_left_neg, trim_left_pos, trim_right_neg and trim_right_pos
+load(args$trim_params_filepath)
 
 # get sample name
-techrep_name <- sub("\\..*$", "", basename(mzml_filepath))
+techrep_name <- sub("\\..*$", "", basename(args$mzml_filepath))
 
 options(digits = 16)
 
@@ -26,7 +32,7 @@ neg_bins <- bins
 dims_thresh <- 100
 
 # read in the data for 1 sample
-raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath))
+raw_data <- suppressMessages(xcms::xcmsRaw(args$mzml_filepath))
 
 # Generate a matrix with retention times and intensities
 raw_data_matrix <- xcms::rawMat(raw_data)
@@ -41,13 +47,13 @@ neg_times_trimmed <- neg_times[neg_times > trim_left_neg & neg_times < trim_righ
 # get TIC intensities for areas between trim_left and trim_right
 tic_intensity_persample <- cbind(raw_data@scantime, raw_data@tic)
 colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity")
-tic_intensity_pos <- tic_intensity_persample[tic_intensity_persample[ , "retention_time"] > min(pos_times_trimmed) &
-                                             tic_intensity_persample[ , "retention_time"] < max(pos_times_trimmed), ]
-tic_intensity_neg <- tic_intensity_persample[tic_intensity_persample[ , "retention_time"] > min(neg_times_trimmed) &
-                                             tic_intensity_persample[ , "retention_time"] < max(neg_times_trimmed), ]
+tic_intensity_pos <- tic_intensity_persample[tic_intensity_persample[, "retention_time"] > min(pos_times_trimmed) &
+                                               tic_intensity_persample[, "retention_time"] < max(pos_times_trimmed), ]
+tic_intensity_neg <- tic_intensity_persample[tic_intensity_persample[, "retention_time"] > min(neg_times_trimmed) &
+                                               tic_intensity_persample[, "retention_time"] < max(neg_times_trimmed), ]
 # calculate weighted mean of intensities for pos and neg separately
-mean_pos <- weighted.mean(tic_intensity_pos[ , "tic_intensity"], tic_intensity_pos[ , "tic_intensity"])
-mean_neg <- weighted.mean(tic_intensity_neg[ , "tic_intensity"], tic_intensity_neg[ , "tic_intensity"])
+mean_pos <- weighted.mean(tic_intensity_pos[, "tic_intensity"], tic_intensity_pos[, "tic_intensity"])
+mean_neg <- weighted.mean(tic_intensity_neg[, "tic_intensity"], tic_intensity_neg[, "tic_intensity"])
 # intensity per scan should be at least 80% of weighted mean
 dims_thresh_pos <- 0.8 * mean_pos
 dims_thresh_neg <- 0.8 * mean_neg
@@ -67,17 +73,17 @@ neg_raw_data_matrix <- raw_data_matrix[neg_index, ]
 
 # Get index for binning intensity values
 bin_indices_pos <- cut(
-  pos_raw_data_matrix[, "mz"], 
+  pos_raw_data_matrix[, "mz"],
   breaks_fwhm,
-  include.lowest = TRUE, 
-  right = TRUE, 
+  include.lowest = TRUE,
+  right = TRUE,
   labels = FALSE
 )
 bin_indices_neg <- cut(
-  neg_raw_data_matrix[, "mz"], 
-  breaks_fwhm, 
-  include.lowest = TRUE, 
-  right = TRUE, 
+  neg_raw_data_matrix[, "mz"],
+  breaks_fwhm,
+  include.lowest = TRUE,
+  right = TRUE,
   labels = FALSE
 )
 

diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf
@@ -1,19 +1,22 @@
 process AssignToBins {
     tag "DIMS AssignToBins ${file_id}"
     label 'AssignToBins'
-    container = 'docker://umcugenbioinf/dims:1.3'
+    container = 'docker://umcugenbioinf/dims:1.4'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple(val(file_id), path(mzML_file), path(breaks_file))
+       tuple(val(file_id), path(mzML_file), path(breaks_file), path(trim_params_file))
 
     output:
        path("${file_id}.RData"), emit: rdata_file
        path("${file_id}_TIC.txt"), emit: tic_txt_file
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $params.resolution
+        Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R \
+                --mzML_filepath $mzML_file \
+                --breaks_filepath $breaks_file \
+                --trim_params_filepath $trim_params_file
         """
 }
 

diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R
@@ -0,0 +1,50 @@
+# combine and average the peak lists of all technical replicates of one biological sample
+
+# load required packages
+library("dplyr")
+library("argparse")
+
+parser <- ArgumentParser(description = "AveragePeaks")
+
+parser$add_argument("--sample_name", dest = "sample_name",
+                    help = "Name of a biological sample", required = TRUE)
+parser$add_argument("--tech_reps", dest = "tech_reps",
+                    help = "Names of the technical replicates belonging to the biological sample", required = TRUE)
+parser$add_argument("--scanmode", dest = "scanmode",
+                    help = "Scan mode (either positive or negative)", required = TRUE)
+parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir",
+                    help = "File path to the directory containing functions used", required = TRUE)
+
+args <- parser$parse_args()
+
+# define parameters: technical replicate names arrive as one ";"-separated string
+tech_reps <- strsplit(args$tech_reps, ";")[[1]]
+if (length(tech_reps) == 0) {
+  stop("No technical replicates given for sample ", args$sample_name, call. = FALSE)
+}
+
+# load in function scripts; file.path() works with or without a trailing slash on the directory
+source(file.path(args$preprocessing_scripts_dir, "average_peaks_functions.R"))
+
+# Initialize per sample
+peaklist_allrepl <- NULL
+nr_repl_persample <- 0
+averaged_peaks <- matrix(0, nrow = 0, ncol = 6)
+colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
+
+# load RData files of technical replicates belonging to biological sample
+for (file_nr in seq_along(tech_reps)) {
+  tech_repl_file <- paste0(tech_reps[file_nr], "_", args$scanmode, ".RData")
+  tech_repl <- get(load(tech_repl_file))
+  # combine data for all technical replicates
+  peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl)
+}
+# convert mass and height to numeric, then sort on mass
+peaklist_allrepl_df <- as.data.frame(peaklist_allrepl)
+peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt)
+peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt)
+peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt)
+
+# average over technical replicates
+averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted, args$sample_name)
+save(averaged_peaks, file = paste0("AvgPeaks_", args$sample_name, "_", args$scanmode, ".RData"))
diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf
@@ -0,0 +1,22 @@
+process AveragePeaks {
+    tag "DIMS AveragePeaks"
+    label 'AveragePeaks'
+    container = 'docker://umcugenbioinf/dims:1.4'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(rdata_files)
+       tuple val(sample_id), val(tech_reps), val(scanmode)
+
+    output:
+       path 'AvgPeaks_*.RData'
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R \
+                --sample_name $sample_id \
+                --tech_reps $tech_reps \
+                --scanmode $scanmode \
+                --preprocessing_scripts_dir $params.preprocessing_scripts_dir
+        """
+}
diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
@@ -1,28 +1,37 @@
 # adapted from 3-AverageTechReplicates.R
 
 # load packages
+library("argparse")
 library("ggplot2")
 library("gridExtra")
 
-# define parameters
-cmd_args <- commandArgs(trailingOnly = TRUE)
+parser <- ArgumentParser(description = "AverageTechReplicates")
 
-init_file <- cmd_args[1]
-nr_replicates <- as.numeric(cmd_args[2])
-run_name <- cmd_args[3]
-dims_matrix <- cmd_args[4]
-highest_mz_file <- cmd_args[5]
-highest_mz <- get(load(highest_mz_file))
-breaks_filepath <- cmd_args[6]
-thresh2remove <- as.numeric(cmd_args[7])
+parser$add_argument("--init_filepath", dest = "init_file",
+                    help = "File path for the init RData file", required = TRUE)
+parser$add_argument("-n", "--nr_replicates", dest = "nr_replicates", type = "integer",
+                    help = "Number of replicates", required = TRUE)
+parser$add_argument("--run_name", dest = "run_name",
+                    help = "The run name/analysis ID", required = TRUE)
+parser$add_argument("--matrix", dest = "dims_matrix",
+                    help = "The matrix used, e.g. Plasma, Research, ...", required = TRUE)
+parser$add_argument("--highest_mz_file", dest = "highest_mz_file",
+                    help = "File path for the highest Mz RData file", required = TRUE)
+parser$add_argument("--breaks_filepath", dest = "breaks_filepath",
+                    help = "File path for the breaks RData file", required = TRUE)
+
+args <- parser$parse_args()
+
+highest_mz <- get(load(args$highest_mz_file))
+thresh2remove <- 1000000000
 
 remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) {
   # collect list of samples to remove from replication pattern
   remove_from_group <- NULL
-  for (sample_nr in 1:length(repl_pattern)){
+  for (sample_nr in seq_along(repl_pattern)){
     repl_pattern_1sample <- repl_pattern[[sample_nr]]
     remove <- NULL
-    for (file_nr in 1:length(repl_pattern_1sample)) {
+    for (file_nr in seq_along(repl_pattern_1sample)) {
       if (repl_pattern_1sample[file_nr] %in% bad_samples) {
         remove <- c(remove, file_nr)
       }
@@ -41,11 +50,11 @@ remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) {
 }
 
 # load init_file: contains repl_pattern
-load(init_file)
+load(args$init_file)
 
 # load breaks_file: contains breaks_fwhm, breaks_fwhm_avg,
 # trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos
-load(breaks_filepath)
+load(args$breaks_filepath)
 
-# lower the threshold for non Plasma matrices
-if (dims_matrix != "Plasma") {
+# lower the threshold for non Plasma matrices (matrix is given by the --matrix argument)
+if (args$dims_matrix != "Plasma") {
@@ -64,15 +73,15 @@ if (highest_mz > 700) {
 remove_neg <- NULL
 remove_pos <- NULL
 cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
-for (sample_nr in 1:length(repl_pattern)) {
+for (sample_nr in seq_along(repl_pattern)) {
   tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
   tech_reps_array_pos <- NULL
   tech_reps_array_neg <- NULL
   sum_neg <- 0
   sum_pos <- 0
   nr_pos <- 0
   nr_neg <- 0
-  for (file_nr in 1:length(tech_reps)) {
+  for (file_nr in seq_along(tech_reps)) {
     load(paste(tech_reps[file_nr], ".RData", sep = ""))
     cat("\n\nParsing", tech_reps[file_nr])
     # negative scanmode
@@ -96,7 +105,7 @@ for (sample_nr in 1:length(repl_pattern)) {
     }
     tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos)
   }
-  # save to file  
+  # save to file
   if (nr_neg != 0) {
     sum_neg[, 1] <- sum_neg[, 1] / nr_neg
     colnames(sum_neg) <- names(repl_pattern)[sample_nr]
@@ -109,25 +118,25 @@ for (sample_nr in 1:length(repl_pattern)) {
   }
 }
 
-pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates)
+pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, args$nr_replicates)
 repl_pattern_filtered <- pattern_list$pattern
 save(repl_pattern_filtered, file = "negative_repl_pattern.RData")
 write.table(
-  remove_neg, 
-  file = "miss_infusions_negative.txt", 
-  row.names = FALSE, 
-  col.names = FALSE, 
+  remove_neg,
+  file = "miss_infusions_negative.txt",
+  row.names = FALSE,
+  col.names = FALSE,
   sep = "\t"
 )
 
-pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates)
+pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, args$nr_replicates)
 repl_pattern_filtered <- pattern_list$pattern
 save(repl_pattern_filtered, file = "positive_repl_pattern.RData")
 write.table(
-  remove_pos, 
-  file = "miss_infusions_positive.txt", 
-  row.names = FALSE, 
-  col.names = FALSE, 
+  remove_pos,
+  file = "miss_infusions_positive.txt",
+  row.names = FALSE,
+  col.names = FALSE,
   sep = "\t"
 )
 
@@ -150,10 +159,10 @@ for (file in tic_files) {
 # create a list with information for all TIC plots
 tic_plot_list <- list()
 plot_nr <-  0
-for (sample_nr in c(1:length(repl_pattern))) {
+for (sample_nr in seq_along(repl_pattern)) {
   tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
   sample_name <- names(repl_pattern)[sample_nr]
-  for (file_nr in 1:length(tech_reps)) {
+  for (file_nr in seq_along(tech_reps)) {
     plot_nr <- plot_nr + 1
     # read file with retention time, intensity and dims_threshold values
     repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt"))
@@ -163,7 +172,7 @@ for (sample_nr in c(1:length(repl_pattern))) {
     # for replicates with bad TIC, determine what color the border of the plot should be
     bad_color_pos <- tech_reps[file_nr] %in% remove_pos
     bad_color_neg <- tech_reps[file_nr] %in% remove_neg
-    if (bad_color_neg & bad_color_pos) {
+    if (bad_color_neg && bad_color_pos) {
       plot_color <- "#F8766D"
     } else if (bad_color_pos) {
       plot_color <- "#ED8141"
@@ -191,19 +200,19 @@ for (sample_nr in c(1:length(repl_pattern))) {
 }
 
 # create a layout matrix dependent on number of replicates
-layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE)
+layout <- matrix(1:(10 * args$nr_replicates), 10, args$nr_replicates, TRUE)
-# put TIC plots in matrix
+# put TIC plots in matrix; bquote() inserts the run name here, g and npages stay unevaluated for marrangeGrob
 tic_plot_pdf <- marrangeGrob(
   grobs = tic_plot_list,
-  nrow = 10, ncol = nr_replicates,
+  nrow = 10, ncol = args$nr_replicates,
   layout_matrix = layout,
-  top = quote(paste(
-    "TICs of run", run_name,
+  top = bquote(paste(
+    "TICs of run", .(args$run_name),
     " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ",
     g, "/", npages
   ))
 )
 
 # save to file
-ggsave(filename = paste0(run_name, "_TICplots.pdf"),
+ggsave(filename = paste0(args$run_name, "_TICplots.pdf"),
        tic_plot_pdf, width = 21, height = 29.7, units = "cm")
diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
@@ -1,7 +1,7 @@
 process AverageTechReplicates {
     tag "DIMS AverageTechReplicates"
     label 'AverageTechReplicates'
-    container = 'docker://umcugenbioinf/dims:1.3'
+    container = 'docker://umcugenbioinf/dims:1.4'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
@@ -23,13 +23,13 @@ process AverageTechReplicates {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \
-                                                                      $params.nr_replicates \
-                                                                      $analysis_id \
-                                                                      $matrix \
-                                                                      $highest_mz_file \
-                                                                      $breaks_file \
-                                                                      $params.threshold_tics
+        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R \\
+            --init_filepath $init_file \\
+            --nr_replicates $params.nr_replicates \\
+            --run_name $analysis_id \\
+            --matrix $matrix \\
+            --highest_mz_file $highest_mz_file \\
+            --breaks_filepath $breaks_file
         """
 }