From d0e71eb4e9268d996e6170f4b2a5786c7f7da476 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 25 Aug 2025 13:09:09 +0200 Subject: [PATCH 01/28] Added argparse to AssignToBins --- DIMS/AssignToBins.R | 22 ++++++++++++++-------- DIMS/AssignToBins.nf | 6 ++++-- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index d1c17ba7..88801264 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -1,19 +1,25 @@ # load required packages +library("argparse") suppressPackageStartupMessages(library("xcms")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "AssignToBins") -mzml_filepath <- cmd_args[1] -breaks_filepath <- cmd_args[2] -resol <- as.numeric(cmd_args[3]) +parser$add_argument("--mzML_filepath", dest = "mzml_filepath", + help = "File path for the mzML file", required = TRUE) +parser$add_argument("--breaks_filepath", dest = "breaks_filepath", + help = "File path for the breaks RData file", required = TRUE) + +args <- parser$parse_args() + +mzml_filepath <- args$mzml_filepath +breaks_filepath <- args$breaks_filepath # load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, # trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos -load(breaks_filepath) +load(args$breaks_filepath) # get sample name -techrep_name <- sub("\\..*$", "", basename(mzml_filepath)) +techrep_name <- sub("\\..*$", "", basename(args$mzml_filepath)) options(digits = 16) @@ -25,7 +31,7 @@ pos_bins <- bins neg_bins <- bins # read in the data for 1 sample -raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath)) +raw_data <- suppressMessages(xcms::xcmsRaw(args$mzml_filepath)) # Generate a matrix with retention times and intensities raw_data_matrix <- xcms::rawMat(raw_data) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index d0a7098b..df3b089a 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -1,7 +1,7 @@ process AssignToBins { tag "DIMS AssignToBins 
${file_id}" label 'AssignToBins' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -13,7 +13,9 @@ process AssignToBins { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R \\ + --mzML_filepath $mzML_file \\ + --breaks_filepath $breaks_file """ } From d5bac97e77696a51a36991b78fee3a35272280ed Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 25 Aug 2025 13:12:57 +0200 Subject: [PATCH 02/28] Removed unused arguments --- DIMS/AssignToBins.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 88801264..66cc05ee 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -11,9 +11,6 @@ parser$add_argument("--breaks_filepath", dest = "breaks_filepath", args <- parser$parse_args() -mzml_filepath <- args$mzml_filepath -breaks_filepath <- args$breaks_filepath - # load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, # trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos load(args$breaks_filepath) From 97d5626fab955e63f988bfacd12eea5bf6ba73e8 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 15:49:24 +0200 Subject: [PATCH 03/28] Fixed linting issues --- DIMS/AssignToBins.R | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 66cc05ee..824bc0ce 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -43,13 +43,13 @@ neg_times_trimmed <- neg_times[neg_times > trim_left_neg & neg_times < trim_righ # get TIC intensities for areas between trim_left and trim_right tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic) colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity") -tic_intensity_pos <- tic_intensity_persample[tic_intensity_persample[ , 
"retention_time"] > min(pos_times_trimmed) & - tic_intensity_persample[ , "retention_time"] < max(pos_times_trimmed), ] -tic_intensity_neg <- tic_intensity_persample[tic_intensity_persample[ , "retention_time"] > min(neg_times_trimmed) & - tic_intensity_persample[ , "retention_time"] < max(neg_times_trimmed), ] +tic_intensity_pos <- tic_intensity_persample[tic_intensity_persample[, "retention_time"] > min(pos_times_trimmed) & + tic_intensity_persample[, "retention_time"] < max(pos_times_trimmed), ] +tic_intensity_neg <- tic_intensity_persample[tic_intensity_persample[, "retention_time"] > min(neg_times_trimmed) & + tic_intensity_persample[, "retention_time"] < max(neg_times_trimmed), ] # calculate weighted mean of intensities for pos and neg separately -mean_pos <- weighted.mean(tic_intensity_pos[ , "tic_intensity"], tic_intensity_pos[ , "tic_intensity"]) -mean_neg <- weighted.mean(tic_intensity_neg[ , "tic_intensity"], tic_intensity_neg[ , "tic_intensity"]) +mean_pos <- weighted.mean(tic_intensity_pos[, "tic_intensity"], tic_intensity_pos[, "tic_intensity"]) +mean_neg <- weighted.mean(tic_intensity_neg[, "tic_intensity"], tic_intensity_neg[, "tic_intensity"]) # intensity per scan should be at least 80% of weighted mean dims_thresh_pos <- 0.8 * mean_pos dims_thresh_neg <- 0.8 * mean_neg @@ -69,17 +69,17 @@ neg_raw_data_matrix <- raw_data_matrix[neg_index, ] # Get index for binning intensity values bin_indices_pos <- cut( - pos_raw_data_matrix[, "mz"], + pos_raw_data_matrix[, "mz"], breaks_fwhm, - include.lowest = TRUE, - right = TRUE, + include.lowest = TRUE, + right = TRUE, labels = FALSE ) bin_indices_neg <- cut( - neg_raw_data_matrix[, "mz"], - breaks_fwhm, - include.lowest = TRUE, - right = TRUE, + neg_raw_data_matrix[, "mz"], + breaks_fwhm, + include.lowest = TRUE, + right = TRUE, labels = FALSE ) @@ -90,8 +90,8 @@ if (nrow(pos_raw_data_matrix) > 0) { pos_raw_data_matrix[is.na(pos_raw_data_matrix[, "intensity"]), "intensity"] <- 0 # aggregate intensities, 
calculate mean, use only values above dims_thresh_pos aggr_int_pos <- stats::aggregate(pos_raw_data_matrix[, "intensity"], - list(bin_indices_pos), - FUN = function(x) { mean(x[which(x > dims_thresh_pos)]) }) + list(bin_indices_pos), + FUN = function(x) {mean(x[which(x > dims_thresh_pos)])}) # set NA to zero in second column aggr_int_pos[is.na(aggr_int_pos[, 2]), 2] <- 0 pos_bins[aggr_int_pos[, 1]] <- aggr_int_pos[, 2] @@ -101,8 +101,8 @@ if (nrow(neg_raw_data_matrix) > 0) { neg_raw_data_matrix[is.na(neg_raw_data_matrix[, "intensity"]), "intensity"] <- 0 # aggregate intensities, calculate mean, use only values above dims_thresh_neg aggr_int_neg <- stats::aggregate(neg_raw_data_matrix[, "intensity"], - list(bin_indices_neg), - FUN = function(x) { mean(x[which(x > dims_thresh_neg)]) }) + list(bin_indices_neg), + FUN = function(x) {mean(x[which(x > dims_thresh_neg)])}) # set NA to zero in second column aggr_int_neg[is.na(aggr_int_neg[, 2]), 2] <- 0 neg_bins[aggr_int_neg[, 1]] <- aggr_int_neg[, 2] From 8a250379adffdeef04f9925354434949478d44c0 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 15:52:13 +0200 Subject: [PATCH 04/28] Added argparse to AverageTechReplicates and fixed linting issues --- DIMS/AverageTechReplicates.R | 83 ++++++++++++++++++++--------------- DIMS/AverageTechReplicates.nf | 15 ++++--- 2 files changed, 55 insertions(+), 43 deletions(-) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 8e13e634..bfacf70e 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -1,28 +1,37 @@ # adapted from 3-AverageTechReplicates.R # load packages +library("argparse") library("ggplot2") library("gridExtra") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "AverageTechReplicates") -init_file <- cmd_args[1] -nr_replicates <- as.numeric(cmd_args[2]) -run_name <- cmd_args[3] -dims_matrix <- cmd_args[4] -highest_mz_file <- cmd_args[5] -highest_mz <- 
get(load(highest_mz_file)) -breaks_filepath <- cmd_args[6] +parser$add_argument("--init_filepath", dest = "init_file", + help = "File path for the init RData file", required = TRUE) +parser$add_argument("-n", "--nr_replicates", dest = "nr_replicates", type = "integer", + help = "Number of replicates", required = TRUE) +parser$add_argument("--run_name", dest = "run_name", + help = "The run name/analysis ID", required = TRUE) +parser$add_argument("--matrix", dest = "dims_matrix", + help = "The matrix used, e.g. Plasma, Research, ...") +parser$add_argument("--highest_mz_file", dest = "highest_mz_file", + help = "File path for the highest Mz RData file", required = TRUE) +parser$add_argument("--breaks_filepath", dest = "breaks_filepath", + help = "File path for the breaks RData file", required = TRUE) + +args <- parser$parse_args() + +highest_mz <- get(load(args$highest_mz_file)) thresh2remove <- 1000000000 remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { # collect list of samples to remove from replication pattern remove_from_group <- NULL - for (sample_nr in 1:length(repl_pattern)){ + for (sample_nr in seq_along(repl_pattern)){ repl_pattern_1sample <- repl_pattern[[sample_nr]] remove <- NULL - for (file_nr in 1:length(repl_pattern_1sample)) { + for (file_nr in seq_along(repl_pattern_1sample)) { if (repl_pattern_1sample[file_nr] %in% bad_samples) { remove <- c(remove, file_nr) } @@ -41,14 +50,14 @@ remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { } # load init_file: contains repl_pattern -load(init_file) +load(args$init_file) # load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, # trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos -load(breaks_filepath) +load(args$breaks_filepath) # lower the threshold below which a sample will be removed for DBS and for high m/z -if (dims_matrix == "DBS") { +if (args$dims_matrix == "DBS") { thresh2remove <- 500000000 } if (highest_mz > 700) { @@ -60,7 
+69,7 @@ if (highest_mz > 700) { remove_neg <- NULL remove_pos <- NULL cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") -for (sample_nr in 1:length(repl_pattern)) { +for (sample_nr in seq_along(repl_pattern)) { tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) tech_reps_array_pos <- NULL tech_reps_array_neg <- NULL @@ -68,7 +77,7 @@ for (sample_nr in 1:length(repl_pattern)) { sum_pos <- 0 nr_pos <- 0 nr_neg <- 0 - for (file_nr in 1:length(tech_reps)) { + for (file_nr in seq_along(tech_reps)) { load(paste(tech_reps[file_nr], ".RData", sep = "")) cat("\n\nParsing", tech_reps[file_nr]) # negative scanmode @@ -92,7 +101,7 @@ for (sample_nr in 1:length(repl_pattern)) { } tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos) } - # save to file + # save to file if (nr_neg != 0) { sum_neg[, 1] <- sum_neg[, 1] / nr_neg colnames(sum_neg) <- names(repl_pattern)[sample_nr] @@ -105,25 +114,25 @@ for (sample_nr in 1:length(repl_pattern)) { } } -pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) +pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, args$nr_replicates) repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "negative_repl_pattern.RData") write.table( - remove_neg, - file = "miss_infusions_negative.txt", - row.names = FALSE, - col.names = FALSE, + remove_neg, + file = "miss_infusions_negative.txt", + row.names = FALSE, + col.names = FALSE, sep = "\t" ) -pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) +pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, args$nr_replicates) repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "positive_repl_pattern.RData") write.table( - remove_pos, - file = "miss_infusions_positive.txt", - row.names = FALSE, - col.names = FALSE, + remove_pos, + file = "miss_infusions_positive.txt", + row.names = FALSE, + col.names = FALSE, sep = "\t" ) @@ 
-146,10 +155,10 @@ for (file in tic_files) { # create a list with information for all TIC plots tic_plot_list <- list() plot_nr <- 0 -for (sample_nr in c(1:length(repl_pattern))) { +for (sample_nr in seq_along(repl_pattern)) { tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) sample_name <- names(repl_pattern)[sample_nr] - for (file_nr in 1:length(tech_reps)) { + for (file_nr in seq_along(tech_reps)) { plot_nr <- plot_nr + 1 # read file with retention time, intensity and dims_threshold values repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt")) @@ -159,7 +168,7 @@ for (sample_nr in c(1:length(repl_pattern))) { # for replicates with bad TIC, determine what color the border of the plot should be bad_color_pos <- tech_reps[file_nr] %in% remove_pos bad_color_neg <- tech_reps[file_nr] %in% remove_neg - if (bad_color_neg & bad_color_pos) { + if (bad_color_neg && bad_color_pos) { plot_color <- "#F8766D" } else if (bad_color_pos) { plot_color <- "#ED8141" @@ -175,8 +184,10 @@ for (sample_nr in c(1:length(repl_pattern))) { geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) + geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) + geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) + - geom_segment(aes(x = trim_left_pos, y = dims_thresh_pos, xend = trim_right_pos, yend = dims_thresh_pos), colour = "green", lty = 2) + - geom_segment(aes(x = trim_left_neg, y = dims_thresh_neg, xend = trim_right_neg, yend = dims_thresh_neg), colour = "blue", lty = 2) + + geom_segment(aes(x = trim_left_pos, y = dims_thresh_pos, xend = trim_right_pos, yend = dims_thresh_pos), + colour = "green", lty = 2) + + geom_segment(aes(x = trim_left_neg, y = dims_thresh_neg, xend = trim_right_neg, yend = dims_thresh_neg), + colour = "blue", lty = 2) + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], " || ", sample_name)) + theme(plot.background = element_rect(fill = 
plot_color), axis.text = element_text(size = 4), @@ -187,19 +198,19 @@ for (sample_nr in c(1:length(repl_pattern))) { } # create a layout matrix dependent on number of replicates -layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) +layout <- matrix(1:(10 * args$nr_replicates), 10, args$nr_replicates, TRUE) # put TIC plots in matrix tic_plot_pdf <- marrangeGrob( grobs = tic_plot_list, - nrow = 10, ncol = nr_replicates, + nrow = 10, ncol = args$nr_replicates, layout_matrix = layout, top = quote(paste( - "TICs of run", run_name, + "TICs of run", args$run_name, " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", g, "/", npages )) ) # save to file -ggsave(filename = paste0(run_name, "_TICplots.pdf"), +ggsave(filename = paste0(args$run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm") diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index b521a19f..12606cf7 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -1,7 +1,7 @@ process AverageTechReplicates { tag "DIMS AverageTechReplicates" label 'AverageTechReplicates' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -23,12 +23,13 @@ process AverageTechReplicates { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \ - $params.nr_replicates \ - $analysis_id \ - $matrix \ - $highest_mz_file \ - $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R \\ + --init_filepath $init_file \\ + --nr_replicates $params.nr_replicates \\ + --run_name $analysis_id \\ + --matrix $matrix \\ + --highest_mz_file $highest_mz_file \\ + --breaks_filepath $breaks_file """ } From 4fc8385d81fe9c20dac6f0376a6e1741d92aef96 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 15:52:31 +0200 Subject: [PATCH 05/28] Added argparse to 
CollectSumAdducts and fixed linting issues --- DIMS/CollectSumAdducts.R | 11 +++++++---- DIMS/CollectSumAdducts.nf | 5 +++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R index 28b5bf0d..10752add 100755 --- a/DIMS/CollectSumAdducts.R +++ b/DIMS/CollectSumAdducts.R @@ -1,13 +1,16 @@ ## Combining all AdductSums part files for each scanmode and # combine intensities if present in both scanmodes +library("argparse") suppressMessages(library("dplyr")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "CollectSumAdducts") -preprocessing_scripts_dir <- cmd_args[1] +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) -source(paste0(preprocessing_scripts_dir, "collect_sum_adducts_functions.R")) +args <- parser$parse_args() + +source(paste0(args$preprocessing_scripts_dir, "collect_sum_adducts_functions.R")) # collect all AdductSums part files for each scanmode and save to RData file outlist_tot_pos <- combine_sum_adduct_parts("positive") diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf index 279cd0e2..06aa82c8 100644 --- a/DIMS/CollectSumAdducts.nf +++ b/DIMS/CollectSumAdducts.nf @@ -1,7 +1,7 @@ process CollectSumAdducts { tag "DIMS CollectSumAdducts" label 'CollectSumAdducts' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -13,6 +13,7 @@ process CollectSumAdducts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R $params.preprocessing_scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R \\ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir """ } From f956c9d4820c34ef9e7284981ca7c34d6565c4d4 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 
15:53:16 +0200 Subject: [PATCH 06/28] Added argparse to GenerateExcel and fixed linting issues --- DIMS/GenerateExcel.R | 35 +++++++++++++++++++++-------------- DIMS/GenerateExcel.nf | 7 ++++++- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index c17f5b03..12ee7cea 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -1,4 +1,5 @@ # load required packages +library("argparse") library("ggplot2") library("reshape2") library("openxlsx") @@ -6,17 +7,23 @@ suppressMessages(library("tidyr")) suppressMessages(library("dplyr")) suppressMessages(library("stringr")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "GenerateExcel") -project <- cmd_args[1] -hmdb_rlvc_file <- cmd_args[2] -z_score <- as.numeric(cmd_args[3]) -export_scripts_dir <- cmd_args[4] -path_metabolite_groups <- cmd_args[5] +parser$add_argument("--project", dest = "project", + help = "Project name", required = TRUE) +parser$add_argument("--hmdb_rlvc_file", dest = "hmdb_rlvc_file", + help = "File path to the HMDB relevance file", required = TRUE) +parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", + help = "Numeric value, z-score = 1", required = TRUE) +parser$add_argument("--export_scripts_dir", dest = "export_scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) +parser$add_argument("--path_metabolite_groups", dest = "path_metabolite_groups", + help = "File path to the directory containing the metabolite groups", required = TRUE) + +args <- parser$parse_args() # load in function scripts -source(paste0(export_scripts_dir, "generate_excel_functions.R")) +source(paste0(args$export_scripts_dir, "generate_excel_functions.R")) # set the number of digits for floats options(digits = 16) @@ -35,7 +42,7 @@ perc <- 5 outlier_threshold <- 2 # load HMDB rlvnc table -load(hmdb_rlvc_file) 
+load(args$hmdb_rlvc_file) # load outlist object load("AdductSums_combined.RData") @@ -63,7 +70,7 @@ wb_intensities <- openxlsx::createWorkbook("SinglePatient") openxlsx::addWorksheet(wb_intensities, sheetname) # Add Z-scores and create plots -if (z_score == 1) { +if (args$z_score == 1) { dir.create(paste0(outdir, "/plots"), showWarnings = FALSE) wb_helix_intensities <- openxlsx::createWorkbook("SinglePatient") openxlsx::addWorksheet(wb_helix_intensities, sheetname) @@ -116,13 +123,13 @@ if (z_score == 1) { # get Helix IDs for extra Excel file metabolite_files <- list.files( - path = paste(path_metabolite_groups, "Diagnostics", sep = "/"), + path = paste(args$path_metabolite_groups, "Diagnostics", sep = "/"), pattern = "*.txt", full.names = FALSE, recursive = FALSE ) metab_df_helix <- NULL for (file_index in seq_along(metabolite_files)) { infile <- metabolite_files[file_index] - metab_list <- read.table(paste(path_metabolite_groups, "Diagnostics", infile, sep = "/"), + metab_list <- read.table(paste(args$path_metabolite_groups, "Diagnostics", infile, sep = "/"), sep = "\t", header = TRUE, quote = "" ) metab_df_helix <- rbind(metab_df_helix, metab_list) @@ -238,7 +245,7 @@ if (z_score == 1) { plots_present = TRUE ) openxlsx::writeData(wb_helix_intensities, sheet = 1, outlist_helix, startCol = 1) - openxlsx::saveWorkbook(wb_helix_intensities, paste0(outdir, "/Helix_", project, ".xlsx"), overwrite = TRUE) + openxlsx::saveWorkbook(wb_helix_intensities, paste0(outdir, "/Helix_", args$project, ".xlsx"), overwrite = TRUE) rm(wb_helix_intensities) # reorder outlist for Excel file @@ -262,6 +269,6 @@ if (z_score == 1) { # write Excel file openxlsx::writeData(wb_intensities, sheet = 1, outlist, startCol = 1) -openxlsx::saveWorkbook(wb_intensities, paste0(outdir, "/", project, ".xlsx"), overwrite = TRUE) +openxlsx::saveWorkbook(wb_intensities, paste0(outdir, "/", args$project, ".xlsx"), overwrite = TRUE) rm(wb_intensities) unlink("plots", recursive = TRUE) diff --git 
a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index 552a8ee9..bab44eb1 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -18,6 +18,11 @@ process GenerateExcel { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $analysis_id $relevance_file $params.zscore $params.export_scripts_dir $params.path_metabolite_groups + Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R \\ + --project $analysis_id \\ + --hmdb_rlvc_file $relevance_file \\ + --z_score $params.zscore \\ + --export_scripts_dir $params.export_scripts_dir \\ + --path_metabolite_groups $params.path_metabolite_groups """ } From 47539c10503dd619059f696c1d5bb51fbfee2b65 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 15:53:30 +0200 Subject: [PATCH 07/28] Added argparse to GenerateQCOutput and fixed linting issues --- DIMS/GenerateQCOutput.R | 56 +++++++++++++++++++++++----------------- DIMS/GenerateQCOutput.nf | 13 +++++----- 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/DIMS/GenerateQCOutput.R b/DIMS/GenerateQCOutput.R index 321c6ec0..fa1131e3 100644 --- a/DIMS/GenerateQCOutput.R +++ b/DIMS/GenerateQCOutput.R @@ -1,3 +1,4 @@ +library("argparse") library("ggplot2") library("reshape2") library("openxlsx") @@ -6,23 +7,30 @@ suppressMessages(library("dplyr")) # set the number of digits for floats options(digits = 16) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "GenerateQCOutput") -init_file <- cmd_args[1] -project <- cmd_args[2] -dims_matrix <- cmd_args[3] -z_score <- cmd_args[4] -sst_components_file <- cmd_args[5] -export_scripts_dir <- cmd_args[6] +parser$add_argument("--init_file_path", dest = "init_file", + help = "File path for the init RData file", required = TRUE) +parser$add_argument("--project", dest = "project", + help = "Project name", required = TRUE) +parser$add_argument("--matrix", dest = "dims_matrix", + help = "The matrix used, e.g. 
Plasma, Research, ...", required = TRUE) +parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", + help = "Numeric value, z-score = 1", required = TRUE) +parser$add_argument("--sst_components_file", dest = "sst_components_file", + help = "File path to the SST components file", required = TRUE) +parser$add_argument("--export_scripts_dir", dest = "export_scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) + +args <- parser$parse_args() outdir <- "./" # load in function scripts -source(paste0(export_scripts_dir, "generate_qc_output_functions.R")) +source(paste0(args$export_scripts_dir, "generate_qc_output_functions.R")) # load init files -load(init_file) +load(args$init_file) # load outlist from GenerateExcel load("outlist.RData") # load combined adducts for each scanmodus @@ -38,7 +46,7 @@ dir.create(paste0(outdir, "/plots"), showWarnings = FALSE) control_label <- "C" #### CHECK NUMBER OF CONTROLS #### -if (z_score == 1) { +if (args$z_score == 1) { file_name <- "Check_number_of_controls.txt" min_num_controls <- 25 check_number_of_controls(outlist, min_num_controls, file_name) @@ -67,14 +75,14 @@ outlist_tot_neg <- outlist_tot_neg[, samples_both_modes] outlist_tot_pos <- outlist_tot_pos[, samples_both_modes] # Retrieve IS summed adducts -is_summed <- get_internal_standards(is_list, "summed", repl_pattern, dims_matrix, rundate, project) +is_summed <- get_internal_standards(is_list, "summed", repl_pattern, args$dims_matrix, rundate, args$project) # Retrieve IS positive mode -is_pos <- get_internal_standards(is_list, "pos", outlist_tot_pos, dims_matrix, rundate, project) +is_pos <- get_internal_standards(is_list, "pos", outlist_tot_pos, args$dims_matrix, rundate, args$project) # Retrieve IS negative mode -is_neg <- get_internal_standards(is_list, "neg", outlist_tot_neg, dims_matrix, rundate, project) +is_neg <- get_internal_standards(is_list, "neg", outlist_tot_neg, args$dims_matrix, rundate, 
args$project) # Save results -save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) +save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", args$project, "_IS_results.RData")) # number of samples, for plotting length and width sample_count <- length(repl_pattern) @@ -138,7 +146,7 @@ is_sum_selection <- c( ) # add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum) -if (dims_matrix == "DBS") { +if (args$dims_matrix == "DBS") { add_min_intens_lines <- TRUE hline_data_neg <- data.frame( @@ -155,7 +163,7 @@ if (dims_matrix == "DBS") { int_line = c(1300000, 2500000, 500000, 1800000, 1400000), HMDB_name = is_sum_selection ) -} else if (dims_matrix == "Plasma") { +} else if (args$dims_matrix == "Plasma") { add_min_intens_lines <- TRUE hline_data_neg <- data.frame( @@ -239,7 +247,7 @@ patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\ positive_controls_index <- grepl(pattern = paste(patterns, collapse = "|"), column_list) positive_control_list <- column_list[positive_controls_index] -if (z_score == 1) { +if (args$z_score == 1) { # find if one or more positive control samples are missing pos_contr_warning <- c() if (all(sapply(c("^P1002", "^P1003", "^P1005"), @@ -280,16 +288,16 @@ if (z_score == 1) { positive_control$Zscore <- as.numeric(positive_control$Zscore) # extra information added to excel for future reference. 
made in beginning of this script - positive_control$Matrix <- dims_matrix + positive_control$Matrix <- args$dims_matrix positive_control$Rundate <- rundate - positive_control$Project <- project + positive_control$Project <- args$project # Save results - save(positive_control, file = paste0(outdir, "/", project, "_positive_control.RData")) + save(positive_control, file = paste0(outdir, "/", args$project, "_positive_control.RData")) # round the Z-scores to 2 digits positive_control$Zscore <- round_df(positive_control$Zscore, 2) write.xlsx(positive_control, - file = paste0(outdir, "/", project, "_positive_control.xlsx"), + file = paste0(outdir, "/", args$project, "_positive_control.xlsx"), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE ) } @@ -315,7 +323,7 @@ is_neg_intensities <- get_is_intensities(outlist_tot_neg, is_codes = is_codes) is_pos_intensities <- get_is_intensities(outlist_tot_pos, is_codes = is_codes) # SST components. -sst_comp <- read.csv(sst_components_file, header = TRUE, sep = "\t") +sst_comp <- read.csv(args$sst_components_file, header = TRUE, sep = "\t") sst_list <- outlist %>% filter(HMDB_code %in% sst_comp$HMDB_ID) sst_colnrs <- grep("P1001", colnames(sst_list)) @@ -352,7 +360,7 @@ setColWidths(wb, 3, cols = 1, widths = 24) addWorksheet(wb, "SST components") openxlsx::writeData(wb, sheet = 4, sst_list_intensities) setColWidths(wb, 4, cols = 1:3, widths = 24) -xlsx_name <- paste0(outdir, "/", project, "_IS_SST.xlsx") +xlsx_name <- paste0(outdir, "/", args$project, "_IS_SST.xlsx") openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) rm(wb) diff --git a/DIMS/GenerateQCOutput.nf b/DIMS/GenerateQCOutput.nf index eb17301f..802e4f3d 100644 --- a/DIMS/GenerateQCOutput.nf +++ b/DIMS/GenerateQCOutput.nf @@ -20,11 +20,12 @@ process GenerateQCOutput { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateQCOutput.R $init_file \ - $analysis_id \ - $params.matrix \ - $params.zscore \ - $params.sst_components_file \ - 
$params.export_scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/GenerateQCOutput.R \\ + --init_file_path $init_file \\ + --project $analysis_id \\ + --matrix $params.matrix \\ + --z_score $params.zscore \\ + --sst_components_file $params.sst_components_file \\ + --export_scripts_dir $params.export_scripts_dir """ } From c94c011a94ba7d16460d305d075bfb70effe0085 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 1 Sep 2025 16:26:07 +0200 Subject: [PATCH 08/28] Changed container to dims:1.4 --- DIMS/CollectFilled.nf | 2 +- DIMS/FillMissing.nf | 2 +- DIMS/GenerateBreaks.nf | 2 +- DIMS/GenerateExcel.nf | 2 +- DIMS/GenerateQCOutput.nf | 2 +- DIMS/GenerateViolinPlots.nf | 2 +- DIMS/HMDBparts.nf | 2 +- DIMS/HMDBparts_main.nf | 2 +- DIMS/MakeInit.nf | 2 +- DIMS/PeakFinding.nf | 2 +- DIMS/PeakGrouping.nf | 2 +- DIMS/SpectrumPeakFinding.nf | 2 +- DIMS/SumAdducts.nf | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index 53b19676..f556a8b4 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -1,7 +1,7 @@ process CollectFilled { tag "DIMS CollectFilled" label 'CollectFilled' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf index b0d18505..7678c95e 100644 --- a/DIMS/FillMissing.nf +++ b/DIMS/FillMissing.nf @@ -1,7 +1,7 @@ process FillMissing { tag "DIMS FillMissing ${peakgrouplist_file}" label 'FillMissing' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index af617a8d..290c4be1 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -1,7 +1,7 @@ process GenerateBreaks { tag "DIMS GenerateBreaks" label 'GenerateBreaks' - container = 
'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index bab44eb1..21fefd4a 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -1,7 +1,7 @@ process GenerateExcel { tag "DIMS GenerateExcel" label 'GenerateExcel' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/GenerateQCOutput.nf b/DIMS/GenerateQCOutput.nf index 802e4f3d..6e4c9940 100644 --- a/DIMS/GenerateQCOutput.nf +++ b/DIMS/GenerateQCOutput.nf @@ -1,7 +1,7 @@ process GenerateQCOutput { tag "DIMS GenerateQCOutput" label 'GenerateQCOutput' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf index 1c4b532d..6136ec3a 100755 --- a/DIMS/GenerateViolinPlots.nf +++ b/DIMS/GenerateViolinPlots.nf @@ -1,7 +1,7 @@ process GenerateViolinPlots { tag "DIMS GenerateViolinPlots" label 'GenerateViolinPlots' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf index 5f19f750..4b1fa5a6 100644 --- a/DIMS/HMDBparts.nf +++ b/DIMS/HMDBparts.nf @@ -1,7 +1,7 @@ process HMDBparts { tag "DIMS HMDBparts" label 'HMDBparts' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf index b38bac08..658baeb9 100644 --- a/DIMS/HMDBparts_main.nf +++ b/DIMS/HMDBparts_main.nf @@ -1,7 +1,7 @@ process HMDBparts_main { tag "DIMS HMDBparts_main" label 'HMDBparts_main' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', 
'-euo', 'pipefail'] input: diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index 7aae0e46..d323315e 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -1,7 +1,7 @@ process MakeInit { tag "DIMS MakeInit" label 'MakeInit' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index 0097d128..0e25893c 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -1,7 +1,7 @@ process PeakFinding { tag "DIMS PeakFinding ${rdata_file}" label 'PeakFinding' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index 0ecb18c1..7f3f17e4 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -1,7 +1,7 @@ process PeakGrouping { tag "DIMS PeakGrouping ${hmdbpart_file}" label 'PeakGrouping' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf index 35902658..a8ae5a88 100644 --- a/DIMS/SpectrumPeakFinding.nf +++ b/DIMS/SpectrumPeakFinding.nf @@ -1,7 +1,7 @@ process SpectrumPeakFinding { tag "DIMS SpectrumPeakFinding" label 'SpectrumPeakFinding' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf index 514567d4..524ef4a8 100644 --- a/DIMS/SumAdducts.nf +++ b/DIMS/SumAdducts.nf @@ -1,7 +1,7 @@ process SumAdducts { tag "DIMS SumAdducts" label 'SumAdducts' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: From 553da7083c850c6b339ada04d7049543d9b7b30b Mon Sep 17 00:00:00 2001 From: 
ALuesink Date: Mon, 24 Nov 2025 13:14:15 +0100 Subject: [PATCH 09/28] Added argparse to MakeInit --- DIMS/MakeInit.R | 24 ++++++++++++++---------- DIMS/MakeInit.nf | 4 +++- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 44d49962..d5f05702 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,14 +1,18 @@ -## adapted from makeInit in old pipeline +# load required packages +library("argparse") -# define parameters -args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "MakeInit") -sample_sheet <- read.csv(args[1], sep = "\t") -nr_replicates <- as.numeric(args[2]) +parser$add_argument("--sample_sheet", dest = "sample_sheet", + help = "Samplesheet txt file", required = TRUE) +parser$add_argument("--nr_replicates", dest = "nr_replicates", type = "integer", + help = "Number of replicates, numeric value", required = TRUE) -sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) -nr_sample_groups <- length(sample_names) / nr_replicates -group_names <- trimws(as.vector(unlist(sample_sheet[2]))) +args <- parser$parse_args() + +sample_names <- trimws(as.vector(unlist(args$sample_sheet[1]))) +nr_sample_groups <- length(sample_names) / args$nr_replicates +group_names <- trimws(as.vector(unlist(args$sample_sheet[2]))) group_names <- gsub("[^-.[:alnum:]]", "_", group_names) group_names_unique <- unique(group_names) @@ -16,8 +20,8 @@ group_names_unique <- unique(group_names) repl_pattern <- c() for (sample_group in 1:nr_sample_groups) { tmp <- c() - for (repl in nr_replicates:1) { - index <- ((sample_group * nr_replicates) - repl) + 1 + for (repl in args$nr_replicates:1) { + index <- ((sample_group * args$nr_replicates) - repl) + 1 tmp <- c(tmp, sample_names[index]) } repl_pattern <- c(repl_pattern, list(tmp)) diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index d323315e..793d8f0b 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -13,6 +13,8 @@ process MakeInit { script: """ - Rscript 
${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates + Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R \\ + --samplesheet $samplesheet \\ + --nr_replicates $nr_replicates """ } From dc3e80f83d6b3c8e0ebd29146965381333938776 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 24 Nov 2025 13:19:09 +0100 Subject: [PATCH 10/28] Added argparse to CollectFilled and fixed linting issues --- DIMS/CollectFilled.R | 25 ++++++++++++++----------- DIMS/CollectFilled.nf | 4 +++- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index 4bd408e0..265db5d3 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -1,14 +1,17 @@ -## adapted from 10-collectSamplesFilled.R +# load package +library("argparse") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "CollectFilled") -scripts_dir <- cmd_args[1] -ppm <- as.numeric(cmd_args[2]) -z_score <- as.numeric(cmd_args[3]) +parser$add_argument("--scripts_dir", dest = "scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) +parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", + help = "Numeric value, z-score = 1", required = TRUE) -source(paste0(scripts_dir, "merge_duplicate_rows.R")) -source(paste0(scripts_dir, "calculate_zscores.R")) +args <- parser$parse_args() + +source(paste0(args$scripts_dir, "merge_duplicate_rows.R")) +source(paste0(args$scripts_dir, "calculate_zscores.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") @@ -17,7 +20,7 @@ for (scanmode in scanmodes) { filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled")) # load files and combine into one object outlist_total <- NULL - for (file_nr in 1:length(filled_files)) { + for (file_nr in seq_along(filled_files)) { peakgrouplist_filled <- get(load(filled_files[file_nr])) 
outlist_total <- rbind(outlist_total, peakgrouplist_filled) } @@ -29,7 +32,7 @@ for (scanmode in scanmodes) { pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) # calculate Z-scores - if (z_score == 1) { + if (args$z_score == 1) { outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE) nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) @@ -47,7 +50,7 @@ for (scanmode in scanmodes) { outlist_stats_more <- cbind(outlist_stats_more, tmp) outlist_total <- outlist_stats_more } - + # make a copy of the outlist outlist_ident <- outlist_total # take care of NAs in theormz_noise diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index f556a8b4..1f174448 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -14,6 +14,8 @@ process CollectFilled { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R \\ + --scripts_dir $params.scripts_dir \\ + --z_score $params.zscore """ } From 768517c76a8ee3f67858ebc848be7d2a112d62d1 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 24 Nov 2025 13:19:48 +0100 Subject: [PATCH 11/28] Added argparse to GenerateBreaks and fixed linting issues --- DIMS/GenerateBreaks.R | 34 ++++++++++++++++++---------------- DIMS/GenerateBreaks.nf | 5 ++++- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index d07fd203..0943c327 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -1,15 +1,17 @@ -## adapted from 1-generateBreaksFwhm.HPC.R ## - -# load required package +# load required packages +load("argparse") suppressPackageStartupMessages(library("xcms")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "GenerateBreaks") 
+ +parser$add_argument("--raw_file", dest = "raw_filepath", + help = "File path to a raw file", required = TRUE) +parser$add_argument("--trim", dest = "trim", type = "integer", + help = "Trim precentage, numeric value", required = TRUE) +parser$add_argument("--resolution", dest = "resolution", type = "integer", + help = "Resolution of the MS machine, numeric value", required = TRUE) -filepath <- cmd_args[1] -outdir <- cmd_args[2] -trim <- as.numeric(cmd_args[3]) -resol <- as.numeric(cmd_args[4]) +args <- parser$parse_args() # initialize trim_left_pos <- NULL @@ -21,19 +23,19 @@ breaks_fwhm_avg <- NULL bins <- NULL # read in mzML file -raw_data <- suppressMessages(xcms::xcmsRaw(filepath)) +raw_data <- suppressMessages(xcms::xcmsRaw(args$raw_filepath)) # Get time values for positive and negative scans pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] # trim (remove) scans at the start and end for positive -trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% aan het begin -trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% aan het eind +trim_left_pos <- round(pos_times[length(pos_times) * (args$trim * 1.5)]) # 15% aan het begin +trim_right_pos <- round(pos_times[length(pos_times) * (1 - (args$trim * 0.5))]) # 5% aan het eind # trim (remove) scans at the start and end for negative -trim_left_neg <- round(neg_times[length(neg_times) * trim]) -trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) +trim_left_neg <- round(neg_times[length(neg_times) * args$trim]) +trim_right_neg <- round(neg_times[length(neg_times) * (1 - args$trim)]) # Mass range m/z low_mz <- raw_data@mzrange[1] @@ -46,8 +48,8 @@ segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) # determine start and end of each bin. 
for (i in 1:nr_segments) { start_segment <- segment[i] - end_segment <- segment[i+1] - resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) + end_segment <- segment[i + 1] + resol_mz <- args$resolution * (1 / sqrt(2) ^ (log2(start_segment / 200))) fwhm_segment <- start_segment / resol_mz breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) # average the m/z instead of start value diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index 290c4be1..ac0c2847 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -14,6 +14,9 @@ process GenerateBreaks { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R \\ + --raw_file $mzML_file \\ + --trim $params.trim \\ + --resolution $params.resolution """ } From a29c1f9e54439177851855f065644c5dafcb83f3 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 24 Nov 2025 13:23:18 +0100 Subject: [PATCH 12/28] Added argparse to HMDBparts_main and fixed linting issues --- DIMS/HMDBparts_main.R | 36 ++++++++++++++++++++---------------- DIMS/HMDBparts_main.nf | 4 +++- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index 14335bf1..c1bb2185 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -1,34 +1,38 @@ -## adapted from hmdb_part_adductSums.R +# load package +load("argparse") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "HMDBparts_main") -db_file <- cmd_args[1] -breaks_file <- cmd_args[2] +parser$add_argument("--hmdb_db_file", dest = "hmdb_db_file", + help = "File path the HMDB databse file", required = TRUE) +parser$add_argument("--breaks_file", dest = "breaks_file", + help = "File containing all the breaks/bins", required = TRUE) -load(db_file) -load(breaks_file) +args <- 
parser$parse_args() -# Cut up HMDB minus adducts minus isotopes into small parts +load(args$db_file) +load(args$breaks_file) + +# Cut up HMDB minus adducts minus isotopes into small parts scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { if (scanmode == "negative") { column_label <- "MNeg" - HMDB_add_iso <- HMDB_add_iso.Neg + hmdb_add_iso <- HMDB_add_iso.Neg } else if (scanmode == "positive") { column_label <- "Mpos" - HMDB_add_iso <- HMDB_add_iso.Pos + hmdb_add_iso <- HMDB_add_iso.Pos } # filter mass range measured - outlist <- HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] & - HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + outlist <- hmdb_add_iso[which(hmdb_add_iso[, column_label] >= breaks_fwhm[1] & + hmdb_add_iso[, column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] # remove adducts and isotopes, put internal standard at the beginning outlist <- outlist[grep("HMDB", rownames(outlist), fixed = TRUE), ] outlist <- outlist[-grep("_", rownames(outlist), fixed = TRUE), ] # sort on m/z value - outlist <- outlist[order(outlist[ , column_label]), ] + outlist <- outlist[order(outlist[, column_label]), ] nr_rows <- dim(outlist)[1] # size of hmdb parts in lines: @@ -37,12 +41,12 @@ for (scanmode in scanmodes) { check <- 0 # generate hmdb parts - if (nr_rows >= sub & (floor(nr_rows / sub)) >= 2) { + if (nr_rows >= sub && (floor(nr_rows / sub)) >= 2) { for (i in 1:floor(nr_rows / sub)) { start <- -(sub - 1) + i * sub end <- i * sub outlist_part <- outlist[c(start:end), ] - save(outlist_part, file=paste0(scanmode, "_hmdb_main.", i, ".RData")) + save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i, ".RData")) } } @@ -53,4 +57,4 @@ for (scanmode in scanmodes) { outlist_part <- outlist[c(start:end), ] save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i + 1, ".RData")) -} \ No newline at end of file +} diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf index 658baeb9..2ae3457d 100644 --- 
a/DIMS/HMDBparts_main.nf +++ b/DIMS/HMDBparts_main.nf @@ -14,6 +14,8 @@ process HMDBparts_main { script: """ - Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R $hmdb_db_file $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R \\ + --hmdb_db_file $hmdb_db_file \\ + --breaks_file $breaks_file """ } From d86f644d625089c8b13cb5466f4abc88a10d4660 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 3 Mar 2026 17:19:16 +0100 Subject: [PATCH 13/28] added argument parsing to DIMS/MakeInit step --- DIMS/MakeInit.R | 20 +++++++++++--------- DIMS/MakeInit.nf | 6 +++--- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index d5f05702..f896e9e6 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -3,28 +3,30 @@ library("argparse") parser <- ArgumentParser(description = "MakeInit") -parser$add_argument("--sample_sheet", dest = "sample_sheet", +parser$add_argument("--samplesheet", dest = "sample_sheet", help = "Samplesheet txt file", required = TRUE) parser$add_argument("--nr_replicates", dest = "nr_replicates", type = "integer", help = "Number of replicates, numeric value", required = TRUE) args <- parser$parse_args() -sample_names <- trimws(as.vector(unlist(args$sample_sheet[1]))) -nr_sample_groups <- length(sample_names) / args$nr_replicates -group_names <- trimws(as.vector(unlist(args$sample_sheet[2]))) +sample_sheet <- read.csv(args$sample_sheet, sep = "\t") +sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) +nr_replicates <- args$nr_replicates +nr_sample_groups <- length(sample_names) / nr_replicates +group_names <- trimws(as.vector(unlist(sample_sheet[2]))) group_names <- gsub("[^-.[:alnum:]]", "_", group_names) group_names_unique <- unique(group_names) # generate the replication pattern repl_pattern <- c() for (sample_group in 1:nr_sample_groups) { - tmp <- c() - for (repl in args$nr_replicates:1) { - index <- ((sample_group * args$nr_replicates) - repl) + 1 - tmp <- c(tmp, 
sample_names[index]) + replicates_persample <- c() + for (repl in nr_replicates:1) { + index <- ((sample_group * nr_replicates) - repl) + 1 + replicates_persample <- c(replicates_persample, sample_names[index]) } - repl_pattern <- c(repl_pattern, list(tmp)) + repl_pattern <- c(repl_pattern, list(replicates_persample)) } names(repl_pattern) <- group_names_unique diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index 793d8f0b..6c8b3f18 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -13,8 +13,8 @@ process MakeInit { script: """ - Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R \\ - --samplesheet $samplesheet \\ - --nr_replicates $nr_replicates + Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R \ + --samplesheet $samplesheet \ + --nr_replicates $nr_replicates """ } From 16b5e83d03966ce3d9e622763580a0597fb6ae07 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 3 Mar 2026 18:00:07 +0100 Subject: [PATCH 14/28] added argument parsing to DIMS/GenerateBreaks step --- DIMS/GenerateBreaks.R | 43 ++++++++++++++++++++++++++---------------- DIMS/GenerateBreaks.nf | 9 +++++---- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 0943c327..9e3838da 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -1,18 +1,24 @@ -# load required packages -load("argparse") +# load required package suppressPackageStartupMessages(library("xcms")) +library("argparse") +# define parameters parser <- ArgumentParser(description = "GenerateBreaks") -parser$add_argument("--raw_file", dest = "raw_filepath", - help = "File path to a raw file", required = TRUE) -parser$add_argument("--trim", dest = "trim", type = "integer", - help = "Trim precentage, numeric value", required = TRUE) -parser$add_argument("--resolution", dest = "resolution", type = "integer", - help = "Resolution of the MS machine, numeric value", required = TRUE) +parser$add_argument("--mzML_filepath", dest = "mzML_filepath", + help = "File path to 
the mzML file used", required = TRUE) +parser$add_argument("--trim_param", dest = "trim_param", + help = "Initial value of trim parameter (typically 0.1)", required = TRUE) +parser$add_argument("--resolution", dest = "resolution", + help = "Value for resolution (typically 140000)", required = TRUE) args <- parser$parse_args() +trim <- as.numeric(args$trim_param) +resol <- as.numeric(args$resolution) +print(trim) +print(resol) + # initialize trim_left_pos <- NULL trim_right_pos <- NULL @@ -23,23 +29,26 @@ breaks_fwhm_avg <- NULL bins <- NULL # read in mzML file -raw_data <- suppressMessages(xcms::xcmsRaw(args$raw_filepath)) +print(args$mzML_filepath) +raw_data <- suppressMessages(xcms::xcmsRaw(args$mzML_filepath)) # Get time values for positive and negative scans pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] +paint(pos_times) # trim (remove) scans at the start and end for positive -trim_left_pos <- round(pos_times[length(pos_times) * (args$trim * 1.5)]) # 15% aan het begin -trim_right_pos <- round(pos_times[length(pos_times) * (1 - (args$trim * 0.5))]) # 5% aan het eind +trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% at the beginning +trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% at the end # trim (remove) scans at the start and end for negative -trim_left_neg <- round(neg_times[length(neg_times) * args$trim]) -trim_right_neg <- round(neg_times[length(neg_times) * (1 - args$trim)]) +trim_left_neg <- round(neg_times[length(neg_times) * trim]) +trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) # Mass range m/z low_mz <- raw_data@mzrange[1] high_mz <- raw_data@mzrange[2] +print(high_mz) # determine number of segments (bins) nr_segments <- 2 * (high_mz - low_mz) @@ -47,9 +56,10 @@ segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) # determine start and end of each bin. 
for (i in 1:nr_segments) { + print(i) start_segment <- segment[i] - end_segment <- segment[i + 1] - resol_mz <- args$resolution * (1 / sqrt(2) ^ (log2(start_segment / 200))) + end_segment <- segment[i+1] + resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) fwhm_segment <- start_segment / resol_mz breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) # average the m/z instead of start value @@ -59,5 +69,6 @@ for (i in 1:nr_segments) { } # generate output file -save(breaks_fwhm, breaks_fwhm_avg, trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "breaks.fwhm.RData") +save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData") +save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData") save(high_mz, file = "highest_mz.RData") diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index ac0c2847..72eb6af4 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -10,13 +10,14 @@ process GenerateBreaks { output: path('breaks.fwhm.RData'), emit: breaks + path('trim_params.RData'), emit: trim_params path('highest_mz.RData'), emit: highest_mz script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R \\ - --raw_file $mzML_file \\ - --trim $params.trim \\ - --resolution $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R \ + --mzML_filepath $mzML_file \ + --trim_param $params.trim \ + --resolution $params.resolution """ } From 26cb22257b76eedea156cb21af715d42e1dd7e17 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 3 Mar 2026 18:22:41 +0100 Subject: [PATCH 15/28] removed print statements --- DIMS/GenerateBreaks.R | 6 ------ 1 file changed, 6 deletions(-) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 9e3838da..95fc30c2 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -16,8 +16,6 @@ args <- parser$parse_args() trim <- as.numeric(args$trim_param) resol 
<- as.numeric(args$resolution) -print(trim) -print(resol) # initialize trim_left_pos <- NULL @@ -29,13 +27,11 @@ breaks_fwhm_avg <- NULL bins <- NULL # read in mzML file -print(args$mzML_filepath) raw_data <- suppressMessages(xcms::xcmsRaw(args$mzML_filepath)) # Get time values for positive and negative scans pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] -paint(pos_times) # trim (remove) scans at the start and end for positive trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% at the beginning @@ -48,7 +44,6 @@ trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) # Mass range m/z low_mz <- raw_data@mzrange[1] high_mz <- raw_data@mzrange[2] -print(high_mz) # determine number of segments (bins) nr_segments <- 2 * (high_mz - low_mz) @@ -56,7 +51,6 @@ segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) # determine start and end of each bin. for (i in 1:nr_segments) { - print(i) start_segment <- segment[i] end_segment <- segment[i+1] resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) From e8681e66cfdc96a3e158bc079eb636c1a04fdf7a Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 3 Mar 2026 18:55:18 +0100 Subject: [PATCH 16/28] added argument parsing for DIMS/HMDBparts_main step --- DIMS/HMDBparts_main.R | 4 ++-- DIMS/HMDBparts_main.nf | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index c1bb2185..67e3b2fd 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -1,5 +1,5 @@ # load package -load("argparse") +library("argparse") parser <- ArgumentParser(description = "HMDBparts_main") @@ -10,7 +10,7 @@ parser$add_argument("--breaks_file", dest = "breaks_file", args <- parser$parse_args() -load(args$db_file) +load(args$hmdb_db_file) load(args$breaks_file) # Cut up HMDB minus adducts minus isotopes into small parts diff --git 
a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf index 2ae3457d..a079753b 100644 --- a/DIMS/HMDBparts_main.nf +++ b/DIMS/HMDBparts_main.nf @@ -14,8 +14,8 @@ process HMDBparts_main { script: """ - Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R \\ - --hmdb_db_file $hmdb_db_file \\ - --breaks_file $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R \ + --hmdb_db_file $hmdb_db_file \ + --breaks_file $breaks_file """ } From c763e04e30786e3450ad7dd886ecab460fff4939 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 3 Mar 2026 19:09:53 +0100 Subject: [PATCH 17/28] added argument parsing for DIMS/HMDBparts step --- DIMS/HMDBparts.R | 125 +++++++++++++++++----------------------------- DIMS/HMDBparts.nf | 6 ++- 2 files changed, 52 insertions(+), 79 deletions(-) diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R index 232028e6..be9088f6 100755 --- a/DIMS/HMDBparts.R +++ b/DIMS/HMDBparts.R @@ -1,32 +1,54 @@ -# adapted from hmdb_parts.R +# load package +library("argparse") # define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "HMDBparts") -db_file <- cmd_args[1] -breaks_file <- cmd_args[2] -standard_run <- cmd_args[4] +parser$add_argument("--hmdb_db_file", dest = "hmdb_db_file", + help = "File path for the HMDB databse file", required = TRUE) +parser$add_argument("--breaks_file", dest = "breaks_file", + help = "File containing all the breaks for binning intensities", required = TRUE) +parser$add_argument("--standard_run", dest = "standard_run", + help = "Parameter indicating whether the dataset has m/z between 70 and 610", required = TRUE) +parser$add_argument("--hmdb_parts_path", dest = "hmdb_parts_path", + help = "Path to pre-generated HMDB files for standard run", required = TRUE) + +args <- parser$parse_args() # load file with binning breaks -load(breaks_file) +load(args$breaks_file) min_mz <- round(breaks_fwhm[1]) max_mz <- round(breaks_fwhm[length(breaks_fwhm)]) # In case of a standard run 
use external HMDB parts # m/z is approximately 70 to 600: set limits between 68-71 for min and 599-610 for max -if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) { - # skip generating HMDB parts - hmdb_parts_path <- cmd_args[3] - # find all files containing hmdb in file name - hmdb_parts <- list.files(hmdb_parts_path, pattern = "hmdb") +if (args$standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) { + # skip generating HMDB parts; copy all files containing hmdb in file name from hmdb_parts_path + hmdb_parts <- list.files(args$hmdb_parts_path, pattern = "hmdb") for (hmdb_file in hmdb_parts) { - file.copy(paste(hmdb_parts_path, hmdb_file, sep = "/"), "./", recursive = TRUE) + file.copy(paste(args$hmdb_parts_path, hmdb_file, sep = "/"), "./", recursive = TRUE) } } else { # generate HMDB parts in case of non-standard mz range - load(db_file) - ppm <- as.numeric(cmd_args[5]) - + load(args$hmdb_db_file) + + # determine segments of m/z for HMDB parts; smaller parts for m/z < 100 + mz_segments <- c() + segment_start <- min_mz + segment_end <- min_mz + 5 + while (segment_end < max_mz) { + if (segment_start < 100) { + segment_size = 5 + } else { + segment_size = 10 + } + mz_segments <- c(mz_segments, segment_start) + segment_start <- segment_start + segment_size + segment_end <- segment_end + segment_size + } + #last segment + mz_segments <- c(mz_segments, max_mz) + scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { if (scanmode == "negative") { @@ -38,73 +60,20 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < } # filter mass range meassured - hmdb_add_iso = hmdb_add_iso[which(hmdb_add_iso[ , column_label] >= breaks_fwhm[1] & - hmdb_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + hmdb_add_iso = hmdb_add_iso[which(hmdb_add_iso[ , column_label] >= min_mz & + hmdb_add_iso[ , column_label] <= max_mz), ] # sort on mass - outlist <- 
hmdb_add_iso[order(as.numeric(hmdb_add_iso[ , column_label])),] - nr_rows <- dim(outlist)[1] - - # maximum number of rows per file - sub <- 20000 - end <- 0 - last_line <- sub - check <- 0 - outlist_part <- NULL + sorted_hmdb_add_iso <- hmdb_add_iso[order(as.numeric(hmdb_add_iso[ , column_label])),] + nr_rows <- dim(sorted_hmdb_add_iso)[1] # create parts and save to file - if (nr_rows < sub) { - outlist_part <- outlist - save(outlist_part, file = paste0(scanmode, "_hmdb.1.RData")) - } else if (nr_rows >= sub & (floor(nr_rows / sub) - 1) >= 2) { - for (i in 2:floor(nr_rows / sub) - 1) { - start <- -(sub - 1) + i * sub - end <- i * sub - - if (i > 1){ - outlist_i = outlist[c(start:end),] - nr_moved = 0 - # Use ppm to replace border to avoid cut within peak group - while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / - as.numeric(outlist_i[1, column_label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1, ]) - outlist_i <- outlist_i[-1, ] - nr_moved <- nr_moved + 1 - } - - save(outlist_part, file = paste(scanmode, "_", paste("hmdb", i-1, "RData", sep = "."), sep = "")) - check <- check + dim(outlist_part)[1] - - outlist_part <- outlist_i - last_line <- dim(outlist_part)[1] - - } else { - outlist_part <- outlist[c(start:end),] - } - } - - start <- end + 1 - end <- nr_rows - outlist_i <- outlist[c(start:end), ] - nr_moved <- 0 - - if (!is.null(outlist_part)) { - # Calculate ppm and replace border, avoid cut within peak group - while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / - as.numeric(outlist_i[1, column_label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1, ]) - outlist_i <- outlist_i[-1, ] - nr_moved <- nr_moved + 1 - } - - save(outlist_part, file = paste0(scanmode, "_hmdb.", i, ".RData")) - check <- check + dim(outlist_part)[1] - } - - outlist_part <- outlist_i - save(outlist_part, file = paste0(scanmode, "_hmdb.", i + 1, 
".RData")) - check <- check + dim(outlist_part)[1] + for (mz_part_index in 1:(length(mz_segments) - 1)) { + mz_start <- mz_segments[mz_part_index] + mz_end <- mz_segments[mz_part_index + 1] + outlist_part <- sorted_hmdb_add_iso[sorted_hmdb_add_iso[ , column_label] > mz_start & + sorted_hmdb_add_iso[ , column_label] <= mz_end, ] + save(outlist_part, file = paste0(scanmode, "_hmdb.", mz_part_index, ".RData")) } } } - diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf index 4b1fa5a6..6c214915 100644 --- a/DIMS/HMDBparts.nf +++ b/DIMS/HMDBparts.nf @@ -13,6 +13,10 @@ process HMDBparts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R \ + --hmdb_db_file $hmdb_db_file \ + --breaks_file $breaks_file \ + --standard_run $params.standard_run \ + --hmdb_parts_path $params.hmdb_parts_files """ } From fed4e3b2b8908d8240ddb4dcf64bfda27e93be43 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 5 Mar 2026 12:27:19 +0100 Subject: [PATCH 18/28] fixed loading trim parameters from separate file --- DIMS/AssignToBins.R | 7 +++++-- DIMS/AssignToBins.nf | 9 +++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 7028b495..e02b8ceb 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -8,12 +8,15 @@ parser$add_argument("--mzML_filepath", dest = "mzml_filepath", help = "File path for the mzML file", required = TRUE) parser$add_argument("--breaks_filepath", dest = "breaks_filepath", help = "File path for the breaks RData file", required = TRUE) +parser$add_argument("--trim_params_filepath", dest = "trim_params_filepath", + help = "File path for the trim parameters", required = TRUE) args <- parser$parse_args() -# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, -# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos +# load breaks_file: contains 
breaks_fwhm and breaks_fwhm_avg load(args$breaks_filepath) +# load trim parameters trim_left_neg, trim_left_pos, trim_right_neg and trim_right_pos +load(args$trim_params_filepath) # get sample name techrep_name <- sub("\\..*$", "", basename(args$mzml_filepath)) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index df3b089a..bcf2510e 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -5,7 +5,7 @@ process AssignToBins { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(val(file_id), path(mzML_file), path(breaks_file)) + tuple(val(file_id), path(mzML_file), path(breaks_file), path(trim_params_file)) output: path("${file_id}.RData"), emit: rdata_file @@ -13,9 +13,10 @@ process AssignToBins { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R \\ - --mzML_filepath $mzML_file \\ - --breaks_filepath $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R \ + --mzML_filepath $mzML_file \ + --breaks_filepath $breaks_file \ + --trim_params_filepath $trim_params_file """ } From ed7b5954359e6aa93feb8b5f7e3115dbc3bb5492 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 5 Mar 2026 13:27:01 +0100 Subject: [PATCH 19/28] added argument parsing to DIMS/EvaluateTics step --- DIMS/EvaluateTics.R | 158 +++++++++++++++++++ DIMS/EvaluateTics.nf | 34 ++++ DIMS/preprocessing/evaluate_tics_functions.R | 101 ++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 DIMS/EvaluateTics.R create mode 100644 DIMS/EvaluateTics.nf create mode 100644 DIMS/preprocessing/evaluate_tics_functions.R diff --git a/DIMS/EvaluateTics.R b/DIMS/EvaluateTics.R new file mode 100644 index 00000000..9ed8cc78 --- /dev/null +++ b/DIMS/EvaluateTics.R @@ -0,0 +1,158 @@ +# load packages +library("ggplot2") +library("gridExtra") +library("argparse") + +parser <- ArgumentParser(description = "EvaluateTics") + +parser$add_argument("--samplesheet_file", dest = "samplesheet_file", + help = "Parsed samplesheet RData file", required = TRUE) 
+parser$add_argument("--nr_replicates", dest = "nr_replicates", + help = "Number of technical replicates per biological sample", required = TRUE) +parser$add_argument("--analysis_id", dest = "analysis_id", + help = "Name of the dataset", required = TRUE) +parser$add_argument("--matrix", dest = "matrix", + help = "Matrix used (e.g. Plasma, DBS, Research)", required = TRUE) +parser$add_argument("--highest_mz_file", dest = "highest_mz_file", + help = "File containing highest m/z value in dataset", required = TRUE) +parser$add_argument("--trim_params_file", dest = "trim_params_file", + help = "File containing values of trim parameters", required = TRUE) +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used", required = TRUE) + +args <- parser$parse_args() + +nr_replicates <- as.numeric(args$nr_replicates) +run_name <- args$analysis_id +dims_matrix <- args$matrix + +# load functions +source(paste0(args$preprocessing_scripts_dir, "evaluate_tics_functions.R")) + +# get the value for the highest m/z in the dataset +highest_mz <- get(load(args$highest_mz_file)) + +# load init_file: contains repl_pattern +load(args$samplesheet_file) + +# load trim_params_file: contains trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos +load(args$trim_params_file) + +# set the threshold for accepting or rejecting technical replicates +thresh2remove <- 1000000000 +# lower the threshold for non Plasma matrices +if (dims_matrix != "Plasma") { + thresh2remove <- 1000000000 +} +# lower the threshold for DBS or high m/z specific +if (dims_matrix == "DBS") { + thresh2remove <- 500000000 +} +if (highest_mz > 700) { + thresh2remove <- 1000000 +} + +# find out which technical replicates are below the threshold +remove_tech_reps <- find_bad_replicates(repl_pattern, thresh2remove) +print(remove_tech_reps) + +# negative scan mode +remove_neg <- remove_tech_reps$neg +repl_pattern_filtered <- 
remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) +save(repl_pattern_filtered, file = "negative_repl_pattern.RData") + +# positive scan mode +remove_pos <- remove_tech_reps$pos +repl_pattern_filtered <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) +save(repl_pattern_filtered, file = "positive_repl_pattern.RData") + +# get an overview of suitable technical replicates for both scan modes +allsamples_techreps_neg <- get_overview_tech_reps(repl_pattern_filtered, "negative") +allsamples_techreps_pos <- get_overview_tech_reps(repl_pattern_filtered, "positive") +allsamples_techreps_both_scanmodes <- rbind(allsamples_techreps_pos, allsamples_techreps_neg) +write.table(allsamples_techreps_both_scanmodes, + file = "replicates_per_sample.txt", + col.names = FALSE, + row.names = FALSE, + sep = "," +) + + +## generate TIC plots +# get all txt files +tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt") +all_samps <- sub("_TIC\\..*$", "", basename(tic_files)) + +# determine maximum intensity +highest_tic_max <- 0 +for (file in tic_files) { + tic <- read.table(file) + this_tic_max <- max(tic$tic_intensity) + if (this_tic_max > highest_tic_max) { + highest_tic_max <- this_tic_max + max_sample <- sub("_TIC\\..*$", "", basename(file)) + } +} + +# create a list with information for all TIC plots +tic_plot_list <- list() +plot_nr <- 0 +for (sample_nr in c(1:length(repl_pattern))) { + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) + sample_name <- names(repl_pattern)[sample_nr] + for (file_nr in 1:length(tech_reps)) { + plot_nr <- plot_nr + 1 + # read file with retention time, intensity and dims_threshold values + repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt")) + # get threshold values per technical replicate + dims_thresh_pos <- repl1_nr[1, "threshold"] + dims_thresh_neg <- repl1_nr[nrow(repl1_nr), "threshold"] + # for replicates with bad TIC, determine what color the border of the plot should be + 
bad_color_pos <- tech_reps[file_nr] %in% remove_pos + bad_color_neg <- tech_reps[file_nr] %in% remove_neg + if (bad_color_neg & bad_color_pos) { + plot_color <- "#F8766D" + } else if (bad_color_pos) { + plot_color <- "#ED8141" + } else if (bad_color_neg) { + plot_color <- "#BF80FF" + } else { + plot_color <- "white" + } + tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) + + geom_line(linewidth = 0.3) + + geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_left_pos, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) + + geom_segment(x = trim_left_pos, y = dims_thresh_pos, xend = trim_right_pos, yend = dims_thresh_pos, colour = "green", lty = 2) + + geom_segment(x = trim_left_neg, y = dims_thresh_neg, xend = trim_right_neg, yend = dims_thresh_neg, colour = "blue", lty = 2) + + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], " || ", sample_name)) + + theme(plot.background = element_rect(fill = plot_color), + axis.text = element_text(size = 4), + axis.title = element_text(size = 4), + plot.title = element_text(size = 6)) + tic_plot_list[[plot_nr]] <- tic_plot + } +} + +# create a layout matrix dependent on number of replicates +layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) +# put TIC plots in matrix +tic_plot_pdf <- marrangeGrob( + grobs = tic_plot_list, + nrow = 10, ncol = nr_replicates, + layout_matrix = layout, + top = quote(paste( + "TICs of run", run_name, + " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", + g, "/", npages + )) +) + +# save to file +ggsave(filename = paste0(run_name, "_TICplots.pdf"), + tic_plot_pdf, width 
= 21, height = 29.7, units = "cm") + + diff --git a/DIMS/EvaluateTics.nf b/DIMS/EvaluateTics.nf new file mode 100644 index 00000000..3f91a13e --- /dev/null +++ b/DIMS/EvaluateTics.nf @@ -0,0 +1,34 @@ +process EvaluateTics { + tag "DIMS EvaluateTics" + label 'EvaluateTics' + container = 'docker://umcugenbioinf/dims:1.4' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(rdata_file) + path(tic_txt_files) + path(init_file) + val(analysis_id) + path(highest_mz_file) + path(trim_params_file) + + output: + path('*_repl_pattern.RData'), emit: pattern_files + path('replicates_per_sample.txt'), emit: sample_techreps + path('miss_infusions_negative.txt') + path('miss_infusions_positive.txt') + path('*_TICplots.pdf'), emit: tic_plots_pdf + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/EvaluateTics.R \ + --samplesheet_file $init_file \ + --nr_replicates $params.nr_replicates \ + --analysis_id $analysis_id \ + --matrix $params.matrix \ + --highest_mz_file $highest_mz_file \ + --trim_params_file $trim_params_file \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir + """ +} + diff --git a/DIMS/preprocessing/evaluate_tics_functions.R b/DIMS/preprocessing/evaluate_tics_functions.R new file mode 100644 index 00000000..df0dbcf4 --- /dev/null +++ b/DIMS/preprocessing/evaluate_tics_functions.R @@ -0,0 +1,101 @@ +# EvaluateTics functions +find_bad_replicates <- function(repl_pattern, thresh2remove) { + #' Find technical replicates with a total intensity below a threshold + #' + #' @param repl_pattern: List of samples with corresponding technical replicates (strings) + #' @param thresh2remove: Threshold value for acceptance or rejection of total intensity (integer) + #' + #' @return remove_tech_reps: Array of rejected technical replicates (strings) + + remove_pos <- NULL + remove_neg <- NULL + cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") + for (sample_nr in 1:length(repl_pattern)) { + tech_reps <- 
as.vector(unlist(repl_pattern[sample_nr])) + for (file_nr in 1:length(tech_reps)) { + load(paste0(tech_reps[file_nr], ".RData")) + cat("\n\nParsing", tech_reps[file_nr]) + # positive scan mode: determine whether sum of intensities is above threshold + cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) + if (sum(peak_list$pos[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_pos <- c(remove_pos, tech_reps[file_nr]) + } + # negative scan mode: determine whether sum of intensities is above threshold + cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) + if (sum(peak_list$neg[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_neg <- c(remove_neg, tech_reps[file_nr]) + } + } + } + cat("\n") + # write information on miss_infusions for both scan modes + write.table(remove_pos, + file = paste0("miss_infusions_positive.txt"), + row.names = FALSE, + col.names = FALSE, + sep = "\t" + ) + write.table(remove_neg, + file = paste0("miss_infusions_negative.txt"), + row.names = FALSE, + col.names = FALSE, + sep = "\t" + ) + + # combine removed technical replicates from pos and neg + remove_tech_reps <- list(pos = remove_pos, neg = remove_neg) + return(remove_tech_reps) +} + +remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { + #' Remove technical replicates with insufficient quality from a biological sample + #' + #' @param bad_samples: Array of technical replicates of insufficient quality (strings) + #' @param repl_pattern: List of samples with corresponding technical replicates (strings) + #' @param nr_replicates: Number of technical replicates per biological sample (integer) + #' + #' @return repl_pattern_filtered: list of technical replicates of sufficient quality (strings) + + # collect list of samples to remove from replication pattern + remove_from_group <- NULL + for (sample_nr in 1:length(repl_pattern)){ + repl_pattern_1sample <- repl_pattern[[sample_nr]] + remove <- NULL + for (file_nr in 
1:length(repl_pattern_1sample)) { + if (repl_pattern_1sample[file_nr] %in% bad_samples) { + remove <- c(remove, file_nr) + } + } + if (length(remove) == nr_replicates) { + remove_from_group <- c(remove_from_group, sample_nr) + } + if (!is.null(remove)) { + repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove] + } + } + if (length(remove_from_group) != 0) { + repl_pattern_filtered <- repl_pattern[-remove_from_group] + } else { + repl_pattern_filtered <- repl_pattern + } + return(repl_pattern_filtered) +} + +get_overview_tech_reps <- function(repl_pattern_filtered, scanmode) { + #' Create an overview of technical replicates with sufficient quality from a biological sample + #' + #' @param repl_pattern_filtered: List of samples with corresponding technical replicates (strings) + #' @param scanmode: Scan mode "positive" or "negative" (string) + #' + #' @return allsamples_techreps_scanmode: Matrix of technical replicates of sufficient quality (strings) + + allsamples_techreps_scanmode <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered)) + for (sample_nr in 1:length(repl_pattern_filtered)) { + allsamples_techreps_scanmode[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr] + allsamples_techreps_scanmode[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";") + } + allsamples_techreps_scanmode[, 3] <- scanmode + return(allsamples_techreps_scanmode) +} From f7ef463e5b3fc2ba79f3275e1fce78700fff2b83 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 5 Mar 2026 16:25:14 +0100 Subject: [PATCH 20/28] added argument parsing for new PeakFinding step --- DIMS/PeakFinding.R | 108 +++++---- DIMS/PeakFinding.nf | 12 +- DIMS/preprocessing/peak_finding_functions.R | 245 ++++++++++++++++++++ 3 files changed, 316 insertions(+), 49 deletions(-) create mode 100644 DIMS/preprocessing/peak_finding_functions.R diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index 697f8f7e..99bfafc5 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R 
@@ -1,53 +1,69 @@
-## adapted from 4-peakFinding.R
+# load packages
+library(dplyr)
+library("argparse")
+
+parser <- ArgumentParser(description = "PeakFinding")
+
+parser$add_argument("--rdata_file", dest = "rdata_file",
+ help = "RData file containing binned intensity per technical replicate", required = TRUE)
+parser$add_argument("--samplesheet", dest = "techreps_scanmode",
+ help = "Samplesheet containing information on technical replicates and biological samples", required = TRUE)
+parser$add_argument("--resolution", dest = "resol",
+ help = "Value for resolution (typically 140000)", required = TRUE)
+parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir",
+ help = "File path to the directory containing functions used", required = TRUE)
+
+args <- parser$parse_args()
 
 # define parameters
-cmd_args <- commandArgs(trailingOnly = TRUE)
-
-sample_file <- cmd_args[1]
-breaks_file <- cmd_args[2]
-resol <- as.numeric(cmd_args[3])
-scripts_dir <- cmd_args[4]
-thresh <- 2000
-outdir <- "./"
-
-# load in function scripts
-source(paste0(scripts_dir, "do_peakfinding.R"))
-source(paste0(scripts_dir, "check_overlap.R"))
-source(paste0(scripts_dir, "search_mzrange.R"))
-source(paste0(scripts_dir, "fit_optim.R"))
-source(paste0(scripts_dir, "fit_gaussian.R"))
-source(paste0(scripts_dir, "fit_init.R"))
-source(paste0(scripts_dir, "get_fwhm.R"))
-source(paste0(scripts_dir, "get_stdev.R"))
-source(paste0(scripts_dir, "optimize_gaussfit.R"))
-source(paste0(scripts_dir, "fit_peaks.R"))
-source(paste0(scripts_dir, "fit_gaussians.R"))
-source(paste0(scripts_dir, "estimate_area.R"))
-source(paste0(scripts_dir, "get_fit_quality.R"))
-source(paste0(scripts_dir, "check_overlap.R"))
-source(paste0(scripts_dir, "sum_curves.R"))
-source(paste0(scripts_dir, "within_ppm.R"))
-
-load(breaks_file)
-
-# Load output of AverageTechReplicates for a sample
-sample_avgtechrepl <- get(load(sample_file))
-if (grepl("_pos", sample_file)) {
-  scanmode <- "positive"
-} else 
if (grepl("_neg", sample_file)) {
-  scanmode <- "negative"
-}
+resol <- as.numeric(args$resolution)
+# use fixed threshold between noise and signal for peak
+peak_thresh <- 2000
+
+# source functions script
+source(paste0(args$preprocessing_scripts_dir, "peak_finding_functions.R"))
+
+# Load output of AssignToBins (peak_list) for a technical replicate
+load(args$rdata_file)
+techrepl_name <- colnames(peak_list$pos)[1]
+
+# load list of technical replicates per sample that passed threshold filter
+techreps_passed <- read.table("replicates_per_sample.txt", sep = ",")
 
 # Initialize
 options(digits = 16)
-# Number used to calculate area under Gaussian curve
-int_factor <- 1 * 10^5
-# Initial value used to estimate scaling parameter
-scale <- 2
-width <- 1024
-height <- 768
-# run the findPeaks function
+# do peak finding
+scanmodes <- c("positive", "negative")
+for (scanmode in scanmodes) {
+  # get intensities for scan mode
+  if (scanmode == "positive") {
+    ints_perscanmode <- peak_list$pos
+  } else if (scanmode == "negative") {
+    ints_perscanmode <- peak_list$neg
+  }
+
+  # check whether technical replicate has passed threshold filter for this scanmode
+  techreps_scanmode <- techreps_passed[grep(scanmode, techreps_passed[, 3]), ]
+  # if techrep is ok, it will be found. If not, skip this techrep. 
+ if (length(grep(techrepl_name, techreps_scanmode)) == 0) { + break + } -# do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) -do_peakfinding(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) + # put mz and intensities into dataframe + ints_fullrange <- as.data.frame(cbind(mz = as.numeric(rownames(ints_perscanmode)), + int = as.numeric(ints_perscanmode))) + + # look for m/z range for all peaks + regions_of_interest <- search_regions_of_interest(ints_fullrange) + + # fit Gaussian curve and calculate integrated area under curve + integrated_peak_df <- integrate_peaks(ints_fullrange, regions_of_interest, resol, peak_thresh) + + # add sample name to dataframe + integrated_peak_df <- as.data.frame(cbind(samplenr = techrepl_name, integrated_peak_df)) + + # save output to file + save(integrated_peak_df, file = paste0(techrepl_name, "_", scanmode, ".RData")) + write.table(integrated_peak_df, file = paste0(techrepl_name, "_", scanmode, ".txt"), sep = "\t", row.names = FALSE) +} diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index 0e25893c..9eb283f1 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -5,13 +5,19 @@ process PeakFinding { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(path(rdata_file), path(breaks_file)) + path(rdata_file) + each path(sample_techreps) output: - path '*tive.RData' + path '*tive.RData', emit: peaklist_rdata, optional: true + path '*tive.txt', optional: true script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R \ + --rdata_file $rdata_file \ + --samplesheet $sample_techreps \ + --resolution $params.resolution \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir """ } diff --git a/DIMS/preprocessing/peak_finding_functions.R 
b/DIMS/preprocessing/peak_finding_functions.R new file mode 100644 index 00000000..6ad389a3 --- /dev/null +++ b/DIMS/preprocessing/peak_finding_functions.R @@ -0,0 +1,245 @@ +# functions for peak finding + +search_regions_of_interest <- function(ints_fullrange) { + #' Divide the full m/z range into regions of interest with indices + #' + #' @param ints_fullrange: Matrix with m/z values and intensities (float) + #' + #' @return regions_of_interest: matrix of m/z regions of interest (integer) + + # find indices where intensity is not equal to zero + nonzero_positions <- as.vector(which(ints_fullrange$int != 0)) + + # find regions of interest (look for consecutive numbers) + regions_of_interest_consec <- seqToIntervals(nonzero_positions) + # add length of regions of interest + regions_of_interest_diff <- regions_of_interest_consec[, 2] - regions_of_interest_consec[, 1] + 1 + regions_of_interest_length <- cbind(regions_of_interest_consec, length = regions_of_interest_diff) + # remove short lengths; a peak should have at least 3 data points + if (any(regions_of_interest_length[, "length"] < 3)) { + regions_of_interest_gte3 <- regions_of_interest_length[-which(regions_of_interest_length[, "length"] < 3), ] + } else { + regions_of_interest_gte3 <- regions_of_interest_length + } + # test for length of roi (region of interest). 
If length is greater than 11, break up into separate rois + remove_roi_index <- c() + new_rois_all <- regions_of_interest_gte3[0, ] + for (roi_nr in 1:nrow(regions_of_interest_gte3)) { + if (regions_of_interest_gte3[roi_nr, "length"] > 11) { + roi <- ints_fullrange[(regions_of_interest_gte3[roi_nr, "from"]:regions_of_interest_gte3[roi_nr, "to"]), ] + roi_intrange <- as.numeric(roi$int) + # look for local minima that separate the peaks + local_min_positions <- which(diff(sign(diff(roi_intrange))) == 2) + 1 + if (length(local_min_positions) > 0) { + remove_roi_index <- c(remove_roi_index, roi_nr) + # find new indices for rois after splitting + start_pos <- regions_of_interest_gte3[roi_nr, "from"] + new_rois <- data.frame(from = 0, to = 0, length = 0) + new_rois_splitroi <- regions_of_interest_gte3[0, ] + for (local_min_index in 1:length(local_min_positions)) { + new_rois[, 1] <- start_pos + new_rois[, 2] <- start_pos + local_min_positions[local_min_index] + new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1 + new_rois_splitroi <- rbind(new_rois_splitroi, new_rois) + start_pos <- new_rois[, 2] + } + # intensities after last local minimum + new_rois[, 1] <- start_pos + new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"] + new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1 + new_rois_splitroi <- rbind(new_rois_splitroi, new_rois) + # append + new_rois_all <- rbind(new_rois_all, new_rois_splitroi) + } else { + # if there are no local minima, all intensities belong to one peak. 
+ remove_roi_index <- c(remove_roi_index, roi_nr) + # look for maximum and take 5 intensities to the left and right + max_index <- which(roi_intrange == max(roi_intrange)) + max_pos <- as.numeric(rownames(roi)[max_index]) + new_roi <- data.frame(from = 0, to = 0, length = 0) + new_roi[, 1] <- max_pos - 5 + new_roi[, 2] <- max_pos + 5 + new_roi[, 3] <- new_roi[, 2] - new_roi[, 1] + 1 + # append + new_rois_all <- rbind(new_rois_all, new_roi) + } + } + } + + # remove rois that have been split into chunks or shortened + if (length(remove_roi_index) > 0) { + regions_of_interest_minus_short <- regions_of_interest_gte3[-remove_roi_index, ] + } else { + regions_of_interest_minus_short <- regions_of_interest_gte3 + } + # combine remaining rois with info on chunks + regions_of_interest_split <- rbind(regions_of_interest_minus_short, new_rois_all) + # remove regions of interest with short lengths again + if (any(regions_of_interest_split[, "length"] < 3)) { + regions_of_interest_final <- regions_of_interest_split[-which(regions_of_interest_split[, "length"] < 3), ] + } else { + regions_of_interest_final <- regions_of_interest_split + } + + # sort on first index + if (nrow(regions_of_interest_final) > 1){ + regions_of_interest_sorted <- regions_of_interest_final %>% as.data.frame %>% dplyr::arrange(from) + } else { + regions_of_interest_sorted <- regions_of_interest_final + } + + return(regions_of_interest_sorted) +} + +integrate_peaks <- function(ints_fullrange, regions_of_interest, resol, peak_thresh) { + #' Fit Gaussian peak for each region of interest and integrate area under the curve + #' + #' @param ints_fullrange: Named list of intensities (float) + #' @param regions_of_interest: Named list of intensities (float) + #' @param resol: Value for resolution (integer) + #' @param peak_thresh: Value for noise level threshold (integer) # NOT USED YET + #' + #' @return allpeaks_values: matrix of integrated peaks + + # initialize dataframe to store results for all peaks + 
allpeaks_values <- matrix(0, nrow = nrow(regions_of_interest), ncol = 5) + colnames(allpeaks_values) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + + for (roi_nr in 1:nrow(regions_of_interest)) { + # find m/z values and intensities corresponding to the region of interest + index_range <- regions_of_interest[roi_nr, "from"] : regions_of_interest[roi_nr, "to"] + roi_mzrange <- as.numeric(ints_fullrange$mz[index_range]) + roi_intrange <- as.numeric(ints_fullrange$int[index_range]) + # find m/z value for maximum of peak (mu in Gaussian function) + weighted_mzmean <- weighted.mean(roi_mzrange, roi_intrange) + # find expected peak width at this m/z value + fwhm_mzmed <- get_fwhm(weighted_mzmean, resol) + # calculate sigma for Gaussian curve (https://en.wikipedia.org/wiki/Full_width_at_half_maximum) + sigma_peak <- fwhm_mzmed / 2.355 + # find scale factor for Gaussian. Initial estimate is maximum intensity in roi + # divided by 2 for better correlation with intensities from old PeakFinding method + scale_factor <- max(roi_intrange) / 2 + # fit Gaussian peak. 
Find intensities according to normal distribution + normal_ints <- scale_factor * gaussfunc(roi_mzrange, weighted_mzmean, sigma_peak) + # sum intensities in roi + sum_ints <- sum(roi_intrange) + sum_gauss <- sum(normal_ints) + # calculate quality of fit + quality_fit <- 1 - sum(abs(normal_ints - roi_intrange)) / sum_ints + # put all values into dataframe + allpeaks_values[roi_nr, "mzmed.pkt"] <- weighted_mzmean + allpeaks_values[roi_nr, "fq"] <- quality_fit + allpeaks_values[roi_nr, "mzmax.pkt"] <- max(roi_mzrange) + allpeaks_values[roi_nr, "mzmin.pkt"] <- min(roi_mzrange) + allpeaks_values[roi_nr, "height.pkt"] <- sum_gauss + } + + # remove peaks with height = 0, look for NA or NaN + remove_na <- which(is.na(as.numeric(allpeaks_values[, "height.pkt"]))) + if (length(remove_na) > 0) { + allpeaks_values <- allpeaks_values[-remove_na, ] + } + + return(allpeaks_values) +} + +# in the next version of the docker image, package R.utils will be included +seqToIntervals <- function(idx) { + #' Find consecutive stretches of numbers + #' function seqToIntervals copied from R.utils library + #' see https://rdrr.io/cran/R.utils/src/R/seqToIntervals.R + #' + #' @param idx: Sequence of indices (integers) + #' + #' @return res: Matrix of start and end positions of consecutive numbers (matrix) + + # Clean up sequence + idx <- as.integer(idx) + idx <- unique(idx) + idx <- sort(idx) + + len_idx <- length(idx) + if (len_idx == 0L) { + res <- matrix(NA_integer_, nrow = 0L, ncol = 2L) + colnames(res) <- c("from", "to") + return(res) + } + + # Identify end points of intervals + diff_idx <- diff(idx) + diff_idx <- (diff_idx > 1) + diff_idx <- which(diff_idx) + nr_intervals <- length(diff_idx) + 1 + + # Allocate return matrix + res <- matrix(NA_integer_, nrow = nr_intervals, ncol = 2L) + colnames(res) <- c("from", "to") + + from_value <- idx[1] + to_value <- from_value - 1 + last_value <- from_value + + count <- 1 + for (running_index in seq_along(idx)) { + value <- idx[running_index] + 
if (value - last_value > 1) { + to_value <- last_value + res[count, ] <- c(from_value, to_value) + from_value <- value + count <- count + 1 + } + last_value <- value + } + + if (to_value < from_value) { + to_value <- last_value + res[count, ] <- c(from_value, to_value) + } + + return(res) +} + +get_fwhm <- function(query_mass, resol) { + #' Calculate fwhm (full width at half maximum intensity) for a peak + #' + #' @param query_mass: Value for mass (float) + #' @param resol: Value for resolution (integer) + #' + #' @return fwhm: Value for full width at half maximum (float) + + # set aberrant values of query_mass to default value of 200 + if (is.na(query_mass)) { + query_mass <- 200 + } + if (query_mass < 0) { + query_mass <- 200 + } + # calculate resolution at given m/z value + resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200))) + # calculate full width at half maximum + fwhm <- query_mass / resol_mz + + return(fwhm) +} + + +# from https://rdrr.io/cran/rvmethod/src/R/gaussfit.R +#' Gaussian Function from package rvmethod +#' +#' This function returns the unnormalized (height of 1.0) Gaussian curve with a +#' given center and spread. 
+#'
+#' @param x the vector of values at which to evaluate the Gaussian
+#' @param mu the center of the Gaussian
+#' @param sigma the spread of the Gaussian (must be greater than 0)
+#' @return vector of values of the Gaussian
+#' @examples x = seq(-4, 4, length.out = 100)
+#' y = gaussfunc(x, 0, 1)
+#' plot(x, y)
+#'
+#' @import stats
+#'
+#' @export
+gaussfunc <- function(x, mu, sigma) {
+  return(exp(-((x - mu) ^ 2) / (2 * (sigma ^ 2))))
+}

From 9915f55eee27d10928f4f8aed524cf0cb6ac8314 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves
Date: Thu, 5 Mar 2026 18:00:36 +0100
Subject: [PATCH 21/28] added argument parsing to DIMS/AveragePeaks step

---
 DIMS/AveragePeaks.R | 46 ++++++++++++++++++++
 DIMS/AveragePeaks.nf | 22 ++++++++++
 DIMS/preprocessing/average_peaks_functions.R | 41 +++++++++++++++++
 3 files changed, 109 insertions(+)
 create mode 100644 DIMS/AveragePeaks.R
 create mode 100644 DIMS/AveragePeaks.nf
 create mode 100644 DIMS/preprocessing/average_peaks_functions.R

diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R
new file mode 100644
index 00000000..bdfc6cde
--- /dev/null
+++ b/DIMS/AveragePeaks.R
@@ -0,0 +1,46 @@
+# load required packages
+library("dplyr")
+library("argparse")
+
+parser <- ArgumentParser(description = "AveragePeaks")
+
+parser$add_argument("--sample_id", dest = "sample_name",
+ help = "Name of a biological sample", required = TRUE)
+parser$add_argument("--tech_reps", dest = "tech_reps",
+ help = "Names of the technical replicates belonging to the biological sample", required = TRUE)
+parser$add_argument("--scanmode", dest = "scanmode",
+ help = "Scan mode (either positive or negative)", required = TRUE)
+parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir",
+ help = "File path to the directory containing functions used", required = TRUE)
+
+args <- parser$parse_args()
+
+# define parameters
+sample_name <- args$sample_id
+tech_reps <- strsplit(args$tech_reps, ";")[[1]]
+
+# load in function scripts 
+source(paste0(args$preprocessing_scripts_dir, "average_peaks_functions.R")) + +# Initialize per sample +peaklist_allrepl <- NULL +nr_repl_persample <- 0 +averaged_peaks <- matrix(0, nrow = 0, ncol = 6) +colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + +# load RData files of technical replicates belonging to biological sample +for (file_nr in 1:length(tech_reps)) { + tech_repl_file <- paste0(tech_reps[file_nr], "_", args$scanmode, ".RData") + tech_repl <- get(load(tech_repl_file)) + # combine data for all technical replicates + peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl) +} +# sort on mass +peaklist_allrepl_df <- as.data.frame(peaklist_allrepl) +peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) +peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) +peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt) + +# average over technical replicates +averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted, sample_name) +save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", args$scanmode, ".RData")) diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf new file mode 100644 index 00000000..65c136ec --- /dev/null +++ b/DIMS/AveragePeaks.nf @@ -0,0 +1,22 @@ +process AveragePeaks { + tag "DIMS AveragePeaks" + label 'AveragePeaks' + container = 'docker://umcugenbioinf/dims:1.4' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(rdata_files) + tuple val(sample_id), val(tech_reps), val(scanmode) + + output: + path 'AvgPeaks_*.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R \ + --sample_id $sample_id \ + --tech_reps $tech_reps \ + --scanmode $scanmode \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir + """ +} diff --git a/DIMS/preprocessing/average_peaks_functions.R b/DIMS/preprocessing/average_peaks_functions.R new file mode 100644 index 00000000..beed29bb --- /dev/null 
+++ b/DIMS/preprocessing/average_peaks_functions.R @@ -0,0 +1,41 @@ +average_peaks_per_sample <- function(peaklist_allrepl_sorted, sample_name) { + #' Average the intensity of peaks that occur in different technical replicates of a biological sample + #' + #' @param peaklist_allrepl_sorted: Dataframe with peaks sorted on median m/z (float) + #' + #' @return averaged_peaks: matrix of averaged peaks (float) + + # initialize + averaged_peaks <- peaklist_allrepl_sorted[0, ] + # set ppm as fixed value, not the same ppm as in peak grouping + ppm_peak <- 2 + + while (nrow(peaklist_allrepl_sorted) > 1) { + # store row numbers + peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted) + # find the peaks in the dataset with corresponding m/z plus or minus tolerance + reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1] + mz_tolerance <- (reference_mass * ppm_peak) / 10^6 + minmz_ref <- reference_mass - mz_tolerance + maxmz_ref <- reference_mass + mz_tolerance + select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref)) + select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ] + nrsamples <- length(select_peak_indices) + # put averaged intensities into a new row + averaged_1peak <- matrix(0, nrow = 1, ncol = 6) + colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + # calculate m/z values for peak group + averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt) + averaged_1peak[1, "fq"] <- nrsamples + averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt) + # remove rownr column and append to averaged_peaks + peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ] + averaged_peaks <- rbind(averaged_peaks, averaged_1peak) + } + # add sample name to first column + averaged_peaks[ , 
"samplenr"] <- sample_name + + return(averaged_peaks) +} From b8ea927def4749b3c06b0192e42e5128e236b826 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 13:58:54 +0100 Subject: [PATCH 22/28] fixed issue with sample_name in DIMS/AveragePeaks step --- DIMS/AveragePeaks.R | 7 +++---- DIMS/AveragePeaks.nf | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R index bdfc6cde..b6dbb69b 100644 --- a/DIMS/AveragePeaks.R +++ b/DIMS/AveragePeaks.R @@ -4,7 +4,7 @@ library("argparse") parser <- ArgumentParser(description = "AveragePeaks") -parser$add_argument("--sample_id", dest = "sample_name", +parser$add_argument("--sample_name", dest = "sample_name", help = "Name of a biological sample", required = TRUE) parser$add_argument("--tech_reps", dest = "tech_reps", help = "Names of the technical replicates belonging to the biological sample", required = TRUE) @@ -16,7 +16,6 @@ parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts args <- parser$parse_args() # define parameters -sample_name <- args$sample_id tech_reps <- strsplit(args$tech_reps, ";")[[1]] # load in function scripts @@ -42,5 +41,5 @@ peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt) # average over technical replicates -averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted, sample_name) -save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", args$scanmode, ".RData")) +averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted, args$sample_name) +save(averaged_peaks, file = paste0("AvgPeaks_", args$sample_name, "_", args$scanmode, ".RData")) diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf index 65c136ec..9a7e5824 100644 --- a/DIMS/AveragePeaks.nf +++ b/DIMS/AveragePeaks.nf @@ -14,7 +14,7 @@ process AveragePeaks { script: """ Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R \ - 
--sample_id $sample_id \ + --sample_name $sample_id \ --tech_reps $tech_reps \ --scanmode $scanmode \ --preprocessing_scripts_dir $params.preprocessing_scripts_dir From bfc997fe4129c74293a4c20deb60ddb69fde0f0d Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 16:02:51 +0100 Subject: [PATCH 23/28] added argument parsing for DIMS/PeakGrouping step --- DIMS/PeakGrouping.R | 38 +++++++++++++++++++++++--------------- DIMS/PeakGrouping.nf | 8 ++++++-- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index 4435b033..93400881 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -1,41 +1,49 @@ -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) - # load required packages library("dplyr") +library("argparse") + +parser <- ArgumentParser(description = "PeakGrouping") + +parser$add_argument("--hmdbpart_file", dest = "hmdbpart_file", + help = "RData file containing part of HMDB database (plus adducts and isotopes)", required = TRUE) +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used", required = TRUE) +parser$add_argument("--ppm", dest = "ppm", + help = "Value for ppm deviation allowed in peak group annotation", required = TRUE) + +args <- parser$parse_args() -hmdb_part_file <- cmd_args[1] -preprocessing_scripts_dir <- cmd_args[2] -ppm <- as.numeric(cmd_args[3]) +hmdbpart_file <- args$hmdbpart_file +ppm <- as.numeric(args$ppm) # load in function scripts -source(paste0(preprocessing_scripts_dir, "peak_grouping_functions.R")) +source(paste0(args$preprocessing_scripts_dir, "peak_grouping_functions.R")) options(digits = 16) # load part of the HMDB -hmdb_add_iso <- get(load(hmdb_part_file)) +hmdb_add_iso <- get(load(args$hmdbpart_file)) # make sure any internal standard entries are at the top of the dataframe internal_standard_indices <- grep("\\(IS\\)", hmdb_add_iso$CompoundName) if 
(length(internal_standard_indices) > 0) { hmdb_add_iso <- rbind(hmdb_add_iso[internal_standard_indices, ], hmdb_add_iso[-internal_standard_indices, ]) } -# determine appropriate scanmode based on hmdb_part_file -if (grepl("negative", basename(hmdb_part_file))) { +# determine appropriate scanmode based on hmdbpart_file +if (grepl("negative", basename(hmdbpart_file))) { scanmode <- "negative" column_label <- "MNeg" -} else if (grepl("positive", basename(hmdb_part_file))) { +} else if (grepl("positive", basename(hmdbpart_file))) { scanmode <- "positive" column_label <- "Mpos" } # determine batch number of HMDB part file -batch_number <- strsplit(basename(hmdb_part_file), ".", fixed = TRUE)[[1]][2] +batch_number <- strsplit(basename(hmdbpart_file), ".", fixed = TRUE)[[1]][2] -# load file with spectrum peaks -spec_peaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData") -load(spec_peaks_file) +# load file with averaged peaks +avg_peaks_file <- paste0("AvgPeaks_", scanmode, ".RData") +load(avg_peaks_file) outlist_df <- as.data.frame(outlist_total) outlist_df$mzmed.pkt <- as.numeric(outlist_df$mzmed.pkt) outlist_df$height.pkt <- as.numeric(outlist_df$height.pkt) diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index 7f3f17e4..d10407f5 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -6,13 +6,17 @@ process PeakGrouping { input: path(hmdbpart_file) - each path(spectrumpeak_file) + path(averagedpeaks_file) + each path(pattern_file) output: path '*_identified.RData', emit: grouped_identified script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $hmdbpart_file $params.preprocessing_scripts_dir $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R \ + --hmdbpart_file $hmdbpart_file \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir \ + --ppm $params.ppm """ } From 9ee9b1be90079b636ed014fe8965e75e5f3adcd8 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 16:16:27 +0100 Subject: [PATCH 24/28] 
added argument parsing for DIMS/CollectAveraged step --- DIMS/CollectAveraged.R | 13 +++++++++++++ DIMS/CollectAveraged.nf | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 DIMS/CollectAveraged.R create mode 100644 DIMS/CollectAveraged.nf diff --git a/DIMS/CollectAveraged.R b/DIMS/CollectAveraged.R new file mode 100644 index 00000000..68f3ab0c --- /dev/null +++ b/DIMS/CollectAveraged.R @@ -0,0 +1,13 @@ +# for each scan mode, collect all averaged peak lists per biological sample +scanmodes <- c("positive", "negative") +for (scanmode in scanmodes) { + # get list of files + filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, ".RData")) + # load files and combine into one object + outlist_total <- NULL + for (file_nr in 1:length(filled_files)) { + peaklist_averaged <- get(load(filled_files[file_nr])) + outlist_total <- rbind(outlist_total, peaklist_averaged) + } + save(outlist_total, file = paste0("AvgPeaks_", scanmode, ".RData")) +} diff --git a/DIMS/CollectAveraged.nf b/DIMS/CollectAveraged.nf new file mode 100644 index 00000000..2f3d1a70 --- /dev/null +++ b/DIMS/CollectAveraged.nf @@ -0,0 +1,17 @@ +process CollectAveraged { + tag "DIMS CollectAveraged" + label 'CollectAveraged' + container = 'docker://umcugenbioinf/dims:1.4' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(averaged_files) + + output: + path('AvgPeaks*.RData'), emit: averaged_peaks + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/CollectAveraged.R + """ +} From d546a7729abcf68c4c72047b3e88dd0d968c2d82 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 16:48:49 +0100 Subject: [PATCH 25/28] added argument parsing for DIMS/FillMissing step --- DIMS/FillMissing.R | 42 ++++++++++----------- DIMS/FillMissing.nf | 5 ++- DIMS/preprocessing/fill_missing_functions.R | 36 ++++++++++++++++++ 3 files changed, 60 insertions(+), 23 deletions(-) create mode 100644 DIMS/preprocessing/fill_missing_functions.R diff --git 
a/DIMS/FillMissing.R b/DIMS/FillMissing.R index a76c163b..e349fc0e 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -1,25 +1,22 @@ -## adapted from 9-runFillMissing.R - # define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +library("argparse") + +parser <- ArgumentParser(description = "FillMissing") + +parser$add_argument("--peakgrouplist_file", dest = "peakgrouplist_file", + help = "Peak group list file", required = TRUE) +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used", required = TRUE) +parser$add_argument("--thresh_noise", dest = "thresh_noise", + help = "Threshold value for noise on peak level (typically 2000)", required = TRUE) + +args <- parser$parse_args() -peakgrouplist_file <- cmd_args[1] -scripts_dir <- cmd_args[2] -thresh <- as.numeric(cmd_args[3]) -resol <- as.numeric(cmd_args[4]) -ppm <- as.numeric(cmd_args[5]) -outdir <- "./" +peakgrouplist_file <- args$peakgrouplist_file +thresh <- as.numeric(args$thresh_noise) # load in function scripts -source(paste0(scripts_dir, "replace_zeros.R")) -source(paste0(scripts_dir, "fit_optim.R")) -source(paste0(scripts_dir, "get_fwhm.R")) -source(paste0(scripts_dir, "get_stdev.R")) -source(paste0(scripts_dir, "estimate_area.R")) -source(paste0(scripts_dir, "optimize_gaussfit.R")) -source(paste0(scripts_dir, "identify_noisepeaks.R")) -source(paste0(scripts_dir, "get_element_info.R")) -source(paste0(scripts_dir, "atomic_info.R")) +source(paste0(args$preprocessing_scripts_dir, "fill_missing_functions.R")) # determine scan mode if (grepl("_pos", peakgrouplist_file)) { @@ -33,12 +30,13 @@ pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) # load peak group list and determine output file name -outpgrlist_identified <- get(load(peakgrouplist_file)) - -outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) +peakgroup_list <- 
get(load(peakgrouplist_file)) # replace missing values (zeros) with random noise -peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) +peakgrouplist_filled <- fill_missing_intensities(peakgroup_list, repl_pattern, thresh) + +# set name of output file +outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) # save output save(peakgrouplist_filled, file = outputfile_name) diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf index 7678c95e..d52c31ce 100644 --- a/DIMS/FillMissing.nf +++ b/DIMS/FillMissing.nf @@ -13,6 +13,9 @@ process FillMissing { script: """ - Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R \ + --peakgrouplist_file $peakgrouplist_file \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir \ + --thresh_noise $params.thresh """ } diff --git a/DIMS/preprocessing/fill_missing_functions.R b/DIMS/preprocessing/fill_missing_functions.R new file mode 100644 index 00000000..e1c455d5 --- /dev/null +++ b/DIMS/preprocessing/fill_missing_functions.R @@ -0,0 +1,36 @@ +fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, not_random = FALSE) { + #' Replace intensities that are zero with random value + #' + #' @param peakgroup_list: Peak groups (matrix) + #' @param repl_pattern: Replication pattern (list of strings) + #' @param thresh: Value for threshold between noise and signal (integer) + #' + #' @return final_outlist: peak groups with filled-in intensities (matrix) + + # for unit test, turn off randomness + if (not_random) { + set.seed(123) + } + + # replace missing intensities with random values around threshold + if (!is.null(peakgroup_list)) { + for (sample_index in seq_along(names(repl_pattern))) { + sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]] + zero_intensity <- which(sample_peaks <= 
0) + if (!length(zero_intensity)) { + next + } + for (zero_index in seq_along(zero_intensity)) { + peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, + mean = thresh, + sd = 100) + } + } + + # Add column with average intensity; find intensity columns first + int_cols <- which(colnames(peakgroup_list) %in% names(repl_pattern)) + peakgroup_list <- cbind(peakgroup_list, "avg.int" = apply(peakgroup_list[, int_cols], 1, mean)) + + return(peakgroup_list) + } +} From 4ed43c484b059c339af0f8e4961d9c41e81627a8 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 16:49:55 +0100 Subject: [PATCH 26/28] added argument parsing for DIMS/CollectFilled step --- DIMS/CollectFilled.R | 30 +++-- DIMS/CollectFilled.nf | 7 +- DIMS/preprocessing/collect_filled_functions.R | 118 ++++++++++++++++++ 3 files changed, 139 insertions(+), 16 deletions(-) create mode 100644 DIMS/preprocessing/collect_filled_functions.R diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index 265db5d3..a0e4136f 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -1,17 +1,21 @@ -# load package +# define parameters library("argparse") parser <- ArgumentParser(description = "CollectFilled") -parser$add_argument("--scripts_dir", dest = "scripts_dir", +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", help = "File path to the directory containing functions used in this script", required = TRUE) -parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", - help = "Numeric value, z-score = 1", required = TRUE) +parser$add_argument("--ppm_value", dest = "ppm_value", + help = "Value for allowed ppm deviation (e.g. 
5)", required = TRUE) +parser$add_argument("--z_score", dest = "z_score", + help = "Boolean indicating whether Z-scores should be calculated (1) or not (0)", required = TRUE) args <- parser$parse_args() -source(paste0(args$scripts_dir, "merge_duplicate_rows.R")) -source(paste0(args$scripts_dir, "calculate_zscores.R")) +ppm <- as.numeric(args$ppm_value) +z_score <- as.numeric(args$z_score) + +source(paste0(args$preprocessing_scripts_dir, "collect_filled_functions.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") @@ -32,8 +36,8 @@ for (scanmode in scanmodes) { pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) # calculate Z-scores - if (args$z_score == 1) { - outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE) + if (z_score == 1) { + outlist_stats <- calculate_zscores(outlist_total) nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) outlist_stats_more <- cbind( @@ -53,11 +57,11 @@ for (scanmode in scanmodes) { # make a copy of the outlist outlist_ident <- outlist_total - # take care of NAs in theormz_noise - outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0 - outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise) - outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0 - outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise) + # select identified peak groups if ppm deviation is within limits + if (z_score == 1) { + outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev) + outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ] + } # Extra output in Excel-readable format: remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index 
1f174448..2773c1f5 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -14,8 +14,9 @@ process CollectFilled { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R \\ - --scripts_dir $params.scripts_dir \\ - --z_score $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir \ + --ppm_value $params.ppm \ + --z_score $params.zscore """ } diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R new file mode 100644 index 00000000..979dd1d1 --- /dev/null +++ b/DIMS/preprocessing/collect_filled_functions.R @@ -0,0 +1,118 @@ +# CollectFilled functions + +collapse <- function(column_label, peakgroup_list, index_dup) { + #' Collapse identification info for peak groups with the same mass + #' + #' @param column_label: Name of column in peakgroup_list (string) + #' @param peakgroup_list: Peak group list (matrix) + #' @param index_dup: Index of duplicate peak group (integer) + #' + #' @return collapsed_items: Semicolon-separated list of info (string) + # get the item(s) that need to be collapsed + list_items <- as.vector(peakgroup_list[index_dup, column_label]) + # remove NA + if (length(which(is.na(list_items))) > 0) { + list_items <- list_items[-which(is.na(list_items))] + } + collapsed_items <- paste(list_items, collapse = ";") + return(collapsed_items) +} + +merge_duplicate_rows <- function(peakgroup_list) { + #' Merge identification info for peak groups with the same mass + #' + #' @param peakgroup_list: Peak group list (matrix) + #' + #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) + + options(digits = 16) + collect <- NULL + remove <- NULL + + # check for peak groups with identical mass + index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"])) + + while (length(index_dup) > 0) { + # get the index for the peak group which is double + peaklist_index <- which(peakgroup_list[, 
"mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"]) + single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE] + + # use function collapse to concatenate info + single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index) + single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index) + single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index) + single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index) + single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index) + if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA + + # keep track of deduplicated entries + collect <- rbind(collect, single_peakgroup) + remove <- c(remove, peaklist_index) + + # remove current entry from index + index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])] + } + + # remove duplicate entries + if (!is.null(remove)) { + peakgroup_list <- peakgroup_list[-remove, ] + } + # append deduplicated entries + peakgroup_list_dedup <- rbind(peakgroup_list, collect) + return(peakgroup_list_dedup) +} + +calculate_zscores <- function(peakgroup_list) { + #' Calculate Z-scores for peak groups based on average and standard deviation of controls + #' + #' @param peakgroup_list: Peak group list (matrix) + #' @param sort_col: Column to sort on (string) + #' @param adducts: Parameter indicating whether there are adducts in the list (boolean) + #' + #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) + + case_label <- "P" + control_label <- "C" + # get index for new column names + startcol <- ncol(peakgroup_list) + 3 + + # calculate mean and standard deviation for Control group + ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE) + case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE) 
+ int_cols <- c(ctrl_cols, case_cols) + # set all zeros to NA + peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA + ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE] + peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) + peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) + + # set new column names and calculate Z-scores + colnames_zscores <- NULL + for (col_index in int_cols) { + col_name <- colnames(peakgroup_list)[col_index] + colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore")) + zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) - + peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls + peakgroup_list <- cbind(peakgroup_list, zscores_1col) + } + + # apply new column names to columns at end plus avg and sd columns + colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores + + # add ppm deviation column + zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE) + # calculate ppm deviation + for (row_index in seq_len(nrow(peakgroup_list))) { + if (!is.na(peakgroup_list$theormz_HMDB[row_index]) && + !is.null(peakgroup_list$theormz_HMDB[row_index]) && + (peakgroup_list$theormz_HMDB[row_index] != "")) { + peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) - + as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) / + as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index])) + } else { + peakgroup_list$ppmdev[row_index] <- NA + } + } + + return(peakgroup_list) +} From 7b0d9b68cec9b26d1d6dcbb6aed7307142bbd0cf Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 17:19:03 +0100 Subject: [PATCH 27/28] added argument parsing for DIMS/SumAdducts step --- DIMS/SumAdducts.R | 23 +++++++++++++++++------ DIMS/SumAdducts.nf | 5 ++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/DIMS/SumAdducts.R 
b/DIMS/SumAdducts.R index 489ae179..bfd40161 100644 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -1,13 +1,24 @@ +# load libraries library("dplyr") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +library("argparse") + +parser <- ArgumentParser(description = "SumAdducts") + +parser$add_argument("--hmdbpart_main_file", dest = "hmdbpart_main_file", + help = "RData file containing part of HMDB database (only main metabolites)", required = TRUE) +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used", required = TRUE) +parser$add_argument("--z_score", dest = "z_score", + help = "Boolean indicating whether Z-scores should be calculated (1) or not (0)", required = TRUE) -hmdbpart_main_file <- cmd_args[1] -scripts_dir <- cmd_args[2] -z_score <- as.numeric(cmd_args[3]) +args <- parser$parse_args() + +# define parameters +hmdbpart_main_file <- args$hmdbpart_main_file +z_score <- as.numeric(args$z_score) # load in function scripts -source(paste0(scripts_dir, "sum_intensities_adducts.R")) +source(paste0(args$preprocessing_scripts_dir, "sum_intensities_adducts.R")) if (grepl("positive_hmdb", hmdbpart_main_file)) { scanmode <- "positive" diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf index a667e6e0..6b67c6a6 100644 --- a/DIMS/SumAdducts.nf +++ b/DIMS/SumAdducts.nf @@ -13,6 +13,9 @@ process SumAdducts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/SumAdducts.R $hmdbpart_main_file $params.preprocessing_scripts_dir $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/SumAdducts.R \ + --hmdbpart_main_file $hmdbpart_main_file \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir \ + --z_score $params.zscore """ } From d62a7025b77328b63379b3113885303db6e9b0ac Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 6 Mar 2026 17:28:10 +0100 Subject: [PATCH 28/28] added argument parsing for DIMS/CollectSumAdducts step --- 
DIMS/CollectSumAdducts.R | 12 ++++++------ DIMS/CollectSumAdducts.nf | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R index 10752add..2bf4cf0b 100755 --- a/DIMS/CollectSumAdducts.R +++ b/DIMS/CollectSumAdducts.R @@ -1,12 +1,12 @@ -## Combining all AdductSums part files for each scanmode and -# combine intensities if present in both scanmodes -library("argparse") +# load libraries suppressMessages(library("dplyr")) +library("argparse") -parser <- ArgumentParser(description = "GenerateExcel") +# define parameters +parser <- ArgumentParser(description = "CollectSumAdducts") parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", - help = "File path to the directory containing functions used in this script", required = TRUE) + help = "File path to the directory containing functions used", required = TRUE) args <- parser$parse_args() @@ -18,6 +18,6 @@ save(outlist_tot_pos, file = "AdductSums_positive.RData") outlist_tot_neg <- combine_sum_adduct_parts("negative") save(outlist_tot_neg, file = "AdductSums_negative.RData") -# combine intensities of both scanmodi +# combine intensities of both scan modes outlist <- combine_scanmodes_intensities(outlist_tot_pos, outlist_tot_neg) save(outlist, file = "AdductSums_combined.RData") diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf index 06aa82c8..ee8ec229 100644 --- a/DIMS/CollectSumAdducts.nf +++ b/DIMS/CollectSumAdducts.nf @@ -13,7 +13,7 @@ process CollectSumAdducts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R \\ - --preprocessing_scripts_dir $params.preprocessing_scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R \ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir """ }