UMCUGenetics · mraves2 · Feb 9, 2026 · Feb 9, 2026 · Feb 9, 2026 · Feb 9, 2026
diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf
diff --git a/DIMS/ParseSamplesheet.R b/DIMS/ParseSamplesheet.R
@@ -0,0 +1,19 @@
+# Parse the samplesheet and store the replication pattern, i.e. which
+# technical replicate files belong to which sample.
+#
+# Command line arguments:
+#   1: path to the tab-separated samplesheet
+#   2: directory that holds parse_samplesheet_functions.R
+
+# define parameters
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) < 2) {
+  stop(
+    "usage: ParseSamplesheet.R <samplesheet> <preprocessing_scripts_dir>",
+    call. = FALSE
+  )
+}
+
+# check.names = FALSE keeps headers such as "File Name" as they are;
+# by default read.csv rewrites them to "File.Name"
+sample_sheet <- read.csv(args[1], sep = "\t", check.names = FALSE)
+preprocessing_scripts_dir <- args[2]
+
+# load in function script
+# file.path works with or without a trailing slash on the directory
+source(file.path(preprocessing_scripts_dir, "parse_samplesheet_functions.R"))
+
+# generate the replication pattern
+repl_pattern <- generate_repl_pattern(sample_sheet)
+
+# write the replication pattern to text file for troubleshooting purposes
+# capture.output restores the output stream itself, also when print fails
+writeLines(capture.output(print(repl_pattern)), "replication_pattern.txt")
+
+# save replication pattern to file
+save(repl_pattern, file = "init.RData")
diff --git a/DIMS/ParseSamplesheet.nf b/DIMS/ParseSamplesheet.nf
@@ -0,0 +1,18 @@
+// Run ParseSamplesheet.R on the samplesheet.
+// The R script is called with the samplesheet and params.preprocessing_scripts_dir
+// and writes init.RData and replication_pattern.txt, which are the outputs of this process.
+process ParseSamplesheet {
+    tag "DIMS ParseSamplesheet"
+    label 'ParseSamplesheet'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    // bash flags: stop on a failing command (-e), on an unset variable (-u)
+    // and on a failure anywhere in a pipeline (-o pipefail)
+    shell = ['/bin/bash', '-euo', 'pipefail']
-    shell = ['/bin/bash', '-euo', 'pipefail']
+ 
-    shell = ['/bin/bash', '-euo', 'pipefail']
+ 
+
+    // samplesheet file, passed to the R script as its first argument
+    input:
+       path(samplesheet) 
+
+    // files written by ParseSamplesheet.R in the task work directory
+    output:
+       path('init.RData')
+       path('replication_pattern.txt')
+
+    // second argument is the directory from which the R script sources its functions
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/ParseSamplesheet.R $samplesheet $params.preprocessing_scripts_dir
+        """
+}
diff --git a/DIMS/preprocessing/parse_samplesheet_functions.R b/DIMS/preprocessing/parse_samplesheet_functions.R
@@ -0,0 +1,29 @@
+# function for parse_samplesheet
+generate_repl_pattern <- function(sample_sheet) {
+  #' Generate replication pattern list based on information in sample_sheet
+  #'
+  #' Sample names are trimmed and every character that is not a letter,
+  #' number, hyphen or period is replaced by an underscore. File names are
+  #' grouped on these cleaned sample names, so rows whose sample names only
+  #' differ in such characters or in surrounding whitespace end up together.
+  #'
+  #' @param sample_sheet: data frame with a file name column (File_Name,
+  #'   File Name or File.Name) and a sample name column (Sample_Name,
+  #'   Sample Name or Sample.Name)
+  #'
+  #' @return repl_pattern: named list with one entry per cleaned sample name,
+  #'   sorted by name, holding the file names (technical replicates) of that sample
+
+  # get the right columns from the samplesheet
+  # the period variant is what read.csv makes of a space in a header
+  file_name_col <- grep("File[_ .]Name", colnames(sample_sheet))
+  sample_name_col <- grep("Sample[_ .]Name", colnames(sample_sheet))
+  if (length(file_name_col) != 1 || length(sample_name_col) != 1) {
+    stop(
+      "sample_sheet needs exactly one file name and one sample name column",
+      call. = FALSE
+    )
+  }
+
+  # clean the sample name of every row: trim whitespace and replace all
+  # characters which are not letters, numbers, hyphens and periods by "_"
+  row_sample_names <- trimws(as.character(sample_sheet[[sample_name_col]]))
+  row_sample_names <- gsub("[^-.[:alnum:]]", "_", row_sample_names)
+
+  # get the unique sample names from the samplesheet
+  sample_names <- sort(unique(row_sample_names))
+
+  # create replication pattern (which technical replicates belong to which sample)
+  # grouping on the cleaned names keeps rows and list names in agreement;
+  # the explicit levels fix the order of the list to that of sample_names
+  repl_pattern <- split(
+    sample_sheet[[file_name_col]],
+    factor(row_sample_names, levels = sample_names)
+  )
+
+  return(repl_pattern)
+}
+
+
diff --git a/DIMS/tests/testthat/parse_samplesheet_functions.R b/DIMS/tests/testthat/parse_samplesheet_functions.R
@@ -0,0 +1,24 @@
+# unit tests for ParseSamplesheet
+# function: generate_repl_pattern
+
+# source all functions for ParseSamplesheet
+source("../../preprocessing/parse_samplesheet_functions.R")
-# source all functions for ParseSamplesheet
-source("../../preprocessing/parse_samplesheet_functions.R")
+# source all functions for ParseSamplesheet
+source("../preprocessing/parse_samplesheet_functions.R")
-# source all functions for ParseSamplesheet
-source("../../preprocessing/parse_samplesheet_functions.R")
+# source all functions for ParseSamplesheet
+source("../preprocessing/parse_samplesheet_functions.R")
+
+# test generate_repl_pattern
+testthat::test_that("replication pattern is correctly generated", {
+  # create sample sheet to test on:
+  # six files, two technical replicates for each of the samples C1, P2 and P3
+  test_file_names <- paste0("RES_20260101_", sprintf("%03d", 1:6))
+  test_sample_names <- rep(c("C1", "P2", "P3"), each = 2)
+  test_sample_sheet <- data.frame(File_Name = test_file_names, Sample_Name = test_sample_names)
+  test_repl_pattern <- generate_repl_pattern(test_sample_sheet)
+
+  # test that a list of length 3 is generated
+  expect_length(test_repl_pattern, 3)
+  # test list names
+  expect_equal(names(test_repl_pattern), unique(test_sample_names))
+  # test that the files end up with the right sample
+  expect_equal(test_repl_pattern$C1, test_file_names[1:2])
+
+  # test what happens if any sample name is used twice
+  test_sample_names <- gsub("P3", "P2", test_sample_names)
+  test_sample_sheet <- data.frame(File_Name = test_file_names, Sample_Name = test_sample_names)
+  test_repl_pattern <- generate_repl_pattern(test_sample_sheet)
+  expect_length(test_repl_pattern, 2)
+  expect_length(test_repl_pattern$P2, 4)
+})