dashnowlab · hdashnow · Feb 14, 2026 · Feb 14, 2026 · Feb 19, 2026 · Feb 19, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,3 @@
-*
-.*
 
 !/catalogs/
 !/catalogs/*
@@ -14,4 +12,6 @@
 !/nextflow.config
 !/scripts/
 !/scripts/**
-**/__pycache__/
+**/__pycache__/
+/scripts/VCF_comparisons/__pycache__
+.Rproj.user
diff --git a/align_HPRC/filter_metadata.py b/align_HPRC/filter_metadata.py
@@ -8,16 +8,22 @@ def main(argv=None):
     parser.add_argument('--infile', '-i', required=True, help='Input metadata CSV file')
     parser.add_argument('--outfile', '-o', required=True, help='Output filtered metadata CSV file')
     parser.add_argument('--countfile', '-c', required=True, help='Output file to write count of filtered samples')
+    parser.add_argument('--num_samples', type=int, default=None, help='Optional maximum number of samples to keep (default: no limit)')
+    parser.add_argument('--chemistry', type=str, default=None, help='Optional sequencing chemistry to filter by (e.g. "R9" or "R10", default: no chemistry filter)')
     parser.add_argument('--min_cov', type=float, default=10.0, help='Minimum coverage threshold (default: 10.0)')
     parser.add_argument('--max_cov', type=float, default=1000.0, help='Maximum coverage threshold (default: 1000.0)')
-    parser.add_argument('--keep_cols', nargs='*', default=['sample_ID', 'coverage', 'path'], 
+    parser.add_argument('--keep_cols', nargs='*', default=['sample_ID', 'coverage', 'path', 'sequencing_chemistry'],
                         help='List of columns to keep. First three must be sample_ID, coverage, file path, or equivalent.')
 
     args = parser.parse_args(argv)
+    # sequencing_chemistry column check if values starts with R9 or R10 and ignore additional values after that
 
-    filter_metadata(infile=args.infile, outfile=args.outfile, countfile=args.countfile, min_cov=args.min_cov, max_cov=args.max_cov, keep_cols=args.keep_cols)
+    filter_metadata(infile=args.infile, outfile=args.outfile, countfile=args.countfile, min_cov=args.min_cov, max_cov=args.max_cov,
+                    chemistry=args.chemistry, keep_cols=args.keep_cols, num_samples=args.num_samples)
 
-def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', countfile='filtered_count.txt', min_cov=10.0, max_cov=1000.0, keep_cols=None):
+
+def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', countfile='filtered_count.txt', min_cov=10.0, max_cov=1000.0,
+                    chemistry=None, keep_cols=None, num_samples=None):
     """Filter metadata CSV based on coverage.
 
     Keeps the highest-coverage row per sample where coverage >= min_cov.
@@ -26,6 +32,51 @@ def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', coun
     best sample; otherwise rows without coverage are skipped.
     """
     skip_samples = ["HG002", "HG005"]
+    # Previously downloaded
+    skip_samples += [
+        "HG00329",
+        "HG00408",
+        "HG00597",
+        "HG00609",
+        "HG00639",
+        "HG01081",
+        "HG01255",
+        "HG01346",
+        "HG01433",
+        "HG01496",
+        "HG01786",
+        "HG01975",
+        "HG01993",
+        "HG02004",
+        "HG02040",
+        "HG02071",
+        "HG02132",
+        "HG02135",
+        "HG02280",
+        "HG02293",
+        "HG02572",
+        "HG02602",
+        "HG02630",
+        "HG02698",
+        "HG02723",
+        "HG02922",
+        "HG02984",
+        "HG03017",
+        "HG03209",
+        "HG03453",
+        "HG03521",
+        "HG03710",
+        "HG03804",
+        "HG03816",
+        "HG04157",
+        "HG04228",
+        "NA18505",
+        "NA19682",
+        "NA19776",
+        "NA19835",
+        "NA20346",
+        "NA21110"
+    ]
 
     newheader = list(keep_cols) + ['readgroup']
     if len(newheader) < 3:
@@ -50,6 +101,7 @@ def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', coun
     idx_id = idx_map[id_col]
     idx_path = idx_map[path_col]
     idx_cov = idx_map[cov_col] if cov_col in header else -1
+    idx_chemistry = header.index('sequencing_chemistry') if 'sequencing_chemistry' in header else -1
 
     rows = []
     for row in reader[1:]:
@@ -72,6 +124,11 @@ def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', coun
             continue
         if cov is None or cov < float(min_cov) or cov > float(max_cov):
             continue
+        # Filter by chemistry if specified
+        if chemistry is not None and idx_chemistry != -1:
+            chem = (row[idx_chemistry].strip() if idx_chemistry < len(row) else '').strip()
+            if not chem.startswith(chemistry):
+                continue
         # store the full original row so we can output requested columns
         rows.append((sid, cov, path, row))
 
@@ -85,7 +142,10 @@ def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', coun
     with open(outfile, 'w', newline='') as outf:
         writer = csv.writer(outf)
         writer.writerow(newheader)
-        for sid, (cov, row) in best.items():
+        best_items = list(best.items())
+        if num_samples is not None:
+            best_items = best_items[:num_samples]
+        for sid, (cov, row) in best_items:
             cov_out = int(cov) if float(cov).is_integer() else cov
             out_vals = []
             for col in newheader:
@@ -109,7 +169,7 @@ def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', coun
             writer.writerow(out_vals)
 
     with open(countfile, 'w') as cf:
-        cf.write(str(len(best)) + "\n")
+        cf.write(str(len(best_items)) + "\n")
 
 import re
 

diff --git a/align_HPRC/main_align.nf b/align_HPRC/main_align.nf
@@ -38,7 +38,7 @@ workflow {
     ref = file(params.ref, checkIfExists: true)
     fai = file("${params.ref}.fai", checkIfExists: true)
 
-    // fetch metadata and produce tuples (sample_ID, coverage, path, readgroup)
+    // fetch metadata and produce tuples (sample_ID, coverage, path, readgroup), current cols:sample_ID,coverage,path,sequencing_chemistry,readgroup
     fetch_meta()
 
     // Filter metadata inside a process so `filtered_metadata.csv` is produced and published.
@@ -50,15 +50,26 @@ workflow {
         .map { t -> t[0] }
         .flatMap { metaFile ->
             def tuples = []
+            def headerMap = [:]
             metaFile.eachLine { line, lineNum ->
-                if (lineNum == 1) return // skip header
-                // split into at most 3 parts: sample_id, coverage, path, readgroup
-                def parts = line.split(',', 4)
-                if (parts.size() >= 4) {
-                    def sample_id = parts[0].trim()
-                    def coverage = parts[1].trim()
-                    def url = parts[2].trim()
-                    def readgroup = parts[3].trim()
+                if (lineNum == 1) {
+                    // Parse header to map column names to indices
+                    def headers = line.split(',')
+                    headers.eachWithIndex { header, idx ->
+                        headerMap[header.trim()] = idx
+                    }
+                    return
+                }
+                // Parse data rows using column headings
+                def parts = line.split(',')
+                if (headerMap['sample_ID'] != null && headerMap['coverage'] != null && 
+                    headerMap['path'] != null && headerMap['readgroup'] != null &&
+                    parts.size() > [headerMap['sample_ID'], headerMap['coverage'], 
+                                    headerMap['path'], headerMap['readgroup']].max()) {
+                    def sample_id = parts[headerMap['sample_ID']].trim()
+                    def coverage = parts[headerMap['coverage']].trim()
+                    def url = parts[headerMap['path']].trim()
+                    def readgroup = parts[headerMap['readgroup']].trim()
                     // only emit when we have both id and url
                     if (sample_id && url) tuples << tuple(sample_id, coverage, url, readgroup)
                 }
@@ -161,9 +172,9 @@ process filter_meta {
     script:
     """
     module load python
-    python /home/hdashnow@xsede.org/myprojects/git/nf-long-tr/filter_metadata.py \
+    python /home/hdashnow@xsede.org/myprojects/git/TR-Benchmarking/align_HPRC/filter_metadata.py \
         --infile ${metadata_file} --outfile filtered_metadata.csv --countfile filtered_count.txt \
-        --min_cov 30 --max_cov 1000
+        --min_cov 25 --max_cov 1000 --chemistry R10 --num_samples 25
     """
 
 }