Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
*
.*

!/catalogs/
!/catalogs/*
Expand All @@ -14,4 +12,6 @@
!/nextflow.config
!/scripts/
!/scripts/**
**/__pycache__/
**/__pycache__/
/scripts/VCF_comparisons/__pycache__
.Rproj.user
70 changes: 65 additions & 5 deletions align_HPRC/filter_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,22 @@ def main(argv=None):
parser.add_argument('--infile', '-i', required=True, help='Input metadata CSV file')
parser.add_argument('--outfile', '-o', required=True, help='Output filtered metadata CSV file')
parser.add_argument('--countfile', '-c', required=True, help='Output file to write count of filtered samples')
parser.add_argument('--num_samples', type=int, default=None, help='Optional maximum number of samples to keep (default: no limit)')
parser.add_argument('--chemistry', type=str, default=None, help='Optional sequencing chemistry to filter by (e.g. "R9" or "R10", default: no chemistry filter)')
parser.add_argument('--min_cov', type=float, default=10.0, help='Minimum coverage threshold (default: 10.0)')
parser.add_argument('--max_cov', type=float, default=1000.0, help='Maximum coverage threshold (default: 1000.0)')
parser.add_argument('--keep_cols', nargs='*', default=['sample_ID', 'coverage', 'path'],
parser.add_argument('--keep_cols', nargs='*', default=['sample_ID', 'coverage', 'path', 'sequencing_chemistry'],
help='List of columns to keep. First three must be sample_ID, coverage, file path, or equivalent.')

args = parser.parse_args(argv)
# sequencing_chemistry column check if values starts with R9 or R10 and ignore additional values after that

filter_metadata(infile=args.infile, outfile=args.outfile, countfile=args.countfile, min_cov=args.min_cov, max_cov=args.max_cov, keep_cols=args.keep_cols)
filter_metadata(infile=args.infile, outfile=args.outfile, countfile=args.countfile, min_cov=args.min_cov, max_cov=args.max_cov,
chemistry=args.chemistry, keep_cols=args.keep_cols, num_samples=args.num_samples)

def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', countfile='filtered_count.txt', min_cov=10.0, max_cov=1000.0, keep_cols=None):

def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', countfile='filtered_count.txt', min_cov=10.0, max_cov=1000.0,
chemistry=None, keep_cols=None, num_samples=None):
"""Filter metadata CSV based on coverage.

Keeps the highest-coverage row per sample where coverage >= min_cov.
Expand All @@ -26,6 +32,51 @@ def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', coun
best sample; otherwise rows without coverage are skipped.
"""
skip_samples = ["HG002", "HG005"]
# Previously downloaded
skip_samples += [
"HG00329",
"HG00408",
"HG00597",
"HG00609",
"HG00639",
"HG01081",
"HG01255",
"HG01346",
"HG01433",
"HG01496",
"HG01786",
"HG01975",
"HG01993",
"HG02004",
"HG02040",
"HG02071",
"HG02132",
"HG02135",
"HG02280",
"HG02293",
"HG02572",
"HG02602",
"HG02630",
"HG02698",
"HG02723",
"HG02922",
"HG02984",
"HG03017",
"HG03209",
"HG03453",
"HG03521",
"HG03710",
"HG03804",
"HG03816",
"HG04157",
"HG04228",
"NA18505",
"NA19682",
"NA19776",
"NA19835",
"NA20346",
"NA21110"
]

newheader = list(keep_cols) + ['readgroup']
if len(newheader) < 3:
Expand All @@ -50,6 +101,7 @@ def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', coun
idx_id = idx_map[id_col]
idx_path = idx_map[path_col]
idx_cov = idx_map[cov_col] if cov_col in header else -1
idx_chemistry = header.index('sequencing_chemistry') if 'sequencing_chemistry' in header else -1

rows = []
for row in reader[1:]:
Expand All @@ -72,6 +124,11 @@ def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', coun
continue
if cov is None or cov < float(min_cov) or cov > float(max_cov):
continue
# Filter by chemistry if specified
if chemistry is not None and idx_chemistry != -1:
chem = (row[idx_chemistry].strip() if idx_chemistry < len(row) else '').strip()
if not chem.startswith(chemistry):
continue
# store the full original row so we can output requested columns
rows.append((sid, cov, path, row))

Expand All @@ -85,7 +142,10 @@ def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', coun
with open(outfile, 'w', newline='') as outf:
writer = csv.writer(outf)
writer.writerow(newheader)
for sid, (cov, row) in best.items():
best_items = list(best.items())
if num_samples is not None:
best_items = best_items[:num_samples]
for sid, (cov, row) in best_items:
cov_out = int(cov) if float(cov).is_integer() else cov
out_vals = []
for col in newheader:
Expand All @@ -109,7 +169,7 @@ def filter_metadata(infile='metadata.csv', outfile='filtered_metadata.csv', coun
writer.writerow(out_vals)

with open(countfile, 'w') as cf:
cf.write(str(len(best)) + "\n")
cf.write(str(len(best_items)) + "\n")

import re

Expand Down
33 changes: 22 additions & 11 deletions align_HPRC/main_align.nf
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ workflow {
ref = file(params.ref, checkIfExists: true)
fai = file("${params.ref}.fai", checkIfExists: true)

// fetch metadata and produce tuples (sample_ID, coverage, path, readgroup)
// fetch metadata and produce tuples (sample_ID, coverage, path, readgroup), current cols:sample_ID,coverage,path,sequencing_chemistry,readgroup
fetch_meta()

// Filter metadata inside a process so `filtered_metadata.csv` is produced and published.
Expand All @@ -50,15 +50,26 @@ workflow {
.map { t -> t[0] }
.flatMap { metaFile ->
def tuples = []
def headerMap = [:]
metaFile.eachLine { line, lineNum ->
if (lineNum == 1) return // skip header
// split into at most 3 parts: sample_id, coverage, path, readgroup
def parts = line.split(',', 4)
if (parts.size() >= 4) {
def sample_id = parts[0].trim()
def coverage = parts[1].trim()
def url = parts[2].trim()
def readgroup = parts[3].trim()
if (lineNum == 1) {
// Parse header to map column names to indices
def headers = line.split(',')
headers.eachWithIndex { header, idx ->
headerMap[header.trim()] = idx
}
return
}
// Parse data rows using column headings
def parts = line.split(',')
if (headerMap['sample_ID'] != null && headerMap['coverage'] != null &&
headerMap['path'] != null && headerMap['readgroup'] != null &&
parts.size() > [headerMap['sample_ID'], headerMap['coverage'],
headerMap['path'], headerMap['readgroup']].max()) {
def sample_id = parts[headerMap['sample_ID']].trim()
def coverage = parts[headerMap['coverage']].trim()
def url = parts[headerMap['path']].trim()
def readgroup = parts[headerMap['readgroup']].trim()
// only emit when we have both id and url
if (sample_id && url) tuples << tuple(sample_id, coverage, url, readgroup)
}
Expand Down Expand Up @@ -161,9 +172,9 @@ process filter_meta {
script:
"""
module load python
python /home/hdashnow@xsede.org/myprojects/git/nf-long-tr/filter_metadata.py \
python /home/hdashnow@xsede.org/myprojects/git/TR-Benchmarking/align_HPRC/filter_metadata.py \
--infile ${metadata_file} --outfile filtered_metadata.csv --countfile filtered_count.txt \
--min_cov 30 --max_cov 1000
--min_cov 25 --max_cov 1000 --chemistry R10 --num_samples 25
"""

}
Expand Down
Loading