From 68478fe02a2dd547072005c9ec92791fbe2fd46e Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Thu, 2 May 2019 03:15:42 -0700 Subject: [PATCH 1/7] (feature) An ATAC focused aggregation process --- processes/bwa/aggregate/atac.nf | 739 ++++++++++++++++++ .../bwa/aggregate/basic/cutatacfragments.awk | 28 + 2 files changed, 767 insertions(+) create mode 100644 processes/bwa/aggregate/atac.nf create mode 100644 scripts/bwa/aggregate/basic/cutatacfragments.awk diff --git a/processes/bwa/aggregate/atac.nf b/processes/bwa/aggregate/atac.nf new file mode 100644 index 00000000..42f26b6a --- /dev/null +++ b/processes/bwa/aggregate/atac.nf @@ -0,0 +1,739 @@ +params.help = false +params.threads = 1 + +params.UMI = false +params.genome = "" +params.outdir = "output" +params.domotifs = false +params.dofeatures = false + +params.readlength = 36 + +params.hotspot_index = "." + +params.bias = "" +params.chunksize = 5000 + +def helpMessage() { + log.info""" + Usage: nextflow run basic.nf \\ + --genome /path/to/genome \\ + --bams '1.bam,2.bam...' \\ + --UMI true/false \\ + --outdir /path/to/output + + """.stripIndent(); +} + +dataDir = "$baseDir/../../../data" +genome_name = file(params.genome).baseName + +bams = Channel.from( + params.bams.tokenize(',') +).map { + file(it) +}.collect() + + +process merge { + label "modules" + + input: + file 'in*.bam' from bams + + output: + file 'merged.bam' into merged + + publishDir params.outdir + + script: + """ + samtools merge 'merged.bam' in*.bam + """ +} + +// TODO: single end +process dups { + label "modules" + publishDir params.outdir + memory '16 GB' + + input: + file(merged) + + output: + file 'marked.bam' into marked_bam + file 'marked.bam.bai' + file 'MarkDuplicates.picard' + + script: + if (params.UMI) + cmd = "UmiAwareMarkDuplicatesWithMateCigar" + extra = "UMI_TAG_NAME=XD" + if (!params.UMI) + cmd = "MarkDuplicatesWithMateCigar" + extra = "" + """ + picard RevertOriginalBaseQualitiesAndAddMateCigar \ + "INPUT=${merged}" OUTPUT=cigar.bam \ + VALIDATION_STRINGENCY=SILENT RESTORE_ORIGINAL_QUALITIES=false SORT_ORDER=coordinate MAX_RECORDS_TO_EXAMINE=0 + + picard "${cmd}" \ + INPUT=cigar.bam OUTPUT=marked.bam \ + $extra \ + METRICS_FILE=MarkDuplicates.picard ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT \ + READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' \ + MINIMUM_DISTANCE=300 + + samtools index marked.bam + """ +} +marked_bam.into { bam_for_counts; bam_for_adapter_counts; bam_for_filter; bam_for_diff_peaks } + +process filter { + label "modules" + + publishDir params.outdir + + input: + file bam from bam_for_filter + + output: + file "filtered.bam" into filtered_bam + + script: + flag = params.UMI ? 
1536 : 512 + """ + samtools view -b -F "${flag}" marked.bam > filtered.bam + """ +} +filtered_bam.into { bam_for_hotspot2; bam_for_spot_score; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprints} + +process filter_nuclear { + label "modules" + input: + file bam from bam_for_nuclear + file nuclear_chroms from file("${params.genome}.nuclear.txt") + + output: + file 'nuclear.bam' into nuclear_bam + + script: + """ + samtools index "${bam}" + cat "${nuclear_chroms}" \ + | xargs samtools view -b "${bam}" \ + > nuclear.bam + """ +} + +process hotspot2 { + label "modules" + + publishDir "${params.outdir}" + container "fwip/hotspot2:latest" + + input: + file(marked_bam) from bam_for_hotspot2 + file(mappable) from file(params.mappable) + file(chrom_sizes) from file(params.chrom_sizes) + file(centers) from file(params.centers) + + + output: + file('peaks/filtered*') + file('peaks/filtered.hotspots.fdr0.05.starch') into hotspot_calls + file('peaks/filtered.hotspots.fdr0.05.starch') into hotspot_calls_for_bias + file('peaks/filtered.peaks.fdr0.001.starch') into onepercent_peaks + + script: + """ + export TMPDIR=\$(mktemp -d) + hotspot2.sh -F 0.5 -p varWidth_20_default \ + -M "${mappable}" \ + -c "${chrom_sizes}" \ + -C "${centers}" \ + "${marked_bam}" \ + 'peaks' + + cd peaks + + # Rename peaks files to include FDR + mv filtered.peaks.narrowpeaks.starch filtered.peaks.narrowpeaks.fdr0.05.starch + mv filtered.peaks.starch filtered.peaks.fdr0.05.starch + + bash \$STAMPIPES/scripts/SPOT/info.sh \ + filtered.hotspots.fdr0.05.starch hotspot2 filtered.SPOT.txt \ + > filtered.hotspot2.info + + # TODO: Move this to separate process + hsmerge.sh -f 0.01 filtered.allcalls.starch filtered.hotspots.fdr0.01.starch + hsmerge.sh -f 0.001 filtered.allcalls.starch filtered.hotspots.fdr0.001.starch + + density-peaks.bash \$TMPDIR varWidth_20_default filtered.cutcounts.starch filtered.hotspots.fdr0.01.starch ../"${chrom_sizes}" filtered.density.starch filtered.peaks.fdr0.01.starch \$(cat filtered.cleavage.total) + density-peaks.bash \$TMPDIR varWidth_20_default filtered.cutcounts.starch filtered.hotspots.fdr0.001.starch ../"${chrom_sizes}" filtered.density.starch filtered.peaks.fdr0.001.starch \$(cat filtered.cleavage.total) + + rm -rf "\$TMPDIR" + """ + +} + +process spot_score { + label "modules" + publishDir params.outdir + + input: + file(bam) from bam_for_spot_score + file(mappable) from file("${dataDir}/annotations/${genome_name}.K${params.readlength}.mappable_only.bed") + file(chromInfo) from file("${dataDir}/annotations/${genome_name}.chromInfo.bed") + + output: + file 'r1.spot.out' + file 'r1.hotspot.info' + + script: + """ + # random sample + samtools view -h -F 12 -f 3 "$bam" \ + | awk '{if( ! 
index(\$3, "chrM") && \$3 != "chrC" && \$3 != "random"){print}}' \ + | samtools view -uS - \ + > nuclear.bam + bash \$STAMPIPES/scripts/bam/random_sample.sh nuclear.bam subsample.bam 5000000 + samtools view -b -f 0x0040 subsample.bam > r1.bam + + # hotspot + bash \$STAMPIPES/scripts/SPOT/runhotspot.bash \ + \$HOTSPOT_DIR \ + \$PWD \ + \$PWD/r1.bam \ + "${genome_name}" \ + "${params.readlength}" \ + DNaseI + + starch --header r1-both-passes/r1.hotspot.twopass.zscore.wig \ + > r1.spots.starch + + bash \$STAMPIPES/scripts/SPOT/info.sh \ + r1.spots.starch hotspot1 r1.spot.out \ + > r1.hotspot.info + """ +} + +process bam_counts { + label "modules" + publishDir params.outdir + + input: + file(bam) from bam_for_counts + + output: + file('tagcounts.txt') + + script: + """ + python3 \$STAMPIPES/scripts/bwa/bamcounts.py \ + "$bam" \ + tagcounts.txt + """ +} + +process count_adapters { + label "modules" + publishDir params.outdir + + input: + file(bam) from bam_for_adapter_counts + + output: + file('adapter.counts.txt') + + script: + """ + bash "\$STAMPIPES/scripts/bam/count_adapters.sh" "${bam}" \ + | sed 's/^/adapter\t/' \ + > adapter.counts.txt + """ +} + +process preseq { + label "modules" + publishDir params.outdir + input: + file nuclear_bam + + output: + file 'preseq.txt' + file 'preseq_targets.txt' + file 'dups.hist' + + script: + """ + python3 \$STAMPIPES/scripts/bam/mark_dups.py -i "${nuclear_bam}" -o /dev/null --hist dups.hist + preseq lc_extrap -hist dups.hist -extrap 1.001e9 -s 1e6 -v > preseq.txt \ + || preseq lc_extrap -defects -hist dups.hist -extrap 1.001e9 -s 1e6 -v > preseq.txt + + # write out preseq targets + bash "\$STAMPIPES/scripts/utility/preseq_targets.sh" preseq.txt preseq_targets.txt + """ +} + +process cutcounts { + label "modules" + + publishDir params.outdir + + input: + file(fai) from file("${params.genome}.fai") + file(filtered_bam) from bam_for_cutcounts + + output: + file('fragments.starch') + file('cutcounts.starch') + file('cutcounts.bw') + file('cutcounts.bed.bgz') + file('cutcounts.bed.bgz.tbi') + + script: + """ + bam2bed --do-not-sort \ + < "$filtered_bam" \ + | awk -v cutfile=cuts.bed -v fragmentfile=fragments.bed -f \$STAMPIPES/scripts/bwa/aggregate/basic/cutatacfragments.awk + + sort-bed fragments.bed | starch - > fragments.starch + sort-bed cuts.bed | starch - > cuts.starch + + unstarch cuts.starch \ + | cut -f1-3 \ + | bedops -m - \ + | awk '{ for(i = \$2; i < \$3; i += 1) { print \$1"\t"i"\t"i + 1 }}' \ + > allbase.tmp + + unstarch cuts.starch \ + | bedmap --echo --count --delim "\t" allbase.tmp - \ + | awk '{print \$1"\t"\$2"\t"\$3"\tid-"NR"\t"\$4}' \ + | starch - > cutcounts.starch + + # Bigwig + "$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ + cutcounts.starch \ + cutcounts.bw \ + "${fai}" + + # tabix + unstarch cutcounts.starch | bgzip > cutcounts.bed.bgz + tabix -p bed cutcounts.bed.bgz + """ +} + +process density { + label "modules" + + publishDir params.outdir + + input: + file filtered_bam from bam_for_density + file chrom_bucket from file(params.chrom_bucket) + file fai from file("${params.genome}.fai") + + output: + file 'density.starch' + file 'density.bw' + file 'density.bgz' + file 'density.bgz.tbi' + set(file(filtered_bam), file('density.starch')) into to_normalize + + shell: + window_size = 75 + bin_size = 20 + scale = 1_000_000 + ''' + mkfifo density.bed + + bam2bed -d \ + < "!{filtered_bam}" \ + | cut -f1-6 \ + | awk '{ if( $6=="+" ){ s=$2; e=$2+1 } else { s=$3-1; e=$3 } print $1 "\t" s "\t" e "\tid\t" 1 }' \ + | sort-bed - \ + > 
density.bed \ + & + + unstarch "!{chrom_bucket}" \ + | bedmap --faster --echo --count --delim "\t" - density.bed \ + | awk -v "binI=!{bin_size}" -v "win=!{window_size}" \ + 'BEGIN{ halfBin=binI/2; shiftFactor=win-halfBin } { print $1 "\t" $2 + shiftFactor "\t" $3-shiftFactor "\tid\t" i $4}' \ + | starch - \ + > density.starch + + # Bigwig + "$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ + density.starch \ + density.bw \ + "!{fai}" \ + "!{bin_size}" + + # Tabix + unstarch density.starch | bgzip > density.bgz + tabix -p bed density.bgz + ''' + +} + +process normalize_density { + label "modules" + publishDir params.outdir + + input: + set(file(filtered_bam), file(density)) from to_normalize + file(fai) from file("${params.genome}.fai") + + output: + file 'normalized.density.starch' + file 'normalized.density.bw' + file 'normalized.density.bgz' + file 'normalized.density.bgz.tbi' + + shell: + bin_size = 20 + scale = 1_000_000 + ''' + # Normalized density + unstarch density.starch \ + | awk -v allcounts=$(samtools view -c !{filtered_bam}) \ + -v extranuclear_counts=$(samtools view -c "!{filtered_bam}" chrM chrC) \ + -v scale=!{scale} \ + 'BEGIN{ tagcount=allcounts-extranuclear_counts } + { z=$5; + n=(z/tagcount)*scale; + print $1 "\t" $2 "\t" $3 "\t" $4 "\t" n }' \ + | starch - > normalized.density.starch + + "$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ + normalized.density.starch \ + normalized.density.bw \ + "!{fai}" \ + "!{bin_size}" + + unstarch normalized.density.starch | bgzip > normalized.density.bgz + tabix -p bed normalized.density.bgz + ''' + +} + +process insert_sizes { + label "modules" + + publishDir params.outdir + + input: + file nuclear_bam from bam_for_inserts + file nuclear_chroms from file("${params.genome}.nuclear.txt") + + output: + file 'CollectInsertSizeMetrics.picard*' + + script: + """ + picard CollectInsertSizeMetrics \ + "INPUT=${nuclear_bam}" \ + OUTPUT=CollectInsertSizeMetrics.picard \ + HISTOGRAM_FILE=CollectInsertSizeMetrics.picard.pdf \ + VALIDATION_STRINGENCY=LENIENT \ + ASSUME_SORTED=true + + cat CollectInsertSizeMetrics.picard \ + | awk '/## HISTOGRAM/{x=1;next}x' \ + | sed 1d \ + > hist.txt + + python3 "\$STAMPIPES/scripts/utility/picard_inserts_process.py" hist.txt > CollectInsertSizeMetrics.picard.info + """ +} + +process motif_matrix { + label "modules" + + publishDir params.outdir + + input: + file hotspot_calls + file fimo_transfac from file("${dataDir}/motifs/${genome_name}.fimo.starch") + file fimo_names from file("${dataDir}/motifs/${genome_name}.fimo.transfac.names.txt") + + output: + file 'hs_motifs*.txt' + + when: + params.domotifs + + script: + """ + # create sparse motifs + bedmap --echo --echo-map-id --fraction-map 1 --delim '\t' "${hotspot_calls}" "${fimo_transfac}" > temp.bedmap.txt + python \$STAMPIPES/scripts/bwa/aggregate/basic/sparse_motifs.py "${fimo_names}" temp.bedmap.txt + """ +} + +process closest_features { + label "modules" + + publishDir params.outdir + + input: + file hotspot_calls + file transcript_starts from file("${dataDir}/features/${genome_name}.CombinedTxStarts.bed") + val thresholds from "0 1000 2500 5000 10000" + + when: + params.dofeatures + + output: + file 'prox_dist.info' + + script: + """ + closest-features \ + --dist \ + --delim '\t' \ + --closest \ + "${hotspot_calls}" \ + "${transcript_starts}" \ + > closest.txt + cat closest.txt \ + | grep -v "NA\$" \ + | awk -F"\t" '{print \$NF}' \ + | sed -e 's/-//g' \ + > closest.clean.txt + + for t in $thresholds ; do + awk \ + -v t=\$t \ + '\$1 > t {sum+=1} END {print 
"percent-proximal-" t "bp " sum/NR}' \ + closest.clean.txt \ + >> prox_dist.info + done + """ +} + +/* + * Metrics: Hotspot differential index comparison + */ +process differential_hotspots { + label "modules" + + publishDir params.outdir + + input: + file bam from bam_for_diff_peaks + file peaks from onepercent_peaks + file index from file(params.hotspot_index) + + output: + file 'differential_index_report.tsv' + + when: + params.hotspot_index != "." + + shell: + version = (new File(params.hotspot_index)).getAbsoluteFile().getParentFile().getName() + diffName = "dhsindex_${version}_differential_peaks" + diffPerName = "dhsindex_${version}_differential_peaks_percent" + conName = "dhsindex_${version}_constitutive_peaks" + conPerName = "dhsindex_${version}_constitutive_peaks_percent" + + ''' + set -e -o pipefail + statOverlap=$(bedops -e 1 "!{peaks}" "!{index}" | wc -l) + statNoOverlap=$(bedops -n 1 "!{peaks}" "!{index}" | wc -l) + total=$(unstarch "!{peaks}" | wc -l) + statPercOverlap=$(echo "scale=3; $statOverlap * 100.0/$total" | bc -q) + statPercNoOverlap=$(echo "scale=3; $statNoOverlap * 100.0/$total" | bc -q) + + { + echo -e "!{diffName}\t$statNoOverlap" + echo -e "!{diffPerName}\t$statPercNoOverlap" + echo -e "!{conName}\t$statOverlap" + echo -e "!{conPerName}\t$statPercOverlap" + } > differential_index_report.tsv + ''' +} + + +/* + * Footprint calling + */ +process learn_dispersion { + + label "footprints" + publishDir params.outdir + + memory = '8 GB' + cpus = 8 + + when: + params.bias != "" + + input: + file ref from file("${params.genome}.fa") + file bam from bam_for_footprints + //file bai + file spots from hotspot_calls_for_bias + file bias from file(params.bias) + + output: + set file('dm.json'), file(bam), file ("${bam}.bai") into dispersion + + script: + """ + samtools index "$bam" + + # TODO: Use nuclear file + unstarch $spots \ + | grep -v "_random" \ + | grep -v "chrUn" \ + | grep -v "chrM" \ + > intervals.bed + + ftd-learn-dispersion-model \ + --bm $bias \ + --half-win-width 5 \ + --processors 8 \ + $bam \ + $ref \ + intervals.bed \ + > dm.json + """.stripIndent() + +} + +process make_intervals { + + label "footprints" + input: + file starch from hotspot_calls + + output: + file 'chunk_*' into intervals mode flatten + + script: + """ + unstarch "$starch" \ + | grep -v "_random" \ + | grep -v "chrUn" \ + | grep -v "chrM" \ + | split -l "$params.chunksize" -a 4 -d - chunk_ + """.stripIndent() + +} + +process compute_deviation { + + label "footprints" + memory = '8 GB' + cpus = 4 + + input: + set file(interval), file(dispersion), file(bam), file(bai) from intervals.combine(dispersion) + file(bias) from file(params.bias) + file(ref) from file("${params.genome}.fa") + + output: + file 'deviation.out' into deviations + + script: + """ + ftd-compute-deviation \ + --bm "$bias" \ + --half-win-width 5 \ + --smooth-half-win-width 50 \ + --smooth-clip 0.01 \ + --dm "$dispersion" \ + --fdr-shuffle-n 50 \ + --processors 4 \ + "$bam" \ + "$ref" \ + "$interval" \ + | sort --buffer-size=8G -k1,1 -k2,2n \ + > deviation.out + """.stripIndent() +} + +process merge_deviation { + + label "footprints" + memory = "32 GB" + cpus = 1 + + when: + params.bias != "" + + input: + file 'chunk_*' from deviations.collect() + + output: + file 'interval.all.bedgraph' into merged_interval + + script: + """ + echo chunk_* + sort -k1,1 -k2,2n -S 32G -m chunk_* > interval.all.bedgraph + """.stripIndent() +} + +process working_tracks { + + label "footprints" + memory = '32 GB' + cpus = 1 + + publishDir 
params.outdir + + input: + file merged_interval + + output: + file 'interval.all.bedgraph' into bedgraph + file 'interval.all.bedgraph.starch' + file 'interval.all.bedgraph.gz' + file 'interval.all.bedgraph.gz.tbi' + + script: + """ + sort-bed "$merged_interval" | starch - > interval.all.bedgraph.starch + bgzip -c "$merged_interval" > interval.all.bedgraph.gz + tabix -0 -p bed interval.all.bedgraph.gz + """.stripIndent() + +} + +thresholds = Channel.from(0.2, 0.1, 0.05, 0.01, 0.001, 0.0001) + +process compute_footprints { + + label "footprints" + memory = '8 GB' + cpus = 1 + + publishDir params.outdir + + input: + set file(merged_interval), val(threshold) from merged_interval.combine(thresholds) + + output: + file "interval.all.fps.${threshold}.bed.gz" + file "interval.all.fps.${threshold}.bed.gz.tbi" + + script: + """ + output=interval.all.fps.${threshold}.bed + cat "${merged_interval}" \ + | awk -v OFS="\t" -v thresh="${threshold}" '\$8 <= thresh { print \$1, \$2-3, \$3+3; }' \ + | sort-bed --max-mem 8G - \ + | bedops -m - \ + | awk -v OFS="\t" -v thresh="${threshold}" '{ \$4="."; \$5=thresh; print; }' \ + > \$output + + bgzip -c "\$output" > "\$output.gz" + tabix -0 -p bed "\$output.gz" + """.stripIndent() + +} diff --git a/scripts/bwa/aggregate/basic/cutatacfragments.awk b/scripts/bwa/aggregate/basic/cutatacfragments.awk new file mode 100644 index 00000000..2b76c161 --- /dev/null +++ b/scripts/bwa/aggregate/basic/cutatacfragments.awk @@ -0,0 +1,28 @@ +# Run with variables +# cutfile -- BED output file for cuts +# fragmentfile -- BED output file for fragments +# This differs from the normal file by modifying the read start +# and end cuts to match the ATAC protocol +BEGIN{ + FS="\t" + OFS="\t" +}{ + strand=$6; + read_start=$2; + read_end=$3; + read_id=$1; + flag=$7; + tlen=$11; + if( strand == "+" ){ + cut_start = read_start + 4; + cut_end = cut_start + 1; + } else { + cut_start= read_end - 5; + cut_end = cut_start + 1; + } + print read_id, cut_start, cut_end, "id" , "1", strand > cutfile; + if (tlen > 0) { + fragment_end = read_start + tlen; + print read_id, read_start, fragment_end, ".", "." 
> fragmentfile; + } +} From ca746c5cd5b2181669feb0e523c49f03b15f23fa Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Mon, 6 May 2019 06:44:33 -0700 Subject: [PATCH 2/7] Need a non-adapter trimmed version of alignment --- processes/bwa/process_bwa_paired.nf | 520 ++++++++++++++++++++++++++++ 1 file changed, 520 insertions(+) create mode 100755 processes/bwa/process_bwa_paired.nf diff --git a/processes/bwa/process_bwa_paired.nf b/processes/bwa/process_bwa_paired.nf new file mode 100755 index 00000000..d9f890bc --- /dev/null +++ b/processes/bwa/process_bwa_paired.nf @@ -0,0 +1,520 @@ +#!/usr/bin/env nextflow +/* + * This is a proof-of-concept workflow for running our DNase alignment pipeline + */ + +params.help = false +params.threads = 1 +params.chunk_size = 16000000 + +params.UMI = false +params.trim_to = 0 +params.genome = "" +params.r1 = "" +params.r2 = "" +params.outdir = "output" +params.readlength = 36 + +nuclear_chroms = "$params.genome" + ".nuclear.txt" +dataDir = "$baseDir/../../data" + +def helpMessage() { + log.info""" + Usage: nextflow run process_bwa_paired.nf \\ + --r1 r1.fastq.gz \\ + --r2 r2.fastq.gz \\ + --genome /path/to/genome \\ + Options: + --threads [count] The number of threads that will be used for child applications (1) + --chunk_size [count] How many reads to process per chunk (16000000) + --UMI The reads contain UMI markers ('single-strand', 'True') (false) + --trim_to [length] Trim fastq reads to [length] bp (0 for no trimming) (0) + --outdir [dir] Where to write results to (output) + --read_length [length] How long our reads are (36) + """.stripIndent(); +} + +if (params.help || !params.r1 || !params.r2 || !params.genome){ + helpMessage(); + exit 0; +} + +// Some renaming for easier usage later +genome = params.genome +genome_name = file(params.genome).baseName +threads = params.threads + +/* + * Step 0: Split Fastq into chunks + */ +fastq_line_chunks = 4 * params.chunk_size +process split_r1_fastq { + + input: + file(r1) from file(params.r1) + val fastq_line_chunks + + output: + file('split_r1*gz') into split_r1 mode flatten + + script: + """ + zcat $r1 \ + | split -l $fastq_line_chunks \ + --filter='gzip -1 > \$FILE.gz' - 'split_r1' + """ +} +process split_r2_fastq { + input: + file(r2) from file(params.r2) + val fastq_line_chunks + + output: + file('split_r2*gz') into split_r2 mode flatten + + script: + """ + zcat $r2 \ + | split -l $fastq_line_chunks \ + --filter='gzip -1 > \$FILE.gz' - 'split_r2' + """ +} + +/* + * Step 1: Trim to the appropriate length + */ +process trim_to_length { + + input: + file split_r1 + file split_r2 + + output: + set file('r1.trim.fastq.gz'), file('r2.trim.fastq.gz') into trimmed_fastq + + script: + if (params.trim_to != 0) + // TODO: Add padding to length with N's + """ + zcat $split_r1 | awk 'NR%2==0 {print substr(\$0, 1, $params.trim_to)} NR%2!=0' | gzip -c -1 > r1.trim.fastq.gz + zcat $split_r2 | awk 'NR%2==0 {print substr(\$0, 1, $params.trim_to)} NR%2!=0' | gzip -c -1 > r2.trim.fastq.gz + """ + else + """ + ln -s $split_r1 r1.trim.fastq.gz + ln -s $split_r2 r2.trim.fastq.gz + """ + +} + +process add_umi_info { + + input: + set file(r1), file(r2) from trimmed_fastq + + output: + set file('r1.fastq.umi.gz'), file('r2.fastq.umi.gz') into with_umi + + script: + if (params.UMI == 'thruplex') + """ + python3 \$STAMPIPES/scripts/umi/extract_umt.py \ + <(zcat $r1) \ + <(zcat $r2) \ + >(gzip -c -1 > r1.fastq.umi.gz) \ + >(gzip -c -1 > r2.fastq.umi.gz) + """ + else if (params.UMI == 'True') // TODO: Is this right? 
+ """ + python3 \$STAMPIPES/scripts/umi/fastq_umi_add.py $r1 r1.fastq.umi.gz + python3 \$STAMPIPES/scripts/umi/fastq_umi_add.py $r2 r2.fastq.umi.gz + """ + + else if (params.UMI == false || params.UMI == "") + """ + ln -s $r1 r1.fastq.umi.gz + ln -s $r2 r2.fastq.umi.gz + """ + else + error "--UMI must be `thruplex`, `True` (for single-strand preparation), or false" +} + +/* + * Metrics: Fastq counts + */ +process fastq_counts { + + input: + file(r1) from file(params.r1) + file(r2) from file(params.r2) + + output: + file 'fastq.counts' into fastq_counts + + script: + """ + zcat $r1 \ + | awk -v paired=1 -f \$STAMPIPES/awk/illumina_fastq_count.awk \ + > fastq.counts + """ +} + +/* + * Step 2a: Create alignment files + */ +process align { + + cpus params.threads + + input: + set file(trimmed_r1), file(trimmed_r2) from with_umi + + file genome from file(params.genome) + file '*' from file("${params.genome}.amb") + file '*' from file("${params.genome}.ann") + file '*' from file("${params.genome}.bwt") + file '*' from file("${params.genome}.fai") + file '*' from file("${params.genome}.pac") + file '*' from file("${params.genome}.sa") + + + output: + file 'out.bam' into unfiltered_bam + + script: + """ + ls + bwa aln \ + -Y -l 32 -n 0.04 \ + -t "${params.threads}" \ + "$genome" \ + "$trimmed_r1" \ + > out1.sai + + bwa aln \ + -Y -l 32 -n 0.04 \ + -t "$params.threads" \ + "$genome" \ + "$trimmed_r2" \ + > out2.sai + + bwa sampe \ + -n 10 -a 750 \ + "$genome" \ + out1.sai out2.sai \ + "$trimmed_r1" "$trimmed_r2" \ + | samtools view -b -t "$genome".fai - \ + > out.bam + """ + +} + +/* + * Step 2b: filter bam files to have only good reads + */ +process filter_bam { + + input: + file unfiltered_bam + file nuclear_chroms from file(nuclear_chroms) + + output: + file 'filtered.bam' into filtered_bam + + script: + """ + python3 \$STAMPIPES/scripts/bwa/filter_reads.py \ + "$unfiltered_bam" \ + filtered.bam \ + "$nuclear_chroms" + """ +} + +/* + * Step 2c: sort bam files + */ +process sort_bam { + + cpus params.threads + + input: + file filtered_bam + + output: + file 'sorted.bam' into sorted_bam + + script: + """ + samtools sort \ + -l 0 -m 1G -@ "${params.threads}" "$filtered_bam" \ + > sorted.bam + """ +} + +/* + * Step 3: Merge alignments into one big ol' file + */ +process merge_bam { + input: + file 'sorted_bam_*' from sorted_bam.collect() + + output: + file 'merged.bam' into merged_bam + + script: + """ + samtools merge merged.bam sorted_bam* + samtools index merged.bam + """ +} + +/* + * Step 4: Mark duplicates with Picard + */ +process mark_duplicates { + + label "high_mem" + + publishDir params.outdir + + input: + file(merged_bam) from merged_bam + + output: + file 'marked.bam' into marked_bam + file 'marked.bam' into marked_bam_for_counts + file 'MarkDuplicates.picard' + + + script: + if (params.UMI) + """ + picard RevertOriginalBaseQualitiesAndAddMateCigar \ + INPUT=$merged_bam OUTPUT=cigar.bam \ + VALIDATION_STRINGENCY=SILENT RESTORE_ORIGINAL_QUALITIES=false SORT_ORDER=coordinate MAX_RECORDS_TO_EXAMINE=0 + + picard UmiAwareMarkDuplicatesWithMateCigar INPUT=cigar.bam OUTPUT=marked.bam \ + METRICS_FILE=MarkDuplicates.picard UMI_TAG_NAME=XD ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT \ + READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' \ + MINIMUM_DISTANCE=300 + """ + else + """ + picard RevertOriginalBaseQualitiesAndAddMateCigar \ + INPUT=$merged_bam OUTPUT=cigar.bam \ + VALIDATION_STRINGENCY=SILENT RESTORE_ORIGINAL_QUALITIES=false SORT_ORDER=coordinate 
MAX_RECORDS_TO_EXAMINE=0 + + picard MarkDuplicatesWithMateCigar INPUT=cigar.bam OUTPUT=marked.bam \ + METRICS_FILE=MarkDuplicates.picard ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT \ + READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' \ + MINIMUM_DISTANCE=300 + """ +} + +/* + * Step 5: Filter bam file + */ +filter_flag = 512 +if (params.UMI) + filter_flag = 1536 + +process filter_bam_to_unique { + + publishDir params.outdir + + input: + file marked_bam + + output: + set file('filtered.bam'), file('filtered.bam.bai') into uniquely_mapping_bam + + script: + """ + samtools view $marked_bam -b -F $filter_flag > filtered.bam + samtools index filtered.bam + """ + +} + +// Marked bam file is used by several downstream results +uniquely_mapping_bam.into { bam_for_insert; bam_for_spot; bam_for_density } + +/* + * Metrics: bam counts + */ +process bam_counts { + + + input: + file(sorted_bam) from marked_bam_for_counts + + output: + file('bam.counts.txt') into bam_counts + + script: + """ + python3 \$STAMPIPES/scripts/bwa/bamcounts.py \ + "$sorted_bam" \ + bam.counts.txt + """ +} + +/* + * Metrics: Insert size + */ +process insert_size { + + publishDir params.outdir + + input: + set file(bam), file(bai) from bam_for_insert + + output: + file 'CollectInsertSizeMetrics.picard' + file 'CollectInsertSizeMetrics.picard.pdf' + + script: + """ + samtools idxstats "$bam" \ + | cut -f 1 \ + | grep -v chrM \ + | grep -v chrC \ + | xargs samtools view -b "$bam" \ + > nuclear.bam + + picard CollectInsertSizeMetrics \ + INPUT=nuclear.bam \ + OUTPUT=CollectInsertSizeMetrics.picard \ + HISTOGRAM_FILE=CollectInsertSizeMetrics.picard.pdf \ + VALIDATION_STRINGENCY=LENIENT \ + ASSUME_SORTED=true + """ +} + +/* + * Metrics: SPOT score + */ + +process spot_score { + + publishDir params.outdir + + input: + set file(bam), file(bai) from bam_for_spot + file('*') from file("${dataDir}/annotations/${genome_name}.K${params.readlength}.mappable_only.bed") + file('*') from file("${dataDir}/annotations/${genome_name}.chromInfo.bed") + + output: + file 'subsample.r1.spot.out' + file 'spotdups.txt' + + script: + """ + # random sample + samtools view -h -F 12 -f 3 "$bam" \ + | awk '{if( ! 
index(\$3, "chrM") && \$3 != "chrC" && \$3 != "random"){print}}' \ + | samtools view -1 - \ + -o paired.bam + bash \$STAMPIPES/scripts/bam/random_sample.sh paired.bam subsample.bam 5000000 + samtools view -1 -f 0x0040 subsample.bam -o subsample.r1.bam + + # hotspot + bash \$STAMPIPES/scripts/SPOT/runhotspot.bash \ + \$HOTSPOT_DIR \ + \$PWD \ + \$PWD/subsample.r1.bam \ + "${genome_name}" \ + "${params.readlength}" \ + DNaseI + + # Remove existing duplication marks + picard RevertSam \ + INPUT=subsample.bam \ + OUTPUT=clear.bam \ + VALIDATION_STRINGENCY=SILENT REMOVE_DUPLICATE_INFORMATION=true SORT_ORDER=coordinate \ + RESTORE_ORIGINAL_QUALITIES=false REMOVE_ALIGNMENT_INFORMATION=false + + picard MarkDuplicatesWithMateCigar \ + INPUT=clear.bam \ + METRICS_FILE=spotdups.txt \ + OUTPUT=/dev/null \ + ASSUME_SORTED=true \ + MINIMUM_DISTANCE=300 \ + VALIDATION_STRINGENCY=SILENT \ + READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' + """ +} + +/* + * Density tracks (for browser) + */ +win = 75 +bini = 20 +process density_files { + + publishDir params.outdir + + input: + set file(bam), file(bai) from bam_for_density + file fai from file("${params.genome}.fai") + file density_buckets from file( + "$baseDir/../../data/densities/chrom-buckets.${genome_name}.${win}_${bini}.bed.starch" + ) + + output: + file 'density.bed.starch' + file 'density.bw' + file 'density.bed.bgz' + + + script: + """ + bam2bed -d \ + < $bam \ + | cut -f1-6 \ + | awk '{ if( \$6=="+" ){ s=\$2; e=\$2+1 } else { s=\$3-1; e=\$3 } print \$1 "\t" s "\t" e "\tid\t" 1 }' \ + | sort-bed - \ + > sample.bed + + unstarch "${density_buckets}" \ + | bedmap --faster --echo --count --delim "\t" - sample.bed \ + | awk -v binI=$bini -v win=$win \ + 'BEGIN{ halfBin=binI/2; shiftFactor=win-halfBin } { print \$1 "\t" \$2 + shiftFactor "\t" \$3-shiftFactor "\tid\t" i \$4}' \ + | starch - \ + > density.bed.starch + + unstarch density.bed.starch | awk -v binI=$bini -f \$STAMPIPES/awk/bedToWig.awk > density.wig + + wigToBigWig -clip density.wig "${fai}" density.bw + + + unstarch density.bed.starch | bgzip > density.bed.bgz + tabix -p bed density.bed.bgz + """ +} + +/* + * Metrics: total counts + */ +process total_counts { + + publishDir params.outdir + + input: + file 'fastqcounts*' from fastq_counts.collect() + file 'bamcounts*' from bam_counts.collect() + + output: + file 'tagcounts.txt' + + script: + """ + cat *counts* \ + | awk ' + { x[\$1] += \$2 } + END {for (i in x) print i "\t" x[i]} + ' \ + | sort -k 1,1 \ + > tagcounts.txt + """ +} From b928bf54e729a0259daa83291fec14170f9f4d78 Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Wed, 3 Jul 2019 12:06:49 -0700 Subject: [PATCH 3/7] Remove unneeded processes from ATAC nextflow --- processes/bwa/aggregate/atac.nf | 289 -------------------------------- 1 file changed, 289 deletions(-) diff --git a/processes/bwa/aggregate/atac.nf b/processes/bwa/aggregate/atac.nf index 42f26b6a..02923d22 100644 --- a/processes/bwa/aggregate/atac.nf +++ b/processes/bwa/aggregate/atac.nf @@ -448,292 +448,3 @@ process insert_sizes { python3 "\$STAMPIPES/scripts/utility/picard_inserts_process.py" hist.txt > CollectInsertSizeMetrics.picard.info """ } - -process motif_matrix { - label "modules" - - publishDir params.outdir - - input: - file hotspot_calls - file fimo_transfac from file("${dataDir}/motifs/${genome_name}.fimo.starch") - file fimo_names from file("${dataDir}/motifs/${genome_name}.fimo.transfac.names.txt") - - output: - file 'hs_motifs*.txt' - - when: - params.domotifs - - 
script: - """ - # create sparse motifs - bedmap --echo --echo-map-id --fraction-map 1 --delim '\t' "${hotspot_calls}" "${fimo_transfac}" > temp.bedmap.txt - python \$STAMPIPES/scripts/bwa/aggregate/basic/sparse_motifs.py "${fimo_names}" temp.bedmap.txt - """ -} - -process closest_features { - label "modules" - - publishDir params.outdir - - input: - file hotspot_calls - file transcript_starts from file("${dataDir}/features/${genome_name}.CombinedTxStarts.bed") - val thresholds from "0 1000 2500 5000 10000" - - when: - params.dofeatures - - output: - file 'prox_dist.info' - - script: - """ - closest-features \ - --dist \ - --delim '\t' \ - --closest \ - "${hotspot_calls}" \ - "${transcript_starts}" \ - > closest.txt - cat closest.txt \ - | grep -v "NA\$" \ - | awk -F"\t" '{print \$NF}' \ - | sed -e 's/-//g' \ - > closest.clean.txt - - for t in $thresholds ; do - awk \ - -v t=\$t \ - '\$1 > t {sum+=1} END {print "percent-proximal-" t "bp " sum/NR}' \ - closest.clean.txt \ - >> prox_dist.info - done - """ -} - -/* - * Metrics: Hotspot differential index comparison - */ -process differential_hotspots { - label "modules" - - publishDir params.outdir - - input: - file bam from bam_for_diff_peaks - file peaks from onepercent_peaks - file index from file(params.hotspot_index) - - output: - file 'differential_index_report.tsv' - - when: - params.hotspot_index != "." - - shell: - version = (new File(params.hotspot_index)).getAbsoluteFile().getParentFile().getName() - diffName = "dhsindex_${version}_differential_peaks" - diffPerName = "dhsindex_${version}_differential_peaks_percent" - conName = "dhsindex_${version}_constitutive_peaks" - conPerName = "dhsindex_${version}_constitutive_peaks_percent" - - ''' - set -e -o pipefail - statOverlap=$(bedops -e 1 "!{peaks}" "!{index}" | wc -l) - statNoOverlap=$(bedops -n 1 "!{peaks}" "!{index}" | wc -l) - total=$(unstarch "!{peaks}" | wc -l) - statPercOverlap=$(echo "scale=3; $statOverlap * 100.0/$total" | bc -q) - statPercNoOverlap=$(echo "scale=3; $statNoOverlap * 100.0/$total" | bc -q) - - { - echo -e "!{diffName}\t$statNoOverlap" - echo -e "!{diffPerName}\t$statPercNoOverlap" - echo -e "!{conName}\t$statOverlap" - echo -e "!{conPerName}\t$statPercOverlap" - } > differential_index_report.tsv - ''' -} - - -/* - * Footprint calling - */ -process learn_dispersion { - - label "footprints" - publishDir params.outdir - - memory = '8 GB' - cpus = 8 - - when: - params.bias != "" - - input: - file ref from file("${params.genome}.fa") - file bam from bam_for_footprints - //file bai - file spots from hotspot_calls_for_bias - file bias from file(params.bias) - - output: - set file('dm.json'), file(bam), file ("${bam}.bai") into dispersion - - script: - """ - samtools index "$bam" - - # TODO: Use nuclear file - unstarch $spots \ - | grep -v "_random" \ - | grep -v "chrUn" \ - | grep -v "chrM" \ - > intervals.bed - - ftd-learn-dispersion-model \ - --bm $bias \ - --half-win-width 5 \ - --processors 8 \ - $bam \ - $ref \ - intervals.bed \ - > dm.json - """.stripIndent() - -} - -process make_intervals { - - label "footprints" - input: - file starch from hotspot_calls - - output: - file 'chunk_*' into intervals mode flatten - - script: - """ - unstarch "$starch" \ - | grep -v "_random" \ - | grep -v "chrUn" \ - | grep -v "chrM" \ - | split -l "$params.chunksize" -a 4 -d - chunk_ - """.stripIndent() - -} - -process compute_deviation { - - label "footprints" - memory = '8 GB' - cpus = 4 - - input: - set file(interval), file(dispersion), file(bam), file(bai) from 
intervals.combine(dispersion) - file(bias) from file(params.bias) - file(ref) from file("${params.genome}.fa") - - output: - file 'deviation.out' into deviations - - script: - """ - ftd-compute-deviation \ - --bm "$bias" \ - --half-win-width 5 \ - --smooth-half-win-width 50 \ - --smooth-clip 0.01 \ - --dm "$dispersion" \ - --fdr-shuffle-n 50 \ - --processors 4 \ - "$bam" \ - "$ref" \ - "$interval" \ - | sort --buffer-size=8G -k1,1 -k2,2n \ - > deviation.out - """.stripIndent() -} - -process merge_deviation { - - label "footprints" - memory = "32 GB" - cpus = 1 - - when: - params.bias != "" - - input: - file 'chunk_*' from deviations.collect() - - output: - file 'interval.all.bedgraph' into merged_interval - - script: - """ - echo chunk_* - sort -k1,1 -k2,2n -S 32G -m chunk_* > interval.all.bedgraph - """.stripIndent() -} - -process working_tracks { - - label "footprints" - memory = '32 GB' - cpus = 1 - - publishDir params.outdir - - input: - file merged_interval - - output: - file 'interval.all.bedgraph' into bedgraph - file 'interval.all.bedgraph.starch' - file 'interval.all.bedgraph.gz' - file 'interval.all.bedgraph.gz.tbi' - - script: - """ - sort-bed "$merged_interval" | starch - > interval.all.bedgraph.starch - bgzip -c "$merged_interval" > interval.all.bedgraph.gz - tabix -0 -p bed interval.all.bedgraph.gz - """.stripIndent() - -} - -thresholds = Channel.from(0.2, 0.1, 0.05, 0.01, 0.001, 0.0001) - -process compute_footprints { - - label "footprints" - memory = '8 GB' - cpus = 1 - - publishDir params.outdir - - input: - set file(merged_interval), val(threshold) from merged_interval.combine(thresholds) - - output: - file "interval.all.fps.${threshold}.bed.gz" - file "interval.all.fps.${threshold}.bed.gz.tbi" - - script: - """ - output=interval.all.fps.${threshold}.bed - cat "${merged_interval}" \ - | awk -v OFS="\t" -v thresh="${threshold}" '\$8 <= thresh { print \$1, \$2-3, \$3+3; }' \ - | sort-bed --max-mem 8G - \ - | bedops -m - \ - | awk -v OFS="\t" -v thresh="${threshold}" '{ \$4="."; \$5=thresh; print; }' \ - > \$output - - bgzip -c "\$output" > "\$output.gz" - tabix -0 -p bed "\$output.gz" - """.stripIndent() - -} From 6d41982a0b034cd1d333483087f44e5ab5614baf Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Mon, 8 Jul 2019 22:24:32 -0700 Subject: [PATCH 4/7] intermediate tracking commit to use to compare to new changes --- processes/bwa/aggregate/atac.nf | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/processes/bwa/aggregate/atac.nf b/processes/bwa/aggregate/atac.nf index 02923d22..69d2ed13 100644 --- a/processes/bwa/aggregate/atac.nf +++ b/processes/bwa/aggregate/atac.nf @@ -107,7 +107,32 @@ process filter { samtools view -b -F "${flag}" marked.bam > filtered.bam """ } -filtered_bam.into { bam_for_hotspot2; bam_for_spot_score; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprints} +filtered_bam.into { bam_for_shift; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprints } + +/* +https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3959825/ +For peak calling and footprinting, we adjusted the read start sites to represent the center of the transposon binding event. +Previous descriptions of the Tn5 transposase show that the transposon binds as a dimer and inserts two adapters separated by +9 bps (main text ref. 11). Therefore, all reads aligning to the + strand were offset by +4 bps, and all reads +aligning to the – strand were offset −5 bps. 
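In other words, the two offsets sum to the 9 bp adapter spacing: for example, a + strand read
starting at position p and a – strand read whose 5' end lies at p+9 are both assigned to p+4,
so cuts from either strand of the same transposition event report the same center coordinate.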
+*/ +process shift { + label "modules" + + publishDir params.outdir + + input: + file filtered_bam from bam_for_shift + + output: + file "shifted.bam" into shifted_bam + + script: + """ + python3 \$STAMPIPES/scripts/bwa/aggregate/basic/shiftatacbam.py < ${filtered_bam} > shifted.bam + """ +} +shifted_bam.into { bam_for_hotspot2; bam_for_spot_score } process filter_nuclear { label "modules" From 88d82f11f221dd83fbdefde3ab22cc72c1714096 Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Thu, 11 Jul 2019 16:37:11 -0700 Subject: [PATCH 5/7] (fix) Update ATAC process with latest aggregation process --- processes/bwa/aggregate/atac.nf | 87 +++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 20 deletions(-) diff --git a/processes/bwa/aggregate/atac.nf b/processes/bwa/aggregate/atac.nf index 69d2ed13..d575fa17 100644 --- a/processes/bwa/aggregate/atac.nf +++ b/processes/bwa/aggregate/atac.nf @@ -9,11 +9,15 @@ params.dofeatures = false params.readlength = 36 -params.hotspot_index = "." +params.peakcaller = "hotspot2" params.bias = "" params.chunksize = 5000 +params.hotspot_id = "default" +params.hotspot_index = "." + + def helpMessage() { log.info""" Usage: nextflow run basic.nf \\ @@ -56,7 +60,7 @@ process merge { process dups { label "modules" publishDir params.outdir - memory '16 GB' + label 'high_mem' input: file(merged) @@ -107,7 +111,7 @@ process filter { samtools view -b -F "${flag}" marked.bam > filtered.bam """ } -filtered_bam.into { bam_for_shift; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprints } +filtered_bam.into { bam_for_shift; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprint; } /* https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3959825/ @@ -119,20 +123,33 @@ aligning to the – strand were offset −5 bps. 
process shift { label "modules" + cpus params.threads publishDir params.outdir input: file filtered_bam from bam_for_shift + file nuclear_chroms from file("${params.genome}.nuclear.txt") output: file "shifted.bam" into shifted_bam + file "nuclear_shifted.bam" into nuclear_shifted_bam script: """ - python3 \$STAMPIPES/scripts/bwa/aggregate/basic/shiftatacbam.py < ${filtered_bam} > shifted.bam + python3 \$STAMPIPES/scripts/bwa/aggregate/basic/shiftatacbam.py < ${filtered_bam} > unsorted_shifted.bam + # we have to resort after our shifting + samtools sort \ + -l 0 -m 1G -@ "${params.threads}" unsorted_shifted.bam \ + > shifted.bam + samtools index shifted.bam + # we also need a nuclear version + cat "${nuclear_chroms}" \ + | xargs samtools view -b shifted.bam \ + > nuclear_shifted.bam + samtools index nuclear_shifted.bam """ } -shifted_bam.into { bam_for_hotspot2; bam_for_spot_score } +nuclear_shifted_bam.into { bam_for_hotspot2; bam_for_macs2; bam_for_spot_score; } process filter_nuclear { label "modules" @@ -152,51 +169,80 @@ process filter_nuclear { """ } +process macs2 { + label "macs2" + publishDir "${params.outdir}/peaks_macs2" + scratch false + + when: + params.peakcaller == "macs2" + + input: + file bam from bam_for_macs2 + + output: + file 'NA_*' + + script: + """ + macs2 callpeak \ + -t "$bam" \ + -f BAMPE \ + -g hs \ + -B \ + -q 0.01 + """ +} + process hotspot2 { label "modules" publishDir "${params.outdir}" container "fwip/hotspot2:latest" + when: + params.peakcaller == "hotspot2" + input: - file(marked_bam) from bam_for_hotspot2 + val(hotspotid) from params.hotspot_id + file(nuclear) from bam_for_hotspot2 file(mappable) from file(params.mappable) file(chrom_sizes) from file(params.chrom_sizes) file(centers) from file(params.centers) output: - file('peaks/filtered*') - file('peaks/filtered.hotspots.fdr0.05.starch') into hotspot_calls - file('peaks/filtered.hotspots.fdr0.05.starch') into hotspot_calls_for_bias - file('peaks/filtered.peaks.fdr0.001.starch') into onepercent_peaks + file('peaks/nuclear_shifted*') + file('peaks/nuclear_shifted.hotspots.fdr0.05.starch') into hotspot_calls + file('peaks/nuclear_shifted.hotspots.fdr0.05.starch') into hotspot_calls_for_bias + file('peaks/nuclear_shifted.peaks.fdr0.001.starch') into onepercent_peaks script: """ export TMPDIR=\$(mktemp -d) - hotspot2.sh -F 0.5 -p varWidth_20_default \ + hotspot2.sh -F 0.5 -p "varWidth_20_${hotspotid}" \ -M "${mappable}" \ -c "${chrom_sizes}" \ -C "${centers}" \ - "${marked_bam}" \ + "${nuclear}" \ 'peaks' cd peaks # Rename peaks files to include FDR - mv filtered.peaks.narrowpeaks.starch filtered.peaks.narrowpeaks.fdr0.05.starch - mv filtered.peaks.starch filtered.peaks.fdr0.05.starch + mv nuclear_shifted.peaks.narrowpeaks.starch nuclear_shifted.peaks.narrowpeaks.fdr0.05.starch + mv nuclear_shifted.peaks.starch nuclear_shifted.peaks.fdr0.05.starch bash \$STAMPIPES/scripts/SPOT/info.sh \ - filtered.hotspots.fdr0.05.starch hotspot2 filtered.SPOT.txt \ - > filtered.hotspot2.info + nuclear_shifted.hotspots.fdr0.05.starch hotspot2 nuclear_shifted.SPOT.txt \ + > nuclear_shifted.hotspot2.info # TODO: Move this to separate process - hsmerge.sh -f 0.01 filtered.allcalls.starch filtered.hotspots.fdr0.01.starch - hsmerge.sh -f 0.001 filtered.allcalls.starch filtered.hotspots.fdr0.001.starch + hsmerge.sh -f 0.01 nuclear_shifted.allcalls.starch nuclear_shifted.hotspots.fdr0.01.starch + hsmerge.sh -f 0.001 nuclear_shifted.allcalls.starch nuclear_shifted.hotspots.fdr0.001.starch - density-peaks.bash \$TMPDIR 
varWidth_20_default filtered.cutcounts.starch filtered.hotspots.fdr0.01.starch ../"${chrom_sizes}" filtered.density.starch filtered.peaks.fdr0.01.starch \$(cat filtered.cleavage.total) - density-peaks.bash \$TMPDIR varWidth_20_default filtered.cutcounts.starch filtered.hotspots.fdr0.001.starch ../"${chrom_sizes}" filtered.density.starch filtered.peaks.fdr0.001.starch \$(cat filtered.cleavage.total) + density-peaks.bash \$TMPDIR "varWidth_20_${hotspotid}" nuclear_shifted.cutcounts.starch nuclear_shifted.hotspots.fdr0.01.starch ../"${chrom_sizes}" nuclear_shifted.density.starch nuclear_shifted.peaks.fdr0.01.starch \$(cat nuclear_shifted.cleavage.total) + density-peaks.bash \$TMPDIR "varWidth_20_${hotspotid}" nuclear_shifted.cutcounts.starch nuclear_shifted.hotspots.fdr0.001.starch ../"${chrom_sizes}" nuclear_shifted.density.starch nuclear_shifted.peaks.fdr0.001.starch \$(cat nuclear_shifted.cleavage.total) rm -rf "\$TMPDIR" """ @@ -354,6 +400,7 @@ process density { label "modules" publishDir params.outdir + label 'high_mem' input: file filtered_bam from bam_for_density From a107634db37784a0c749101a258cb8c83b42c5c7 Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Fri, 12 Jul 2019 14:22:55 -0700 Subject: [PATCH 6/7] (fix) Add SPOT dup score to aggregate process --- processes/bwa/aggregate/atac.nf | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/processes/bwa/aggregate/atac.nf b/processes/bwa/aggregate/atac.nf index d575fa17..c4832444 100644 --- a/processes/bwa/aggregate/atac.nf +++ b/processes/bwa/aggregate/atac.nf @@ -261,6 +261,7 @@ process spot_score { output: file 'r1.spot.out' file 'r1.hotspot.info' + file 'spotdups.picard' script: """ @@ -287,6 +288,31 @@ process spot_score { bash \$STAMPIPES/scripts/SPOT/info.sh \ r1.spots.starch hotspot1 r1.spot.out \ > r1.hotspot.info + + # Create the SPOT dup score + # RevertSam will barf if it's not sorted exactly how it likes apparently? + # Exception in thread "main" java.lang.IllegalArgumentException: Alignments added + # out of order in SAMFileWriterImpl.addAlignment for /tmp/nxf.ZbudE8UGBY/clear.bam. + # Sort order is coordinate. 
Offending records are at [chr1:195371075] and [chr10:3100064] + picard SortSam \ + INPUT=subsample.bam \ + OUTPUT=sorted_subsample.bam \ + SORT_ORDER=coordinate + # Remove existing duplication marks + picard RevertSam \ + INPUT=sorted_subsample.bam \ + OUTPUT=clear.bam \ + VALIDATION_STRINGENCY=SILENT REMOVE_DUPLICATE_INFORMATION=true SORT_ORDER=coordinate \ + RESTORE_ORIGINAL_QUALITIES=false REMOVE_ALIGNMENT_INFORMATION=false + # Make the dup scare + picard MarkDuplicatesWithMateCigar \ + INPUT=clear.bam \ + METRICS_FILE=spotdups.picard \ + OUTPUT=/dev/null \ + ASSUME_SORTED=true \ + MINIMUM_DISTANCE=300 \ + VALIDATION_STRINGENCY=SILENT \ + READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' """ } From aad095b387560b831dfb6e15a754fd3a8bf1094b Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Wed, 17 Jul 2019 08:11:37 -0700 Subject: [PATCH 7/7] Add shift bam python process --- scripts/bwa/aggregate/basic/shiftatacbam.py | 47 +++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 scripts/bwa/aggregate/basic/shiftatacbam.py diff --git a/scripts/bwa/aggregate/basic/shiftatacbam.py b/scripts/bwa/aggregate/basic/shiftatacbam.py new file mode 100644 index 00000000..7a5de163 --- /dev/null +++ b/scripts/bwa/aggregate/basic/shiftatacbam.py @@ -0,0 +1,47 @@ +""" +https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3959825/ +For peak calling and footprinting, we adjusted the read start sites to represent the center of the transposon binding event. +Previous descriptions of the Tn5 transposase show that the transposon binds as a dimer and inserts two adapters separated by +9 bps (main text ref. 11). Therefore, all reads aligning to the + strand were offset by +4 bps, and all reads +aligning to the – strand were offset −5 bps. + +python shiftbam.py < marked.bam > shifted.bam 2> shifted.log +""" + +import pysam +import csv +import sys + +def shift_bam(): + """ + Because of vagaries with pysam, this uses stdin and stdout: + https://pysam.readthedocs.io/en/latest/usage.html#using-streams + """ + infile = pysam.AlignmentFile("-", "rb") + outfile = pysam.AlignmentFile("-", "wb", template=infile) + + for n, read in enumerate(infile): + original_start = read.reference_start + # make the changes in the reference start + if read.is_reverse: + read.reference_start -= 5 + else: + read.reference_start += 4 + + # we can't go below 0 + if read.reference_start < 0: + print("Adjusted {} from {} to {} in {}, but setting to 0".format( + read.query_name, original_start, read.reference_start, read.reference_name), file=sys.stderr) + read.reference_start = 0 + # ended up not using inferred length because sometimes it is None? + if read.reference_start + read.query_length > infile.lengths[read.reference_id]: + new_location = infile.lengths[read.reference_id] - read.query_length + print("Adjusted {} from {} to {} in {}, but setting to {}".format( + read.query_name, original_start, read.reference_start, read.reference_name, new_location), file=sys.stderr) + read.reference_start = new_location + + outfile.write(read) + +if __name__ == "__main__": + + shift_bam()
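
As a quick, standalone check of the +4/−5 adjustment and the boundary clamping that
shiftatacbam.py performs (the helper below is illustrative only and is not part of the patch
series; the coordinates are made up), the same rule can be expressed and exercised in a few
lines of plain Python:

def shifted_start(reference_start, query_length, is_reverse, chrom_length):
    """Tn5 cut-site adjustment used above: +4 bp for + strand reads, -5 bp for
    - strand reads, clamped so the read never falls outside the chromosome."""
    new_start = reference_start - 5 if is_reverse else reference_start + 4
    new_start = max(new_start, 0)                    # can't move before the chromosome start
    if new_start + query_length > chrom_length:      # can't run off the chromosome end
        new_start = chrom_length - query_length
    return new_start

# + strand read: start moves 4 bp downstream
assert shifted_start(1000, 36, is_reverse=False, chrom_length=10000) == 1004
# - strand read: the whole alignment moves 5 bp upstream
assert shifted_start(1000, 36, is_reverse=True, chrom_length=10000) == 995
# reads near either chromosome edge are clamped, as in shift_bam()
assert shifted_start(2, 36, is_reverse=True, chrom_length=10000) == 0
assert shifted_start(9998, 36, is_reverse=False, chrom_length=10000) == 10000 - 36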