From 68478fe02a2dd547072005c9ec92791fbe2fd46e Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Thu, 2 May 2019 03:15:42 -0700 Subject: [PATCH 1/7] (feature) An ATAC focused aggregation process --- processes/bwa/aggregate/atac.nf | 739 ++++++++++++++++++ .../bwa/aggregate/basic/cutatacfragments.awk | 28 + 2 files changed, 767 insertions(+) create mode 100644 processes/bwa/aggregate/atac.nf create mode 100644 scripts/bwa/aggregate/basic/cutatacfragments.awk diff --git a/processes/bwa/aggregate/atac.nf b/processes/bwa/aggregate/atac.nf new file mode 100644 index 00000000..42f26b6a --- /dev/null +++ b/processes/bwa/aggregate/atac.nf @@ -0,0 +1,739 @@ +params.help = false +params.threads = 1 + +params.UMI = false +params.genome = "" +params.outdir = "output" +params.domotifs = false +params.dofeatures = false + +params.readlength = 36 + +params.hotspot_index = "." + +params.bias = "" +params.chunksize = 5000 + +def helpMessage() { + log.info""" + Usage: nextflow run basic.nf \\ + --genome /path/to/genome \\ + --bams '1.bam,2.bam...' \\ + --UMI true/false \\ + --outdir /path/to/output + + """.stripIndent(); +} + +dataDir = "$baseDir/../../../data" +genome_name = file(params.genome).baseName + +bams = Channel.from( + params.bams.tokenize(',') +).map { + file(it) +}.collect() + + +process merge { + label "modules" + + input: + file 'in*.bam' from bams + + output: + file 'merged.bam' into merged + + publishDir params.outdir + + script: + """ + samtools merge 'merged.bam' in*.bam + """ +} + +// TODO: single end +process dups { + label "modules" + publishDir params.outdir + memory '16 GB' + + input: + file(merged) + + output: + file 'marked.bam' into marked_bam + file 'marked.bam.bai' + file 'MarkDuplicates.picard' + + script: + if (params.UMI) + cmd = "UmiAwareMarkDuplicatesWithMateCigar" + extra = "UMI_TAG_NAME=XD" + if (!params.UMI) + cmd = "MarkDuplicatesWithMateCigar" + extra = "" + """ + picard RevertOriginalBaseQualitiesAndAddMateCigar \ + "INPUT=${merged}" OUTPUT=cigar.bam \ + VALIDATION_STRINGENCY=SILENT RESTORE_ORIGINAL_QUALITIES=false SORT_ORDER=coordinate MAX_RECORDS_TO_EXAMINE=0 + + picard "${cmd}" \ + INPUT=cigar.bam OUTPUT=marked.bam \ + $extra \ + METRICS_FILE=MarkDuplicates.picard ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT \ + READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' \ + MINIMUM_DISTANCE=300 + + samtools index marked.bam + """ +} +marked_bam.into { bam_for_counts; bam_for_adapter_counts; bam_for_filter; bam_for_diff_peaks } + +process filter { + label "modules" + + publishDir params.outdir + + input: + file bam from bam_for_filter + + output: + file "filtered.bam" into filtered_bam + + script: + flag = params.UMI ? 
1536 : 512 + """ + samtools view -b -F "${flag}" marked.bam > filtered.bam + """ +} +filtered_bam.into { bam_for_hotspot2; bam_for_spot_score; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprints} + +process filter_nuclear { + label "modules" + input: + file bam from bam_for_nuclear + file nuclear_chroms from file("${params.genome}.nuclear.txt") + + output: + file 'nuclear.bam' into nuclear_bam + + script: + """ + samtools index "${bam}" + cat "${nuclear_chroms}" \ + | xargs samtools view -b "${bam}" \ + > nuclear.bam + """ +} + +process hotspot2 { + label "modules" + + publishDir "${params.outdir}" + container "fwip/hotspot2:latest" + + input: + file(marked_bam) from bam_for_hotspot2 + file(mappable) from file(params.mappable) + file(chrom_sizes) from file(params.chrom_sizes) + file(centers) from file(params.centers) + + + output: + file('peaks/filtered*') + file('peaks/filtered.hotspots.fdr0.05.starch') into hotspot_calls + file('peaks/filtered.hotspots.fdr0.05.starch') into hotspot_calls_for_bias + file('peaks/filtered.peaks.fdr0.001.starch') into onepercent_peaks + + script: + """ + export TMPDIR=\$(mktemp -d) + hotspot2.sh -F 0.5 -p varWidth_20_default \ + -M "${mappable}" \ + -c "${chrom_sizes}" \ + -C "${centers}" \ + "${marked_bam}" \ + 'peaks' + + cd peaks + + # Rename peaks files to include FDR + mv filtered.peaks.narrowpeaks.starch filtered.peaks.narrowpeaks.fdr0.05.starch + mv filtered.peaks.starch filtered.peaks.fdr0.05.starch + + bash \$STAMPIPES/scripts/SPOT/info.sh \ + filtered.hotspots.fdr0.05.starch hotspot2 filtered.SPOT.txt \ + > filtered.hotspot2.info + + # TODO: Move this to separate process + hsmerge.sh -f 0.01 filtered.allcalls.starch filtered.hotspots.fdr0.01.starch + hsmerge.sh -f 0.001 filtered.allcalls.starch filtered.hotspots.fdr0.001.starch + + density-peaks.bash \$TMPDIR varWidth_20_default filtered.cutcounts.starch filtered.hotspots.fdr0.01.starch ../"${chrom_sizes}" filtered.density.starch filtered.peaks.fdr0.01.starch \$(cat filtered.cleavage.total) + density-peaks.bash \$TMPDIR varWidth_20_default filtered.cutcounts.starch filtered.hotspots.fdr0.001.starch ../"${chrom_sizes}" filtered.density.starch filtered.peaks.fdr0.001.starch \$(cat filtered.cleavage.total) + + rm -rf "\$TMPDIR" + """ + +} + +process spot_score { + label "modules" + publishDir params.outdir + + input: + file(bam) from bam_for_spot_score + file(mappable) from file("${dataDir}/annotations/${genome_name}.K${params.readlength}.mappable_only.bed") + file(chromInfo) from file("${dataDir}/annotations/${genome_name}.chromInfo.bed") + + output: + file 'r1.spot.out' + file 'r1.hotspot.info' + + script: + """ + # random sample + samtools view -h -F 12 -f 3 "$bam" \ + | awk '{if( ! 
index(\$3, "chrM") && \$3 != "chrC" && \$3 != "random"){print}}' \ + | samtools view -uS - \ + > nuclear.bam + bash \$STAMPIPES/scripts/bam/random_sample.sh nuclear.bam subsample.bam 5000000 + samtools view -b -f 0x0040 subsample.bam > r1.bam + + # hotspot + bash \$STAMPIPES/scripts/SPOT/runhotspot.bash \ + \$HOTSPOT_DIR \ + \$PWD \ + \$PWD/r1.bam \ + "${genome_name}" \ + "${params.readlength}" \ + DNaseI + + starch --header r1-both-passes/r1.hotspot.twopass.zscore.wig \ + > r1.spots.starch + + bash \$STAMPIPES/scripts/SPOT/info.sh \ + r1.spots.starch hotspot1 r1.spot.out \ + > r1.hotspot.info + """ +} + +process bam_counts { + label "modules" + publishDir params.outdir + + input: + file(bam) from bam_for_counts + + output: + file('tagcounts.txt') + + script: + """ + python3 \$STAMPIPES/scripts/bwa/bamcounts.py \ + "$bam" \ + tagcounts.txt + """ +} + +process count_adapters { + label "modules" + publishDir params.outdir + + input: + file(bam) from bam_for_adapter_counts + + output: + file('adapter.counts.txt') + + script: + """ + bash "\$STAMPIPES/scripts/bam/count_adapters.sh" "${bam}" \ + | sed 's/^/adapter\t/' \ + > adapter.counts.txt + """ +} + +process preseq { + label "modules" + publishDir params.outdir + input: + file nuclear_bam + + output: + file 'preseq.txt' + file 'preseq_targets.txt' + file 'dups.hist' + + script: + """ + python3 \$STAMPIPES/scripts/bam/mark_dups.py -i "${nuclear_bam}" -o /dev/null --hist dups.hist + preseq lc_extrap -hist dups.hist -extrap 1.001e9 -s 1e6 -v > preseq.txt \ + || preseq lc_extrap -defects -hist dups.hist -extrap 1.001e9 -s 1e6 -v > preseq.txt + + # write out preseq targets + bash "\$STAMPIPES/scripts/utility/preseq_targets.sh" preseq.txt preseq_targets.txt + """ +} + +process cutcounts { + label "modules" + + publishDir params.outdir + + input: + file(fai) from file("${params.genome}.fai") + file(filtered_bam) from bam_for_cutcounts + + output: + file('fragments.starch') + file('cutcounts.starch') + file('cutcounts.bw') + file('cutcounts.bed.bgz') + file('cutcounts.bed.bgz.tbi') + + script: + """ + bam2bed --do-not-sort \ + < "$filtered_bam" \ + | awk -v cutfile=cuts.bed -v fragmentfile=fragments.bed -f \$STAMPIPES/scripts/bwa/aggregate/basic/cutatacfragments.awk + + sort-bed fragments.bed | starch - > fragments.starch + sort-bed cuts.bed | starch - > cuts.starch + + unstarch cuts.starch \ + | cut -f1-3 \ + | bedops -m - \ + | awk '{ for(i = \$2; i < \$3; i += 1) { print \$1"\t"i"\t"i + 1 }}' \ + > allbase.tmp + + unstarch cuts.starch \ + | bedmap --echo --count --delim "\t" allbase.tmp - \ + | awk '{print \$1"\t"\$2"\t"\$3"\tid-"NR"\t"\$4}' \ + | starch - > cutcounts.starch + + # Bigwig + "$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ + cutcounts.starch \ + cutcounts.bw \ + "${fai}" + + # tabix + unstarch cutcounts.starch | bgzip > cutcounts.bed.bgz + tabix -p bed cutcounts.bed.bgz + """ +} + +process density { + label "modules" + + publishDir params.outdir + + input: + file filtered_bam from bam_for_density + file chrom_bucket from file(params.chrom_bucket) + file fai from file("${params.genome}.fai") + + output: + file 'density.starch' + file 'density.bw' + file 'density.bgz' + file 'density.bgz.tbi' + set(file(filtered_bam), file('density.starch')) into to_normalize + + shell: + window_size = 75 + bin_size = 20 + scale = 1_000_000 + ''' + mkfifo density.bed + + bam2bed -d \ + < "!{filtered_bam}" \ + | cut -f1-6 \ + | awk '{ if( $6=="+" ){ s=$2; e=$2+1 } else { s=$3-1; e=$3 } print $1 "\t" s "\t" e "\tid\t" 1 }' \ + | sort-bed - \ + > 
density.bed \ + & + + unstarch "!{chrom_bucket}" \ + | bedmap --faster --echo --count --delim "\t" - density.bed \ + | awk -v "binI=!{bin_size}" -v "win=!{window_size}" \ + 'BEGIN{ halfBin=binI/2; shiftFactor=win-halfBin } { print $1 "\t" $2 + shiftFactor "\t" $3-shiftFactor "\tid\t" i $4}' \ + | starch - \ + > density.starch + + # Bigwig + "$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ + density.starch \ + density.bw \ + "!{fai}" \ + "!{bin_size}" + + # Tabix + unstarch density.starch | bgzip > density.bgz + tabix -p bed density.bgz + ''' + +} + +process normalize_density { + label "modules" + publishDir params.outdir + + input: + set(file(filtered_bam), file(density)) from to_normalize + file(fai) from file("${params.genome}.fai") + + output: + file 'normalized.density.starch' + file 'normalized.density.bw' + file 'normalized.density.bgz' + file 'normalized.density.bgz.tbi' + + shell: + bin_size = 20 + scale = 1_000_000 + ''' + # Normalized density + unstarch density.starch \ + | awk -v allcounts=$(samtools view -c !{filtered_bam}) \ + -v extranuclear_counts=$(samtools view -c "!{filtered_bam}" chrM chrC) \ + -v scale=!{scale} \ + 'BEGIN{ tagcount=allcounts-extranuclear_counts } + { z=$5; + n=(z/tagcount)*scale; + print $1 "\t" $2 "\t" $3 "\t" $4 "\t" n }' \ + | starch - > normalized.density.starch + + "$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ + normalized.density.starch \ + normalized.density.bw \ + "!{fai}" \ + "!{bin_size}" + + unstarch normalized.density.starch | bgzip > normalized.density.bgz + tabix -p bed normalized.density.bgz + ''' + +} + +process insert_sizes { + label "modules" + + publishDir params.outdir + + input: + file nuclear_bam from bam_for_inserts + file nuclear_chroms from file("${params.genome}.nuclear.txt") + + output: + file 'CollectInsertSizeMetrics.picard*' + + script: + """ + picard CollectInsertSizeMetrics \ + "INPUT=${nuclear_bam}" \ + OUTPUT=CollectInsertSizeMetrics.picard \ + HISTOGRAM_FILE=CollectInsertSizeMetrics.picard.pdf \ + VALIDATION_STRINGENCY=LENIENT \ + ASSUME_SORTED=true + + cat CollectInsertSizeMetrics.picard \ + | awk '/## HISTOGRAM/{x=1;next}x' \ + | sed 1d \ + > hist.txt + + python3 "\$STAMPIPES/scripts/utility/picard_inserts_process.py" hist.txt > CollectInsertSizeMetrics.picard.info + """ +} + +process motif_matrix { + label "modules" + + publishDir params.outdir + + input: + file hotspot_calls + file fimo_transfac from file("${dataDir}/motifs/${genome_name}.fimo.starch") + file fimo_names from file("${dataDir}/motifs/${genome_name}.fimo.transfac.names.txt") + + output: + file 'hs_motifs*.txt' + + when: + params.domotifs + + script: + """ + # create sparse motifs + bedmap --echo --echo-map-id --fraction-map 1 --delim '\t' "${hotspot_calls}" "${fimo_transfac}" > temp.bedmap.txt + python \$STAMPIPES/scripts/bwa/aggregate/basic/sparse_motifs.py "${fimo_names}" temp.bedmap.txt + """ +} + +process closest_features { + label "modules" + + publishDir params.outdir + + input: + file hotspot_calls + file transcript_starts from file("${dataDir}/features/${genome_name}.CombinedTxStarts.bed") + val thresholds from "0 1000 2500 5000 10000" + + when: + params.dofeatures + + output: + file 'prox_dist.info' + + script: + """ + closest-features \ + --dist \ + --delim '\t' \ + --closest \ + "${hotspot_calls}" \ + "${transcript_starts}" \ + > closest.txt + cat closest.txt \ + | grep -v "NA\$" \ + | awk -F"\t" '{print \$NF}' \ + | sed -e 's/-//g' \ + > closest.clean.txt + + for t in $thresholds ; do + awk \ + -v t=\$t \ + '\$1 > t {sum+=1} END {print 
"percent-proximal-" t "bp " sum/NR}' \ + closest.clean.txt \ + >> prox_dist.info + done + """ +} + +/* + * Metrics: Hotspot differential index comparison + */ +process differential_hotspots { + label "modules" + + publishDir params.outdir + + input: + file bam from bam_for_diff_peaks + file peaks from onepercent_peaks + file index from file(params.hotspot_index) + + output: + file 'differential_index_report.tsv' + + when: + params.hotspot_index != "." + + shell: + version = (new File(params.hotspot_index)).getAbsoluteFile().getParentFile().getName() + diffName = "dhsindex_${version}_differential_peaks" + diffPerName = "dhsindex_${version}_differential_peaks_percent" + conName = "dhsindex_${version}_constitutive_peaks" + conPerName = "dhsindex_${version}_constitutive_peaks_percent" + + ''' + set -e -o pipefail + statOverlap=$(bedops -e 1 "!{peaks}" "!{index}" | wc -l) + statNoOverlap=$(bedops -n 1 "!{peaks}" "!{index}" | wc -l) + total=$(unstarch "!{peaks}" | wc -l) + statPercOverlap=$(echo "scale=3; $statOverlap * 100.0/$total" | bc -q) + statPercNoOverlap=$(echo "scale=3; $statNoOverlap * 100.0/$total" | bc -q) + + { + echo -e "!{diffName}\t$statNoOverlap" + echo -e "!{diffPerName}\t$statPercNoOverlap" + echo -e "!{conName}\t$statOverlap" + echo -e "!{conPerName}\t$statPercOverlap" + } > differential_index_report.tsv + ''' +} + + +/* + * Footprint calling + */ +process learn_dispersion { + + label "footprints" + publishDir params.outdir + + memory = '8 GB' + cpus = 8 + + when: + params.bias != "" + + input: + file ref from file("${params.genome}.fa") + file bam from bam_for_footprints + //file bai + file spots from hotspot_calls_for_bias + file bias from file(params.bias) + + output: + set file('dm.json'), file(bam), file ("${bam}.bai") into dispersion + + script: + """ + samtools index "$bam" + + # TODO: Use nuclear file + unstarch $spots \ + | grep -v "_random" \ + | grep -v "chrUn" \ + | grep -v "chrM" \ + > intervals.bed + + ftd-learn-dispersion-model \ + --bm $bias \ + --half-win-width 5 \ + --processors 8 \ + $bam \ + $ref \ + intervals.bed \ + > dm.json + """.stripIndent() + +} + +process make_intervals { + + label "footprints" + input: + file starch from hotspot_calls + + output: + file 'chunk_*' into intervals mode flatten + + script: + """ + unstarch "$starch" \ + | grep -v "_random" \ + | grep -v "chrUn" \ + | grep -v "chrM" \ + | split -l "$params.chunksize" -a 4 -d - chunk_ + """.stripIndent() + +} + +process compute_deviation { + + label "footprints" + memory = '8 GB' + cpus = 4 + + input: + set file(interval), file(dispersion), file(bam), file(bai) from intervals.combine(dispersion) + file(bias) from file(params.bias) + file(ref) from file("${params.genome}.fa") + + output: + file 'deviation.out' into deviations + + script: + """ + ftd-compute-deviation \ + --bm "$bias" \ + --half-win-width 5 \ + --smooth-half-win-width 50 \ + --smooth-clip 0.01 \ + --dm "$dispersion" \ + --fdr-shuffle-n 50 \ + --processors 4 \ + "$bam" \ + "$ref" \ + "$interval" \ + | sort --buffer-size=8G -k1,1 -k2,2n \ + > deviation.out + """.stripIndent() +} + +process merge_deviation { + + label "footprints" + memory = "32 GB" + cpus = 1 + + when: + params.bias != "" + + input: + file 'chunk_*' from deviations.collect() + + output: + file 'interval.all.bedgraph' into merged_interval + + script: + """ + echo chunk_* + sort -k1,1 -k2,2n -S 32G -m chunk_* > interval.all.bedgraph + """.stripIndent() +} + +process working_tracks { + + label "footprints" + memory = '32 GB' + cpus = 1 + + publishDir 
params.outdir + + input: + file merged_interval + + output: + file 'interval.all.bedgraph' into bedgraph + file 'interval.all.bedgraph.starch' + file 'interval.all.bedgraph.gz' + file 'interval.all.bedgraph.gz.tbi' + + script: + """ + sort-bed "$merged_interval" | starch - > interval.all.bedgraph.starch + bgzip -c "$merged_interval" > interval.all.bedgraph.gz + tabix -0 -p bed interval.all.bedgraph.gz + """.stripIndent() + +} + +thresholds = Channel.from(0.2, 0.1, 0.05, 0.01, 0.001, 0.0001) + +process compute_footprints { + + label "footprints" + memory = '8 GB' + cpus = 1 + + publishDir params.outdir + + input: + set file(merged_interval), val(threshold) from merged_interval.combine(thresholds) + + output: + file "interval.all.fps.${threshold}.bed.gz" + file "interval.all.fps.${threshold}.bed.gz.tbi" + + script: + """ + output=interval.all.fps.${threshold}.bed + cat "${merged_interval}" \ + | awk -v OFS="\t" -v thresh="${threshold}" '\$8 <= thresh { print \$1, \$2-3, \$3+3; }' \ + | sort-bed --max-mem 8G - \ + | bedops -m - \ + | awk -v OFS="\t" -v thresh="${threshold}" '{ \$4="."; \$5=thresh; print; }' \ + > \$output + + bgzip -c "\$output" > "\$output.gz" + tabix -0 -p bed "\$output.gz" + """.stripIndent() + +} diff --git a/scripts/bwa/aggregate/basic/cutatacfragments.awk b/scripts/bwa/aggregate/basic/cutatacfragments.awk new file mode 100644 index 00000000..2b76c161 --- /dev/null +++ b/scripts/bwa/aggregate/basic/cutatacfragments.awk @@ -0,0 +1,28 @@ +# Run with variables +# cutfile -- BED output file for cuts +# fragmentfile -- BED output file for fragments +# This differs from the normal file by modifying the read start +# and end cuts to match the ATAC protocol +BEGIN{ + FS="\t" + OFS="\t" +}{ + strand=$6; + read_start=$2; + read_end=$3; + read_id=$1; + flag=$7; + tlen=$11; + if( strand == "+" ){ + cut_start = read_start + 4; + cut_end = cut_start + 1; + } else { + cut_start= read_end - 5; + cut_end = cut_start + 1; + } + print read_id, cut_start, cut_end, "id" , "1", strand > cutfile; + if (tlen > 0) { + fragment_end = read_start + tlen; + print read_id, read_start, fragment_end, ".", "." 
> fragmentfile; + } +} From ca746c5cd5b2181669feb0e523c49f03b15f23fa Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Mon, 6 May 2019 06:44:33 -0700 Subject: [PATCH 2/7] Need a non-adapter trimmed version of alignment --- processes/bwa/process_bwa_paired.nf | 520 ++++++++++++++++++++++++++++ 1 file changed, 520 insertions(+) create mode 100755 processes/bwa/process_bwa_paired.nf diff --git a/processes/bwa/process_bwa_paired.nf b/processes/bwa/process_bwa_paired.nf new file mode 100755 index 00000000..d9f890bc --- /dev/null +++ b/processes/bwa/process_bwa_paired.nf @@ -0,0 +1,520 @@ +#!/usr/bin/env nextflow +/* + * This is a proof-of-concept workflow for running our DNase alignment pipeline + */ + +params.help = false +params.threads = 1 +params.chunk_size = 16000000 + +params.UMI = false +params.trim_to = 0 +params.genome = "" +params.r1 = "" +params.r2 = "" +params.outdir = "output" +params.readlength = 36 + +nuclear_chroms = "$params.genome" + ".nuclear.txt" +dataDir = "$baseDir/../../data" + +def helpMessage() { + log.info""" + Usage: nextflow run process_bwa_paired.nf \\ + --r1 r1.fastq.gz \\ + --r2 r2.fastq.gz \\ + --genome /path/to/genome \\ + Options: + --threads [count] The number of threads that will be used for child applications (1) + --chunk_size [count] How many reads to process per chunk (16000000) + --UMI The reads contain UMI markers ('single-strand', 'True') (false) + --trim_to [length] Trim fastq reads to [length] bp (0 for no trimming) (0) + --outdir [dir] Where to write results to (output) + --read_length [length] How long our reads are (36) + """.stripIndent(); +} + +if (params.help || !params.r1 || !params.r2 || !params.genome){ + helpMessage(); + exit 0; +} + +// Some renaming for easier usage later +genome = params.genome +genome_name = file(params.genome).baseName +threads = params.threads + +/* + * Step 0: Split Fastq into chunks + */ +fastq_line_chunks = 4 * params.chunk_size +process split_r1_fastq { + + input: + file(r1) from file(params.r1) + val fastq_line_chunks + + output: + file('split_r1*gz') into split_r1 mode flatten + + script: + """ + zcat $r1 \ + | split -l $fastq_line_chunks \ + --filter='gzip -1 > \$FILE.gz' - 'split_r1' + """ +} +process split_r2_fastq { + input: + file(r2) from file(params.r2) + val fastq_line_chunks + + output: + file('split_r2*gz') into split_r2 mode flatten + + script: + """ + zcat $r2 \ + | split -l $fastq_line_chunks \ + --filter='gzip -1 > \$FILE.gz' - 'split_r2' + """ +} + +/* + * Step 1: Trim to the appropriate length + */ +process trim_to_length { + + input: + file split_r1 + file split_r2 + + output: + set file('r1.trim.fastq.gz'), file('r2.trim.fastq.gz') into trimmed_fastq + + script: + if (params.trim_to != 0) + // TODO: Add padding to length with N's + """ + zcat $split_r1 | awk 'NR%2==0 {print substr(\$0, 1, $params.trim_to)} NR%2!=0' | gzip -c -1 > r1.trim.fastq.gz + zcat $split_r2 | awk 'NR%2==0 {print substr(\$0, 1, $params.trim_to)} NR%2!=0' | gzip -c -1 > r2.trim.fastq.gz + """ + else + """ + ln -s $split_r1 r1.trim.fastq.gz + ln -s $split_r2 r2.trim.fastq.gz + """ + +} + +process add_umi_info { + + input: + set file(r1), file(r2) from trimmed_fastq + + output: + set file('r1.fastq.umi.gz'), file('r2.fastq.umi.gz') into with_umi + + script: + if (params.UMI == 'thruplex') + """ + python3 \$STAMPIPES/scripts/umi/extract_umt.py \ + <(zcat $r1) \ + <(zcat $r2) \ + >(gzip -c -1 > r1.fastq.umi.gz) \ + >(gzip -c -1 > r2.fastq.umi.gz) + """ + else if (params.UMI == 'True') // TODO: Is this right? 
+ """ + python3 \$STAMPIPES/scripts/umi/fastq_umi_add.py $r1 r1.fastq.umi.gz + python3 \$STAMPIPES/scripts/umi/fastq_umi_add.py $r2 r2.fastq.umi.gz + """ + + else if (params.UMI == false || params.UMI == "") + """ + ln -s $r1 r1.fastq.umi.gz + ln -s $r2 r2.fastq.umi.gz + """ + else + error "--UMI must be `thruplex`, `True` (for single-strand preparation), or false" +} + +/* + * Metrics: Fastq counts + */ +process fastq_counts { + + input: + file(r1) from file(params.r1) + file(r2) from file(params.r2) + + output: + file 'fastq.counts' into fastq_counts + + script: + """ + zcat $r1 \ + | awk -v paired=1 -f \$STAMPIPES/awk/illumina_fastq_count.awk \ + > fastq.counts + """ +} + +/* + * Step 2a: Create alignment files + */ +process align { + + cpus params.threads + + input: + set file(trimmed_r1), file(trimmed_r2) from with_umi + + file genome from file(params.genome) + file '*' from file("${params.genome}.amb") + file '*' from file("${params.genome}.ann") + file '*' from file("${params.genome}.bwt") + file '*' from file("${params.genome}.fai") + file '*' from file("${params.genome}.pac") + file '*' from file("${params.genome}.sa") + + + output: + file 'out.bam' into unfiltered_bam + + script: + """ + ls + bwa aln \ + -Y -l 32 -n 0.04 \ + -t "${params.threads}" \ + "$genome" \ + "$trimmed_r1" \ + > out1.sai + + bwa aln \ + -Y -l 32 -n 0.04 \ + -t "$params.threads" \ + "$genome" \ + "$trimmed_r2" \ + > out2.sai + + bwa sampe \ + -n 10 -a 750 \ + "$genome" \ + out1.sai out2.sai \ + "$trimmed_r1" "$trimmed_r2" \ + | samtools view -b -t "$genome".fai - \ + > out.bam + """ + +} + +/* + * Step 2b: filter bam files to have only good reads + */ +process filter_bam { + + input: + file unfiltered_bam + file nuclear_chroms from file(nuclear_chroms) + + output: + file 'filtered.bam' into filtered_bam + + script: + """ + python3 \$STAMPIPES/scripts/bwa/filter_reads.py \ + "$unfiltered_bam" \ + filtered.bam \ + "$nuclear_chroms" + """ +} + +/* + * Step 2c: sort bam files + */ +process sort_bam { + + cpus params.threads + + input: + file filtered_bam + + output: + file 'sorted.bam' into sorted_bam + + script: + """ + samtools sort \ + -l 0 -m 1G -@ "${params.threads}" "$filtered_bam" \ + > sorted.bam + """ +} + +/* + * Step 3: Merge alignments into one big ol' file + */ +process merge_bam { + input: + file 'sorted_bam_*' from sorted_bam.collect() + + output: + file 'merged.bam' into merged_bam + + script: + """ + samtools merge merged.bam sorted_bam* + samtools index merged.bam + """ +} + +/* + * Step 4: Mark duplicates with Picard + */ +process mark_duplicates { + + label "high_mem" + + publishDir params.outdir + + input: + file(merged_bam) from merged_bam + + output: + file 'marked.bam' into marked_bam + file 'marked.bam' into marked_bam_for_counts + file 'MarkDuplicates.picard' + + + script: + if (params.UMI) + """ + picard RevertOriginalBaseQualitiesAndAddMateCigar \ + INPUT=$merged_bam OUTPUT=cigar.bam \ + VALIDATION_STRINGENCY=SILENT RESTORE_ORIGINAL_QUALITIES=false SORT_ORDER=coordinate MAX_RECORDS_TO_EXAMINE=0 + + picard UmiAwareMarkDuplicatesWithMateCigar INPUT=cigar.bam OUTPUT=marked.bam \ + METRICS_FILE=MarkDuplicates.picard UMI_TAG_NAME=XD ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT \ + READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' \ + MINIMUM_DISTANCE=300 + """ + else + """ + picard RevertOriginalBaseQualitiesAndAddMateCigar \ + INPUT=$merged_bam OUTPUT=cigar.bam \ + VALIDATION_STRINGENCY=SILENT RESTORE_ORIGINAL_QUALITIES=false SORT_ORDER=coordinate 
MAX_RECORDS_TO_EXAMINE=0 + + picard MarkDuplicatesWithMateCigar INPUT=cigar.bam OUTPUT=marked.bam \ + METRICS_FILE=MarkDuplicates.picard ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT \ + READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' \ + MINIMUM_DISTANCE=300 + """ +} + +/* + * Step 5: Filter bam file + */ +filter_flag = 512 +if (params.UMI) + filter_flag = 1536 + +process filter_bam_to_unique { + + publishDir params.outdir + + input: + file marked_bam + + output: + set file('filtered.bam'), file('filtered.bam.bai') into uniquely_mapping_bam + + script: + """ + samtools view $marked_bam -b -F $filter_flag > filtered.bam + samtools index filtered.bam + """ + +} + +// Marked bam file is used by several downstream results +uniquely_mapping_bam.into { bam_for_insert; bam_for_spot; bam_for_density } + +/* + * Metrics: bam counts + */ +process bam_counts { + + + input: + file(sorted_bam) from marked_bam_for_counts + + output: + file('bam.counts.txt') into bam_counts + + script: + """ + python3 \$STAMPIPES/scripts/bwa/bamcounts.py \ + "$sorted_bam" \ + bam.counts.txt + """ +} + +/* + * Metrics: Insert size + */ +process insert_size { + + publishDir params.outdir + + input: + set file(bam), file(bai) from bam_for_insert + + output: + file 'CollectInsertSizeMetrics.picard' + file 'CollectInsertSizeMetrics.picard.pdf' + + script: + """ + samtools idxstats "$bam" \ + | cut -f 1 \ + | grep -v chrM \ + | grep -v chrC \ + | xargs samtools view -b "$bam" \ + > nuclear.bam + + picard CollectInsertSizeMetrics \ + INPUT=nuclear.bam \ + OUTPUT=CollectInsertSizeMetrics.picard \ + HISTOGRAM_FILE=CollectInsertSizeMetrics.picard.pdf \ + VALIDATION_STRINGENCY=LENIENT \ + ASSUME_SORTED=true + """ +} + +/* + * Metrics: SPOT score + */ + +process spot_score { + + publishDir params.outdir + + input: + set file(bam), file(bai) from bam_for_spot + file('*') from file("${dataDir}/annotations/${genome_name}.K${params.readlength}.mappable_only.bed") + file('*') from file("${dataDir}/annotations/${genome_name}.chromInfo.bed") + + output: + file 'subsample.r1.spot.out' + file 'spotdups.txt' + + script: + """ + # random sample + samtools view -h -F 12 -f 3 "$bam" \ + | awk '{if( ! 
index(\$3, "chrM") && \$3 != "chrC" && \$3 != "random"){print}}' \ + | samtools view -1 - \ + -o paired.bam + bash \$STAMPIPES/scripts/bam/random_sample.sh paired.bam subsample.bam 5000000 + samtools view -1 -f 0x0040 subsample.bam -o subsample.r1.bam + + # hotspot + bash \$STAMPIPES/scripts/SPOT/runhotspot.bash \ + \$HOTSPOT_DIR \ + \$PWD \ + \$PWD/subsample.r1.bam \ + "${genome_name}" \ + "${params.readlength}" \ + DNaseI + + # Remove existing duplication marks + picard RevertSam \ + INPUT=subsample.bam \ + OUTPUT=clear.bam \ + VALIDATION_STRINGENCY=SILENT REMOVE_DUPLICATE_INFORMATION=true SORT_ORDER=coordinate \ + RESTORE_ORIGINAL_QUALITIES=false REMOVE_ALIGNMENT_INFORMATION=false + + picard MarkDuplicatesWithMateCigar \ + INPUT=clear.bam \ + METRICS_FILE=spotdups.txt \ + OUTPUT=/dev/null \ + ASSUME_SORTED=true \ + MINIMUM_DISTANCE=300 \ + VALIDATION_STRINGENCY=SILENT \ + READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' + """ +} + +/* + * Density tracks (for browser) + */ +win = 75 +bini = 20 +process density_files { + + publishDir params.outdir + + input: + set file(bam), file(bai) from bam_for_density + file fai from file("${params.genome}.fai") + file density_buckets from file( + "$baseDir/../../data/densities/chrom-buckets.${genome_name}.${win}_${bini}.bed.starch" + ) + + output: + file 'density.bed.starch' + file 'density.bw' + file 'density.bed.bgz' + + + script: + """ + bam2bed -d \ + < $bam \ + | cut -f1-6 \ + | awk '{ if( \$6=="+" ){ s=\$2; e=\$2+1 } else { s=\$3-1; e=\$3 } print \$1 "\t" s "\t" e "\tid\t" 1 }' \ + | sort-bed - \ + > sample.bed + + unstarch "${density_buckets}" \ + | bedmap --faster --echo --count --delim "\t" - sample.bed \ + | awk -v binI=$bini -v win=$win \ + 'BEGIN{ halfBin=binI/2; shiftFactor=win-halfBin } { print \$1 "\t" \$2 + shiftFactor "\t" \$3-shiftFactor "\tid\t" i \$4}' \ + | starch - \ + > density.bed.starch + + unstarch density.bed.starch | awk -v binI=$bini -f \$STAMPIPES/awk/bedToWig.awk > density.wig + + wigToBigWig -clip density.wig "${fai}" density.bw + + + unstarch density.bed.starch | bgzip > density.bed.bgz + tabix -p bed density.bed.bgz + """ +} + +/* + * Metrics: total counts + */ +process total_counts { + + publishDir params.outdir + + input: + file 'fastqcounts*' from fastq_counts.collect() + file 'bamcounts*' from bam_counts.collect() + + output: + file 'tagcounts.txt' + + script: + """ + cat *counts* \ + | awk ' + { x[\$1] += \$2 } + END {for (i in x) print i "\t" x[i]} + ' \ + | sort -k 1,1 \ + > tagcounts.txt + """ +} From b928bf54e729a0259daa83291fec14170f9f4d78 Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Wed, 3 Jul 2019 12:06:49 -0700 Subject: [PATCH 3/7] Remove unneeded processes from ATAC nextflow --- processes/bwa/aggregate/atac.nf | 289 -------------------------------- 1 file changed, 289 deletions(-) diff --git a/processes/bwa/aggregate/atac.nf b/processes/bwa/aggregate/atac.nf index 42f26b6a..02923d22 100644 --- a/processes/bwa/aggregate/atac.nf +++ b/processes/bwa/aggregate/atac.nf @@ -448,292 +448,3 @@ process insert_sizes { python3 "\$STAMPIPES/scripts/utility/picard_inserts_process.py" hist.txt > CollectInsertSizeMetrics.picard.info """ } - -process motif_matrix { - label "modules" - - publishDir params.outdir - - input: - file hotspot_calls - file fimo_transfac from file("${dataDir}/motifs/${genome_name}.fimo.starch") - file fimo_names from file("${dataDir}/motifs/${genome_name}.fimo.transfac.names.txt") - - output: - file 'hs_motifs*.txt' - - when: - params.domotifs - - 
script: - """ - # create sparse motifs - bedmap --echo --echo-map-id --fraction-map 1 --delim '\t' "${hotspot_calls}" "${fimo_transfac}" > temp.bedmap.txt - python \$STAMPIPES/scripts/bwa/aggregate/basic/sparse_motifs.py "${fimo_names}" temp.bedmap.txt - """ -} - -process closest_features { - label "modules" - - publishDir params.outdir - - input: - file hotspot_calls - file transcript_starts from file("${dataDir}/features/${genome_name}.CombinedTxStarts.bed") - val thresholds from "0 1000 2500 5000 10000" - - when: - params.dofeatures - - output: - file 'prox_dist.info' - - script: - """ - closest-features \ - --dist \ - --delim '\t' \ - --closest \ - "${hotspot_calls}" \ - "${transcript_starts}" \ - > closest.txt - cat closest.txt \ - | grep -v "NA\$" \ - | awk -F"\t" '{print \$NF}' \ - | sed -e 's/-//g' \ - > closest.clean.txt - - for t in $thresholds ; do - awk \ - -v t=\$t \ - '\$1 > t {sum+=1} END {print "percent-proximal-" t "bp " sum/NR}' \ - closest.clean.txt \ - >> prox_dist.info - done - """ -} - -/* - * Metrics: Hotspot differential index comparison - */ -process differential_hotspots { - label "modules" - - publishDir params.outdir - - input: - file bam from bam_for_diff_peaks - file peaks from onepercent_peaks - file index from file(params.hotspot_index) - - output: - file 'differential_index_report.tsv' - - when: - params.hotspot_index != "." - - shell: - version = (new File(params.hotspot_index)).getAbsoluteFile().getParentFile().getName() - diffName = "dhsindex_${version}_differential_peaks" - diffPerName = "dhsindex_${version}_differential_peaks_percent" - conName = "dhsindex_${version}_constitutive_peaks" - conPerName = "dhsindex_${version}_constitutive_peaks_percent" - - ''' - set -e -o pipefail - statOverlap=$(bedops -e 1 "!{peaks}" "!{index}" | wc -l) - statNoOverlap=$(bedops -n 1 "!{peaks}" "!{index}" | wc -l) - total=$(unstarch "!{peaks}" | wc -l) - statPercOverlap=$(echo "scale=3; $statOverlap * 100.0/$total" | bc -q) - statPercNoOverlap=$(echo "scale=3; $statNoOverlap * 100.0/$total" | bc -q) - - { - echo -e "!{diffName}\t$statNoOverlap" - echo -e "!{diffPerName}\t$statPercNoOverlap" - echo -e "!{conName}\t$statOverlap" - echo -e "!{conPerName}\t$statPercOverlap" - } > differential_index_report.tsv - ''' -} - - -/* - * Footprint calling - */ -process learn_dispersion { - - label "footprints" - publishDir params.outdir - - memory = '8 GB' - cpus = 8 - - when: - params.bias != "" - - input: - file ref from file("${params.genome}.fa") - file bam from bam_for_footprints - //file bai - file spots from hotspot_calls_for_bias - file bias from file(params.bias) - - output: - set file('dm.json'), file(bam), file ("${bam}.bai") into dispersion - - script: - """ - samtools index "$bam" - - # TODO: Use nuclear file - unstarch $spots \ - | grep -v "_random" \ - | grep -v "chrUn" \ - | grep -v "chrM" \ - > intervals.bed - - ftd-learn-dispersion-model \ - --bm $bias \ - --half-win-width 5 \ - --processors 8 \ - $bam \ - $ref \ - intervals.bed \ - > dm.json - """.stripIndent() - -} - -process make_intervals { - - label "footprints" - input: - file starch from hotspot_calls - - output: - file 'chunk_*' into intervals mode flatten - - script: - """ - unstarch "$starch" \ - | grep -v "_random" \ - | grep -v "chrUn" \ - | grep -v "chrM" \ - | split -l "$params.chunksize" -a 4 -d - chunk_ - """.stripIndent() - -} - -process compute_deviation { - - label "footprints" - memory = '8 GB' - cpus = 4 - - input: - set file(interval), file(dispersion), file(bam), file(bai) from 
intervals.combine(dispersion) - file(bias) from file(params.bias) - file(ref) from file("${params.genome}.fa") - - output: - file 'deviation.out' into deviations - - script: - """ - ftd-compute-deviation \ - --bm "$bias" \ - --half-win-width 5 \ - --smooth-half-win-width 50 \ - --smooth-clip 0.01 \ - --dm "$dispersion" \ - --fdr-shuffle-n 50 \ - --processors 4 \ - "$bam" \ - "$ref" \ - "$interval" \ - | sort --buffer-size=8G -k1,1 -k2,2n \ - > deviation.out - """.stripIndent() -} - -process merge_deviation { - - label "footprints" - memory = "32 GB" - cpus = 1 - - when: - params.bias != "" - - input: - file 'chunk_*' from deviations.collect() - - output: - file 'interval.all.bedgraph' into merged_interval - - script: - """ - echo chunk_* - sort -k1,1 -k2,2n -S 32G -m chunk_* > interval.all.bedgraph - """.stripIndent() -} - -process working_tracks { - - label "footprints" - memory = '32 GB' - cpus = 1 - - publishDir params.outdir - - input: - file merged_interval - - output: - file 'interval.all.bedgraph' into bedgraph - file 'interval.all.bedgraph.starch' - file 'interval.all.bedgraph.gz' - file 'interval.all.bedgraph.gz.tbi' - - script: - """ - sort-bed "$merged_interval" | starch - > interval.all.bedgraph.starch - bgzip -c "$merged_interval" > interval.all.bedgraph.gz - tabix -0 -p bed interval.all.bedgraph.gz - """.stripIndent() - -} - -thresholds = Channel.from(0.2, 0.1, 0.05, 0.01, 0.001, 0.0001) - -process compute_footprints { - - label "footprints" - memory = '8 GB' - cpus = 1 - - publishDir params.outdir - - input: - set file(merged_interval), val(threshold) from merged_interval.combine(thresholds) - - output: - file "interval.all.fps.${threshold}.bed.gz" - file "interval.all.fps.${threshold}.bed.gz.tbi" - - script: - """ - output=interval.all.fps.${threshold}.bed - cat "${merged_interval}" \ - | awk -v OFS="\t" -v thresh="${threshold}" '\$8 <= thresh { print \$1, \$2-3, \$3+3; }' \ - | sort-bed --max-mem 8G - \ - | bedops -m - \ - | awk -v OFS="\t" -v thresh="${threshold}" '{ \$4="."; \$5=thresh; print; }' \ - > \$output - - bgzip -c "\$output" > "\$output.gz" - tabix -0 -p bed "\$output.gz" - """.stripIndent() - -} From 6d41982a0b034cd1d333483087f44e5ab5614baf Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Mon, 8 Jul 2019 22:24:32 -0700 Subject: [PATCH 4/7] intermediate tracking commit to use to compare to new changes --- processes/bwa/aggregate/atac.nf | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/processes/bwa/aggregate/atac.nf b/processes/bwa/aggregate/atac.nf index 02923d22..69d2ed13 100644 --- a/processes/bwa/aggregate/atac.nf +++ b/processes/bwa/aggregate/atac.nf @@ -107,7 +107,32 @@ process filter { samtools view -b -F "${flag}" marked.bam > filtered.bam """ } -filtered_bam.into { bam_for_hotspot2; bam_for_spot_score; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprints} +filtered_bam.into { bam_for_shift; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprints } + +/* +https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3959825/ +For peak calling and footprinting, we adjusted the read start sites to represent the center of the transposon binding event. +Previous descriptions of the Tn5 transposase show that the transposon binds as a dimer and inserts two adapters separated by +9 bps (main text ref. 11). Therefore, all reads aligning to the + strand were offset by +4 bps, and all reads +aligning to the – strand were offset −5 bps. 
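In other words, the two offsets sum to the 9 bp adapter spacing: for example, a + strand read
starting at position p and a – strand read whose 5' end lies at p+9 are both assigned to p+4,
so cuts from either strand of the same transposition event report the same center coordinate.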
+*/ +process shift { + label "modules" + + publishDir params.outdir + + input: + file filtered_bam from bam_for_shift + + output: + file "shifted.bam" into shifted_bam + + script: + """ + python3 \$STAMPIPES/scripts/bwa/aggregate/basic/shiftatacbam.py < ${filtered_bam} > shifted.bam + """ +} +shifted_bam.into { bam_for_hotspot2; bam_for_spot_score } process filter_nuclear { label "modules" From 88d82f11f221dd83fbdefde3ab22cc72c1714096 Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Thu, 11 Jul 2019 16:37:11 -0700 Subject: [PATCH 5/7] (fix) Update ATAC process with latest aggregation process --- processes/bwa/aggregate/atac.nf | 87 +++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 20 deletions(-) diff --git a/processes/bwa/aggregate/atac.nf b/processes/bwa/aggregate/atac.nf index 69d2ed13..d575fa17 100644 --- a/processes/bwa/aggregate/atac.nf +++ b/processes/bwa/aggregate/atac.nf @@ -9,11 +9,15 @@ params.dofeatures = false params.readlength = 36 -params.hotspot_index = "." +params.peakcaller = "hotspot2" params.bias = "" params.chunksize = 5000 +params.hotspot_id = "default" +params.hotspot_index = "." + + def helpMessage() { log.info""" Usage: nextflow run basic.nf \\ @@ -56,7 +60,7 @@ process merge { process dups { label "modules" publishDir params.outdir - memory '16 GB' + label 'high_mem' input: file(merged) @@ -107,7 +111,7 @@ process filter { samtools view -b -F "${flag}" marked.bam > filtered.bam """ } -filtered_bam.into { bam_for_shift; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprints } +filtered_bam.into { bam_for_shift; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprint; } /* https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3959825/ @@ -119,20 +123,33 @@ aligning to the – strand were offset −5 bps. 
process shift { label "modules" + cpus params.threads publishDir params.outdir input: file filtered_bam from bam_for_shift + file nuclear_chroms from file("${params.genome}.nuclear.txt") output: file "shifted.bam" into shifted_bam + file "nuclear_shifted.bam" into nuclear_shifted_bam script: """ - python3 \$STAMPIPES/scripts/bwa/aggregate/basic/shiftatacbam.py < ${filtered_bam} > shifted.bam + python3 \$STAMPIPES/scripts/bwa/aggregate/basic/shiftatacbam.py < ${filtered_bam} > unsorted_shifted.bam + # we have to resort after our shifting + samtools sort \ + -l 0 -m 1G -@ "${params.threads}" unsorted_shifted.bam \ + > shifted.bam + samtools index shifted.bam + # we also need a nuclear version + cat "${nuclear_chroms}" \ + | xargs samtools view -b shifted.bam \ + > nuclear_shifted.bam + samtools index nuclear_shifted.bam """ } -shifted_bam.into { bam_for_hotspot2; bam_for_spot_score } +nuclear_shifted_bam.into { bam_for_hotspot2; bam_for_macs2; bam_for_spot_score; } process filter_nuclear { label "modules" @@ -152,51 +169,80 @@ process filter_nuclear { """ } +process macs2 { + label "macs2" + publishDir "${params.outdir}/peaks_macs2" + scratch false + + when: + params.peakcaller == "macs2" + + input: + file bam from bam_for_macs2 + + output: + file 'NA_*' + + script: + """ + macs2 callpeak \ + -t "$bam" \ + -f BAMPE \ + -g hs \ + -B \ + -q 0.01 + """ +} + process hotspot2 { label "modules" publishDir "${params.outdir}" container "fwip/hotspot2:latest" + when: + params.peakcaller == "hotspot2" + input: - file(marked_bam) from bam_for_hotspot2 + val(hotspotid) from params.hotspot_id + file(nuclear) from bam_for_hotspot2 file(mappable) from file(params.mappable) file(chrom_sizes) from file(params.chrom_sizes) file(centers) from file(params.centers) output: - file('peaks/filtered*') - file('peaks/filtered.hotspots.fdr0.05.starch') into hotspot_calls - file('peaks/filtered.hotspots.fdr0.05.starch') into hotspot_calls_for_bias - file('peaks/filtered.peaks.fdr0.001.starch') into onepercent_peaks + file('peaks/nuclear_shifted*') + file('peaks/nuclear_shifted.hotspots.fdr0.05.starch') into hotspot_calls + file('peaks/nuclear_shifted.hotspots.fdr0.05.starch') into hotspot_calls_for_bias + file('peaks/nuclear_shifted.peaks.fdr0.001.starch') into onepercent_peaks script: """ export TMPDIR=\$(mktemp -d) - hotspot2.sh -F 0.5 -p varWidth_20_default \ + hotspot2.sh -F 0.5 -p "varWidth_20_${hotspotid}" \ -M "${mappable}" \ -c "${chrom_sizes}" \ -C "${centers}" \ - "${marked_bam}" \ + "${nuclear}" \ 'peaks' cd peaks # Rename peaks files to include FDR - mv filtered.peaks.narrowpeaks.starch filtered.peaks.narrowpeaks.fdr0.05.starch - mv filtered.peaks.starch filtered.peaks.fdr0.05.starch + mv nuclear_shifted.peaks.narrowpeaks.starch nuclear_shifted.peaks.narrowpeaks.fdr0.05.starch + mv nuclear_shifted.peaks.starch nuclear_shifted.peaks.fdr0.05.starch bash \$STAMPIPES/scripts/SPOT/info.sh \ - filtered.hotspots.fdr0.05.starch hotspot2 filtered.SPOT.txt \ - > filtered.hotspot2.info + nuclear_shifted.hotspots.fdr0.05.starch hotspot2 nuclear_shifted.SPOT.txt \ + > nuclear_shifted.hotspot2.info # TODO: Move this to separate process - hsmerge.sh -f 0.01 filtered.allcalls.starch filtered.hotspots.fdr0.01.starch - hsmerge.sh -f 0.001 filtered.allcalls.starch filtered.hotspots.fdr0.001.starch + hsmerge.sh -f 0.01 nuclear_shifted.allcalls.starch nuclear_shifted.hotspots.fdr0.01.starch + hsmerge.sh -f 0.001 nuclear_shifted.allcalls.starch nuclear_shifted.hotspots.fdr0.001.starch - density-peaks.bash \$TMPDIR 
varWidth_20_default filtered.cutcounts.starch filtered.hotspots.fdr0.01.starch ../"${chrom_sizes}" filtered.density.starch filtered.peaks.fdr0.01.starch \$(cat filtered.cleavage.total) - density-peaks.bash \$TMPDIR varWidth_20_default filtered.cutcounts.starch filtered.hotspots.fdr0.001.starch ../"${chrom_sizes}" filtered.density.starch filtered.peaks.fdr0.001.starch \$(cat filtered.cleavage.total) + density-peaks.bash \$TMPDIR "varWidth_20_${hotspotid}" nuclear_shifted.cutcounts.starch nuclear_shifted.hotspots.fdr0.01.starch ../"${chrom_sizes}" nuclear_shifted.density.starch nuclear_shifted.peaks.fdr0.01.starch \$(cat nuclear_shifted.cleavage.total) + density-peaks.bash \$TMPDIR "varWidth_20_${hotspotid}" nuclear_shifted.cutcounts.starch nuclear_shifted.hotspots.fdr0.001.starch ../"${chrom_sizes}" nuclear_shifted.density.starch nuclear_shifted.peaks.fdr0.001.starch \$(cat nuclear_shifted.cleavage.total) rm -rf "\$TMPDIR" """ @@ -354,6 +400,7 @@ process density { label "modules" publishDir params.outdir + label 'high_mem' input: file filtered_bam from bam_for_density From a107634db37784a0c749101a258cb8c83b42c5c7 Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Fri, 12 Jul 2019 14:22:55 -0700 Subject: [PATCH 6/7] (fix) Add SPOT dup score to aggregate process --- processes/bwa/aggregate/atac.nf | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/processes/bwa/aggregate/atac.nf b/processes/bwa/aggregate/atac.nf index d575fa17..c4832444 100644 --- a/processes/bwa/aggregate/atac.nf +++ b/processes/bwa/aggregate/atac.nf @@ -261,6 +261,7 @@ process spot_score { output: file 'r1.spot.out' file 'r1.hotspot.info' + file 'spotdups.picard' script: """ @@ -287,6 +288,31 @@ process spot_score { bash \$STAMPIPES/scripts/SPOT/info.sh \ r1.spots.starch hotspot1 r1.spot.out \ > r1.hotspot.info + + # Create the SPOT dup score + # RevertSam will barf if it's not sorted exactly how it likes apparently? + # Exception in thread "main" java.lang.IllegalArgumentException: Alignments added + # out of order in SAMFileWriterImpl.addAlignment for /tmp/nxf.ZbudE8UGBY/clear.bam. + # Sort order is coordinate. 
Offending records are at [chr1:195371075] and [chr10:3100064] + picard SortSam \ + INPUT=subsample.bam \ + OUTPUT=sorted_subsample.bam \ + SORT_ORDER=coordinate + # Remove existing duplication marks + picard RevertSam \ + INPUT=sorted_subsample.bam \ + OUTPUT=clear.bam \ + VALIDATION_STRINGENCY=SILENT REMOVE_DUPLICATE_INFORMATION=true SORT_ORDER=coordinate \ + RESTORE_ORIGINAL_QUALITIES=false REMOVE_ALIGNMENT_INFORMATION=false + # Make the dup scare + picard MarkDuplicatesWithMateCigar \ + INPUT=clear.bam \ + METRICS_FILE=spotdups.picard \ + OUTPUT=/dev/null \ + ASSUME_SORTED=true \ + MINIMUM_DISTANCE=300 \ + VALIDATION_STRINGENCY=SILENT \ + READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' """ } From aad095b387560b831dfb6e15a754fd3a8bf1094b Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Wed, 17 Jul 2019 08:11:37 -0700 Subject: [PATCH 7/7] Add shift bam python process --- scripts/bwa/aggregate/basic/shiftatacbam.py | 47 +++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 scripts/bwa/aggregate/basic/shiftatacbam.py diff --git a/scripts/bwa/aggregate/basic/shiftatacbam.py b/scripts/bwa/aggregate/basic/shiftatacbam.py new file mode 100644 index 00000000..7a5de163 --- /dev/null +++ b/scripts/bwa/aggregate/basic/shiftatacbam.py @@ -0,0 +1,47 @@ +""" +https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3959825/ +For peak calling and footprinting, we adjusted the read start sites to represent the center of the transposon binding event. +Previous descriptions of the Tn5 transposase show that the transposon binds as a dimer and inserts two adapters separated by +9 bps (main text ref. 11). Therefore, all reads aligning to the + strand were offset by +4 bps, and all reads +aligning to the – strand were offset −5 bps. + +python shiftbam.py < marked.bam > shifted.bam 2> shifted.log +""" + +import pysam +import csv +import sys + +def shift_bam(): + """ + Because of vagaries with pysam, this uses stdin and stdout: + https://pysam.readthedocs.io/en/latest/usage.html#using-streams + """ + infile = pysam.AlignmentFile("-", "rb") + outfile = pysam.AlignmentFile("-", "wb", template=infile) + + for n, read in enumerate(infile): + original_start = read.reference_start + # make the changes in the reference start + if read.is_reverse: + read.reference_start -= 5 + else: + read.reference_start += 4 + + # we can't go below 0 + if read.reference_start < 0: + print("Adjusted {} from {} to {} in {}, but setting to 0".format( + read.query_name, original_start, read.reference_start, read.reference_name), file=sys.stderr) + read.reference_start = 0 + # ended up not using inferred length because sometimes it is None? + if read.reference_start + read.query_length > infile.lengths[read.reference_id]: + new_location = infile.lengths[read.reference_id] - read.query_length + print("Adjusted {} from {} to {} in {}, but setting to {}".format( + read.query_name, original_start, read.reference_start, read.reference_name, new_location), file=sys.stderr) + read.reference_start = new_location + + outfile.write(read) + +if __name__ == "__main__": + + shift_bam()
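
As a quick, standalone check of the +4/−5 adjustment and the boundary clamping that
shiftatacbam.py performs (the helper below is illustrative only and is not part of the patch
series; the coordinates are made up), the same rule can be expressed and exercised in a few
lines of plain Python:

def shifted_start(reference_start, query_length, is_reverse, chrom_length):
    """Tn5 cut-site adjustment used above: +4 bp for + strand reads, -5 bp for
    - strand reads, clamped so the read never falls outside the chromosome."""
    new_start = reference_start - 5 if is_reverse else reference_start + 4
    new_start = max(new_start, 0)                    # can't move before the chromosome start
    if new_start + query_length > chrom_length:      # can't run off the chromosome end
        new_start = chrom_length - query_length
    return new_start

# + strand read: start moves 4 bp downstream
assert shifted_start(1000, 36, is_reverse=False, chrom_length=10000) == 1004
# - strand read: the whole alignment moves 5 bp upstream
assert shifted_start(1000, 36, is_reverse=True, chrom_length=10000) == 995
# reads near either chromosome edge are clamped, as in shift_bam()
assert shifted_start(2, 36, is_reverse=True, chrom_length=10000) == 0
assert shifted_start(9998, 36, is_reverse=False, chrom_length=10000) == 10000 - 36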