Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ This is the **Hi-C** pipeline from the `Sequana <https://sequana.readthedocs.org
:Overview: Hi-C pipeline to capture 3D chromatin interactions in a genome
:Input: Paired FastQ files and a reference genome in FASTA format
:Output: Cooler contact matrices, Hi-C QC reports, and a MultiQC summary
:Status: Beta
:Status: Production
:Citation: Cokelaer et al, (2017), 'Sequana': a Set of Snakemake NGS pipelines, Journal of Open Source Software, 2(16), 352, JOSS DOI https://doi:10.21105/joss.00352


Expand Down Expand Up @@ -111,6 +111,7 @@ Changelog
========= ====================================================================
Version Description
========= ====================================================================
0.2.0 Production release.
0.1.0 Migration to modern sequana_pipetools framework (get_shell/get_run,
schema validation, apptainer support, Python 3.10+).
0.0.1 **First release.**
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "sequana-hic"
version = "0.1.0"
version = "0.2.0"
description = "Hi-C analysis, snakemake, sequana, container, reproducibility"
authors = ["Sequana Team"]
license = "BSD-3"
Expand Down
8 changes: 4 additions & 4 deletions sequana_pipelines/hic/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ apptainers:
chromap: https://zenodo.org/record/14721866/files/chromap_0.2.7.img
pairtools: https://zenodo.org/record/14746699/files/pairtools_1.1.2.img
qc3c: https://zenodo.org/record/14746667/files/qc3c_0.5.0.img
fastqc: "https://zenodo.org/record/7015004/files/fastqc_0.11.9-py3.img"
seqkit: https://zenodo.org/record/7821924/files/seqkit_2.4.0.img
fastp: "https://zenodo.org/record/17097941/files/fastp_1.0.1.img"
fastqc: https://zenodo.org/record/7015004/files/fastqc_0.11.9-py3.img
split_fasta: https://zenodo.org/record/18257162/files/sequana_tools_26.1.14.img
fastp: https://zenodo.org/record/17097941/files/fastp_1.0.1.img


#############################################################################
Expand Down Expand Up @@ -134,7 +134,7 @@ bwa:
bwa_split:
nreads: 100000
index_algorithm: is
options: -T 30 -M
options: -5SP -M
threads: 4
tmp_directory: ./tmp
resources:
Expand Down
11 changes: 8 additions & 3 deletions sequana_pipelines/hic/hic.rules
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ configfile: "config.yaml"


# A convenient manager
manager = sm.PipelineManager("hic", config, schema="schema.yaml")
manager = sm.PipelineManager("hic", config)

# chromap does not seem to work for this HiC data set.

Expand All @@ -53,6 +53,7 @@ if config["general"]["mapper"] == "chromap":
rule pipeline:
input:
expand("{sample}/chromap/{sample}.aln.pairs", sample=manager.samples),
expand("{sample}/qc3c/report.qc3C.html", sample=manager.samples),
"multiqc/multiqc_report.html",
".sequana/rulegraph.svg",
expand("{sample}/cooler/{sample}.contact.5000.png", sample=manager.samples),
Expand All @@ -62,6 +63,7 @@ else:

rule pipeline:
input: #expand("{sample}/bwa/{sample}.sorted.bam", sample=manager.samples),
expand("{sample}/qc3c/report.qc3C.html", sample=manager.samples),
"multiqc/multiqc_report.html",
".sequana/rulegraph.svg",
expand("{sample}/cooler/{sample}.contact.5000.png", sample=manager.samples),
Expand Down Expand Up @@ -216,10 +218,13 @@ elif config["general"]["mapper"] == "bwa_split":
params:
nreads=config["bwa_split"]["nreads"],
container:
config["apptainers"]["seqkit"]
config["apptainers"]["split_fasta"]
shell:
"""
seqkit split2 --by-size {params.nreads} -O {wildcards.sample}/split -1 {input[0]} -2 {input[1]} >{log} 2>&1
# seqkit split2 (32-bit binary) crashes on large inputs; use sequana fastq-split instead
mkdir -p {wildcards.sample}/split
sequana fastq-split {input[0]} --by-size {params.nreads} --gzip --pattern {wildcards.sample}/split/{wildcards.sample}_R1.#.fastq.gz > {log} 2>&1
sequana fastq-split {input[1]} --by-size {params.nreads} --gzip --pattern {wildcards.sample}/split/{wildcards.sample}_R2.#.fastq.gz >> {log} 2>&1
"""

def get_bwa_input():
Expand Down
2 changes: 1 addition & 1 deletion sequana_pipelines/hic/src/hic_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1067,7 +1067,7 @@ def write_report(self, quiet=False):
sub_str = template_string.format(**self.out_stats) # splat the statistics and path into the markdown, render as html

html = """<html><head><link rel="stylesheet" href="style.css"></head><body>"""
html += md.markdown(sub_str, extensions=['markdown.extensions.tables', 'markdown.extensions.nl2br'])
html += md.markdown(sub_str, extensions=['markdown.extensions.tables', 'markdown.extensions.nl2br', 'md_in_html'])

html += "</body></html>"
# write out just html
Expand Down
12 changes: 6 additions & 6 deletions sequana_pipelines/hic/src/report_template.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<center>
<center markdown="1">

# Hi-C Library QC Report

Expand All @@ -19,7 +19,7 @@
| Same strand high-quality* (HQ) read pairs (RPs) | {same_strand_hq_html} | > {same_strand_threshold}% |
| Informative RPs** | {informative_read_pairs_html} | > {informative_read_pairs_threshold}% |

<div class="small center">
<div class="small center" markdown="1">
<br />
*High-quality (HQ) read pairs have minimum mapping quality >= 20, maximum edit distance <= 5, and are not duplicates.<br>
**Informative read pairs are read pairs which have MAPQ > 0, are not PCR duplicates, and map to different contigs or >10kb apart.
Expand All @@ -34,7 +34,7 @@
| Fraction of HQ RPs Intercontig (CTGs >10kb)** | {intercontig_hq_contacts_html} | > {intercontig_hq_contacts_threshold}% |
| Informative HQ RPs per-contig per-1M-RPs (CTGs >5kb)*** | {usable_hq_gt_5k_per_million_html} | > {min_usable_reads_per_contig_per_million_threshold} |

<div class="small center">
<div class="small center" markdown="1">
<br />
*The proportion of <em>read pairs that span at least 10kb</em>, out of all read pairs that map (a) with high-quality, (b) to the same contig, (c) where that contig is at least 10kb long.<br>
**The proportion of <em>read pairs mapping to two different contigs each greater than 10kb</em>, out of all read pairs that map with high-quality.<br>
Expand All @@ -51,7 +51,7 @@
| Zero MAPQ reads | {many_zero_mapq_reads_html} | <= {many_zero_mapq_threshold}% |
| Unmapped reads | {many_unmapped_reads_html} | <= {many_unmapped_threshold}% |

<div class="small center">
<div class="small center" markdown="1">
<br />
*Note that the sum of informative and noninformative read pairs is not 100% because read pairs with mapping distance between 1b and 10kb are not classified as either informative or noninformative.<br>
Because noninformative reads can belong to more than one category, these numbers may sum to a value larger than the overall noninformative read pair amount at the top of the report.<br>
Expand All @@ -76,7 +76,7 @@ Because noninformative reads can belong to more than one category, these numbers

## Extended Library Statistics

<center>
<center markdown="1">

| Label | Library statistics | Expected values |
| :----------- | --------------------:| --------------------:|
Expand All @@ -94,7 +94,7 @@ Because noninformative reads can belong to more than one category, these numbers
| Restriction Enzyme(s) | {lib_enzyme} | N/A
</center>

<div class="small center">
<div class="small center" markdown="1">
<br />
*The average number of usable high-quality read pairs per contig, for contigs greater than 5kb. Read pairs are "usable" if they map (a) with high-quality, (b) to different contigs, (c) where each of those contigs are greater than 5kb and (d) both mappings are high-quality.<br>
</div>
Expand Down
16 changes: 16 additions & 0 deletions sequana_pipelines/hic/src/style.css
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,22 @@ block_img {
clear: left;
}

table {
border-collapse: collapse;
margin: 12px auto;
font-size: 13px;
}

th, td {
border: 1px solid #cccccc;
padding: 4px 10px;
}

th {
background-color: #34495e;
color: white;
}

tr:nth-child(even) {
background-color: #DDDDDD;
}
Expand Down
72 changes: 72 additions & 0 deletions test/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import subprocess
import tempfile

import yaml
from click.testing import CliRunner

from sequana_pipelines.hic.main import main
Expand All @@ -12,6 +13,23 @@
reference = f"{sharedir}/chr1.fa"


def _run(tmp_path, *extra):
runner = CliRunner()
return runner.invoke(
main,
[
"--input-directory",
sharedir,
"--reference-file",
reference,
"--working-directory",
str(tmp_path / "wk"),
"--force",
*extra,
],
)


def test_standalone_script(tmp_path):
runner = CliRunner()
results = runner.invoke(
Expand Down Expand Up @@ -48,6 +66,60 @@ def test_standalone_script_bwa_split(tmp_path):
assert results.exit_code == 0


def test_standalone_script_chromap(tmp_path):
results = _run(tmp_path, "--aligner-choice", "chromap")
assert results.exit_code == 0


def test_invalid_aligner_fails(tmp_path):
results = _run(tmp_path, "--aligner-choice", "notanaligner")
assert results.exit_code != 0


def test_missing_reference_fails(tmp_path):
runner = CliRunner()
results = runner.invoke(
main,
[
"--input-directory",
sharedir,
"--working-directory",
str(tmp_path / "wk"),
"--force",
],
)
assert results.exit_code != 0


def test_setup_artifacts(tmp_path):
wk = tmp_path / "wk"
results = _run(tmp_path)
assert results.exit_code == 0
# launcher and config produced
assert (wk / "hic.sh").exists()
assert (wk / "config.yaml").exists()
# custom files copied into .sequana by main()
for name in ("hic_qc.py", "hic_qc_style.css", "report_template.md"):
assert (wk / ".sequana" / name).exists()


def test_config_reflects_options(tmp_path):
wk = tmp_path / "wk"
results = _run(tmp_path, "--aligner-choice", "chromap", "--enable-fastp")
assert results.exit_code == 0
cfg = yaml.safe_load((wk / "config.yaml").read_text())
assert cfg["general"]["mapper"] == "chromap"
assert cfg["fastp"]["do"] is True


def test_config_fastp_disabled_by_default(tmp_path):
wk = tmp_path / "wk"
results = _run(tmp_path)
assert results.exit_code == 0
cfg = yaml.safe_load((wk / "config.yaml").read_text())
assert cfg["fastp"]["do"] is False


def test_version():
runner = CliRunner()
results = runner.invoke(main, ["--version"])
Expand Down
Loading