Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions config/config_CD19_BBz.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Long-read data
experiment: "BBz"
experiment: "annotation_test_cmod"
samples:
MK028_BBz: "/home/weichan/permanent/Projects/VIS/Data/VIS_MPX_pooled/VIS_MPX_MK028_BBz_pooled.bam"
MK014_BBz: "/home/weichan/permanent/Projects/VIS/Data/VIS_MPX_pooled/VIS_MPX_MK014_BBz_pooled.bam"
Expand Down Expand Up @@ -27,26 +27,28 @@ ref_genome_ctrl: "/home/weichan/permanent/Projects/VIS/Data/VIS_Magdeburg_withBa
#ucsc_H3K27Ac: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K27Ac_02_24"
#ucsc_H3K4Me3: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me3_02_24"
#ucsc_DNaseH: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_DNaseClusters_02_24_cut"
ucsc_TF: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed"
annotate_ucsc_tf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed"
#annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODEV44_processed.bed"
annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed"
annotate_gencode: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed"
#ucsc_Genes_gtf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/gencode.v44.annotation.gtf"
#ucsc exons
ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons"
ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns"
annotate_ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons"
annotate_ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns"
#promoters
ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
annotate_ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
#enhancer
annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
#sedb
#sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed"
#sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed"
#tss
#ucsc_tss: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TSS_peaks_processed_sorted.bed"
#miRNA
#ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed"
annotate_ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed"
#cancer genes
cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
annotate_cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
#hiC
encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
annotate_encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
#VIS detection
detection: "rules/detection.smk"
#qc rule collection
Expand All @@ -61,3 +63,6 @@ downstream: "rules/downstream.smk"
#epigenetics: "rules/epigenetics.smk"
#variants rule collection WIP
#variants: "rules/variants.smk"
base_modifications: "rules/base_modifications.smk"
plot_functional_genomics: "rules/plot_functional_genomics.smk"
threads: 10
63 changes: 63 additions & 0 deletions config/config_Clone1.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Long-read data
experiment: "Clone1_cmod_28z"
samples:
Clone1: "/home/weichan/permanent/Projects/VIS/Data/CAR_clonality/combined_first_clone_sup.bam"
processing_dir: "/home/weichan/temporary/Projects/VIS_out"
#source_dir: "Src/"
insertion_fasta: "/home/weichan/permanent/Projects/VIS/Data/pSLCAR-CD19-28z/pSLCAR-CD19-28z.fa"
blastn_db: "/home/weichan/blastNdb/blastNdb" #human nucleotides to see which parts of the human genome match the CAR construct
# protein BLASTp database
#proteindb: "/home/weichan/swissprot/swissprot"
splitmode: "Buffer" #"Buffer", "Join", "Separated"
fragment_size: 100 #input for custom function
bridging_size: 300 #amount of bases that are bridged if fragments do not match between other fragments
MinLength: 1
MAPQ: 20 #mapping quality filter after pseudo-read generation
MinInsertionLength: 500 #212 eef1a length,cd247 1600nt. 1182 in car123 maybe 1200 with buffer?
ref_genome_ctrl: "/home/weichan/permanent/Projects/VIS/Data/VIS_Magdeburg_withBasecalling/hg38.fa"
#ref_genome_ctrl: "/home/weichan/permanent/Src/T2T_ref/chm13v2.0.fa"
#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/CD123/hg38_CD123.fasta" #might still contain empty last row!
#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/VIS_Magdeburg_withBasecalling/hg38_CD19.fasta"
#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/pSLCAR-CD19-28z/hg38_CD19_28z.fa"
#ucsc_repeats: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_RepeatMasker_02_24"
#all bed files need to be sorted and without the header
#ucsc_H3K4Me1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me1_02_24"
#ucsc_H3K27Ac: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K27Ac_02_24"
#ucsc_H3K4Me3: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me3_02_24"
#ucsc_DNaseH: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_DNaseClusters_02_24_cut"
ucsc_TF: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed"
#annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODEV44_processed.bed"
annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed"
#ucsc_Genes_gtf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/gencode.v44.annotation.gtf"
#ucsc exons
ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons"
ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns"
#promoters
ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
#sedb
#sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed"
#sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed"
#tss
#ucsc_tss: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TSS_peaks_processed_sorted.bed"
#miRNA
#ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed"
#cancer genes
cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
#hiC
encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
#VIS detection
detection: "rules/detection.smk"
#qc rule collection
quality_control: "rules/qc.smk"
#functional genomics rule collection
functional_genomics: "rules/functional_genomics.smk"
# downstream analysis rules; not as formally strict
downstream: "rules/downstream.smk"
#rules that are still under development WIP
#development: "rules/development.smk"
#epigenetics rule collection WIP
#epigenetics: "rules/epigenetics.smk"
#variants rule collection WIP
#variants: "rules/variants.smk"
cmod: "rules/cmod.smk"
threads: 10
51 changes: 32 additions & 19 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,45 +12,58 @@ SAMPLES = expand(config["samples"])
outdir = os.path.join(config["processing_dir"],str(config["experiment"]))
fragmentsize=config["fragment_size"]

# needs to be improved somehow. Maybe even by removing the global import and replacing every single fucntion with a local import line
#local functions - path to helper fucntions needs to be added to the sys path, otherwise import won't find the file
rootpath = os.path.join("workflow/scripts")
sys.path.append(rootpath)
cwd=os.getcwd()

import VIS_helper_functions as vhf #custom functions to make snakemake pipeline leaner

#inmport rules
#inmport default rules
include: config["detection"]
include: config["functional_genomics"]
include: config["quality_control"]

#conditional rule all based on defined rules
conditional_output = list()

#analysis-specific rules
#include: config["downstream"]
if "functional_genomics" in config:
import VIS_functional_genomics_helper_functions as vhf_fg
include: config["functional_genomics"]
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES))
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES))
conditional_output.append(expand(f"{outdir}/intermediate/localization/annotation/Annotation_{{annotation}}_Insertions_{{sample}}.bed",
annotation=[k.replace("annotate_", "") for k in config if k.startswith("annotate_")],
sample=SAMPLES))

#development
#include: config["epigenetics"]
#include: config["variants"]
#include: config["development"]
if "plot_functional_genomics" in config:
import VIS_plot_functional_genomics_helper_functions as vhf_pfg
include: config["plot_functional_genomics"]
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES))
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES))

if "base_modifications" in config:
include: config["base_modifications"]
conditional_output.append(expand(f"{outdir}/final/base_modifications/Isolated_Reads_{{sample}}.tsv", sample=SAMPLES))
conditional_output.append(expand(f"{outdir}/final/base_modifications/Calls_Isolated_Reads_{{sample}}.tsv", sample=SAMPLES))

#target rule
rule all:
input:
input:
#detection
expand(f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed", sample=SAMPLES),
f"{outdir}/final/localization/Heatmap_Insertion_Chr.png",
f"{outdir}/final/localization/Insertion_length.png",

#functional genomics
expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES),
expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES),
expand(f"{outdir}/intermediate/localization/annotation/Annotation_gene_Insertions_{{sample}}.bed", sample=SAMPLES),
expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES),
#quality control
expand(f"{outdir}/final/qc/mapq/{{sample}}_mapq_heatmap_image.png", sample=SAMPLES),
expand(f"{outdir}/final/qc/Fragmentation/Insertions/insertions_{fragmentsize}_{{sample}}", sample=SAMPLES),
expand(f"{outdir}/final/qc/Fragmentation/Longest_Interval/{{sample}}/", sample=SAMPLES),
f"{outdir}/final/qc/multiqc_report.html",
# process
f"{outdir}/config_settings.yml",

# downstream
# other output
conditional_output
# downstream
#expand(f"{outdir}/intermediate/blastn/Filtered_Annotated_{fragmentsize}_InsertionMatches_{{sample}}.gff", sample=SAMPLES),
#expand(f"{outdir}/final/functional_genomics/localization/{fragmentsize}_{{sample}}", sample=SAMPLES),

# for msa
#expand(f"{outdir}/intermediate/fasta/Inserted_sequence_{{sample}}.fa", sample=SAMPLES)
7 changes: 7 additions & 0 deletions workflow/envs/VIS_modkit_env.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
name: VIS_modkit_env
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- ont-modkit
111 changes: 111 additions & 0 deletions workflow/rules/base_modifications.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
######
######
###### Base modifications of reads with isnertions
######
######

# Notes: Methylartist works in principle, but there is one very interesting problem: SInce the Insertion are usually cut out from the optimal alignment, their modifications cannot be seen anymore.
# This means that it is not possible to have an idea about what is going on at the exact insertiopn if we use the common tools! -> this makes manual labor necessary

rule readnames_final:
input:
final_insertions=f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed"
log:
log=f"{outdir}/intermediate/log/base_modifications/readnames_final/{{sample}}.log"
output:
readnames=temp(f"{outdir}/intermediate/base_modifications/final_insertion_readnames_{{sample}}.txt")
conda:
"../envs/VIS_dummy_env.yml"
shell:
"""
(
cut {input} -f 4 > {output}
) > {log.log} 2>&1
"""

rule only_keep_filtered_insertions:
input:
isobam=f"{outdir}/intermediate/mapping/Precut_{{sample}}_sorted.bam",
readnames=f"{outdir}/intermediate/cmod/final_insertion_readnames_{{sample}}.txt"
log:
log=f"{outdir}/intermediate/log/base_modifications/only_keep_filtered_insertions/{{sample}}.log"
output:
bam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam"
conda:
"../envs/VIS_samtools_env.yml"
shell:
"""
(
samtools view -h -b -N {input.readnames} {input.isobam} | samtools sort > {output.bam}
samtools index {output.bam}
) > {log.log} 2>&1
"""

rule modkit:
input:
isobam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam"
log:
log=f"{outdir}/intermediate/log/base_modifications/modkit/{{sample}}.log"
output:
tsv=f"{outdir}/final/base_modifications/Isolated_Reads_{{sample}}.tsv"
conda:
"../envs/VIS_modkit_env.yml"
threads: config["threads"]
shell:
"""
(
modkit extract full -t {threads} {input.isobam} {output.tsv}
) > {log.log} 2>&1
"""

rule call_modkit:
input:
isobam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam"
log:
log=f"{outdir}/intermediate/log/base_modifications/call_modkit/{{sample}}.log"
output:
tsv=f"{outdir}/final/base_modifications/Calls_Isolated_Reads_{{sample}}.tsv"
conda:
"../envs/VIS_modkit_env.yml"
threads: config["threads"]
shell:
"""
(
modkit extract calls -t {threads} {input.isobam} {output.tsv}
) > {log.log} 2>&1
"""

'''
rule specific_methylartist:
input:
bam=f"{outdir}/intermediate/mapping/Precut_{{sample}}_sorted.bam",
ref=f"{outdir}/intermediate/mapping/insertion_ref_genome.fa" #must be indexed!
log:
log=f"{outdir}/intermediate/log/cmod/specific_methylartist/{{sample}}.log"
output:
plot=f"{outdir}/intermediate/cmod/Region3_Reads_{{sample}}.png"
conda:
"../envs/VIS_methylartist_env.yml"
shell:
"""
(
methylartist locus -b {input.bam} -i chr17:31124037-31180287 -n C -r {input.ref} -l 31154037-31159287 -o {output.plot}
) > {log.log} 2>&1
"""


rule inserted_seq:
input:
insertion=f"{outdir}/intermediate/fasta/Isolated_Reads_{{sample}}.fa",
coordinates=f"{outdir}/intermediate/blastn/Coordinates_{fragmentsize}_InsertionMatches_{{sample}}.blastn"
log:
log=f"{outdir}/intermediate/log/cmod/inserted_seq/{{sample}}.log"
output:
fasta=f"{outdir}/intermediate/fasta/Inserted_sequence_{{sample}}.fa"
run:
try:
vhf.get_inserted_fasta_seq(input.insertion, input.coordinates, output[0], log.log)
except Exception as e:
with open(log.log, "a") as log_file:
log_file.write(f"Error: {str(e)}\n")
'''
Loading