diff --git a/config/config_CD19_BBz.yml b/config/config_CD19_BBz.yml index de193db..6f53d98 100644 --- a/config/config_CD19_BBz.yml +++ b/config/config_CD19_BBz.yml @@ -1,5 +1,5 @@ # Long-read data -experiment: "BBz" +experiment: "annotation_test_cmod" samples: MK028_BBz: "/home/weichan/permanent/Projects/VIS/Data/VIS_MPX_pooled/VIS_MPX_MK028_BBz_pooled.bam" MK014_BBz: "/home/weichan/permanent/Projects/VIS/Data/VIS_MPX_pooled/VIS_MPX_MK014_BBz_pooled.bam" @@ -27,26 +27,28 @@ ref_genome_ctrl: "/home/weichan/permanent/Projects/VIS/Data/VIS_Magdeburg_withBa #ucsc_H3K27Ac: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K27Ac_02_24" #ucsc_H3K4Me3: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me3_02_24" #ucsc_DNaseH: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_DNaseClusters_02_24_cut" -ucsc_TF: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed" +annotate_ucsc_tf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed" #annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODEV44_processed.bed" -annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed" +annotate_gencode: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed" #ucsc_Genes_gtf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/gencode.v44.annotation.gtf" #ucsc exons -ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons" -ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns" +annotate_ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons" +annotate_ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns" #promoters -ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew" +annotate_ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew" +#enhancer +annotate_ucsc_enhancer: 
"/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed" #sedb #sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed" #sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed" #tss #ucsc_tss: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TSS_peaks_processed_sorted.bed" #miRNA -#ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed" +annotate_ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed" #cancer genes -cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed" +annotate_cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed" #hiC -encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed" +annotate_encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed" #VIS detection detection: "rules/detection.smk" #qc rule collection @@ -61,3 +63,6 @@ downstream: "rules/downstream.smk" #epigenetics: "rules/epigenetics.smk" #variants rule collection WIP #variants: "rules/variants.smk" +base_modifications: "rules/base_modifications.smk" +plot_functional_genomics: "rules/plot_functional_genomics.smk" +threads: 10 diff --git a/config/config_Clone1.yml b/config/config_Clone1.yml new file mode 100644 index 0000000..344c628 --- /dev/null +++ b/config/config_Clone1.yml @@ -0,0 +1,63 @@ +# Long-read data +experiment: "Clone1_cmod_28z" +samples: + Clone1: "/home/weichan/permanent/Projects/VIS/Data/CAR_clonality/combined_first_clone_sup.bam" +processing_dir: "/home/weichan/temporary/Projects/VIS_out" +#source_dir: "Src/" +insertion_fasta: "/home/weichan/permanent/Projects/VIS/Data/pSLCAR-CD19-28z/pSLCAR-CD19-28z.fa" +blastn_db: "/home/weichan/blastNdb/blastNdb" #human 
nucleotides to see which parts of the human genome match the CAR construct +# protein BLASTp database +#proteindb: "/home/weichan/swissprot/swissprot" +splitmode: "Buffer" #"Buffer", "Join", "Separated" +fragment_size: 100 #input for custom function +bridging_size: 300 #amount of bases that are bridged if fragments do not match between other fragments +MinLength: 1 +MAPQ: 20 #mapping quality filter after pseudo-read generation +MinInsertionLength: 500 #212 eef1a length,cd247 1600nt. 1182 in car123 maybe 1200 with buffer? +ref_genome_ctrl: "/home/weichan/permanent/Projects/VIS/Data/VIS_Magdeburg_withBasecalling/hg38.fa" +#ref_genome_ctrl: "/home/weichan/permanent/Src/T2T_ref/chm13v2.0.fa" +#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/CD123/hg38_CD123.fasta" #might still contain empty last row! +#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/VIS_Magdeburg_withBasecalling/hg38_CD19.fasta" +#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/pSLCAR-CD19-28z/hg38_CD19_28z.fa" +#ucsc_repeats: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_RepeatMasker_02_24" +#all bed files need to be sorted and without the header +#ucsc_H3K4Me1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me1_02_24" +#ucsc_H3K27Ac: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K27Ac_02_24" +#ucsc_H3K4Me3: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me3_02_24" +#ucsc_DNaseH: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_DNaseClusters_02_24_cut" +ucsc_TF: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed" +#annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODEV44_processed.bed" +annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed" +#ucsc_Genes_gtf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/gencode.v44.annotation.gtf" +#ucsc exons +ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons" +ucsc_introns: 
"/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns" +#promoters +ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew" +#sedb +#sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed" +#sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed" +#tss +#ucsc_tss: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TSS_peaks_processed_sorted.bed" +#miRNA +#ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed" +#cancer genes +cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed" +#hiC +encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed" +#VIS detection +detection: "rules/detection.smk" +#qc rule collection +quality_control: "rules/qc.smk" +#functional genomics rule collection +functional_genomics: "rules/functional_genomics.smk" +# downstream analysis rules; not as formally strict +downstream: "rules/downstream.smk" +#rules that are still under development WIP +#development: "rules/development.smk" +#epigenetics rule collection WIP +#epigenetics: "rules/epigenetics.smk" +#variants rule collection WIP +#variants: "rules/variants.smk" +cmod: "rules/cmod.smk" +threads: 10 \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 577b1fa..3b791d2 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -12,36 +12,47 @@ SAMPLES = expand(config["samples"]) outdir = os.path.join(config["processing_dir"],str(config["experiment"])) fragmentsize=config["fragment_size"] +# needs to be improved somehow. 
Maybe even by removing the global import and replacing every single fucntion with a local import line +#local functions - path to helper fucntions needs to be added to the sys path, otherwise import won't find the file +rootpath = os.path.join("workflow/scripts") +sys.path.append(rootpath) +cwd=os.getcwd() + import VIS_helper_functions as vhf #custom functions to make snakemake pipeline leaner -#inmport rules +#inmport default rules include: config["detection"] -include: config["functional_genomics"] include: config["quality_control"] +#conditional rule all based on defined rules +conditional_output = list() -#analysis-specific rules -#include: config["downstream"] +if "functional_genomics" in config: + import VIS_functional_genomics_helper_functions as vhf_fg + include: config["functional_genomics"] + conditional_output.append(expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES)) + conditional_output.append(expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES)) + conditional_output.append(expand(f"{outdir}/intermediate/localization/annotation/Annotation_{{annotation}}_Insertions_{{sample}}.bed", + annotation=[k.replace("annotate_", "") for k in config if k.startswith("annotate_")], + sample=SAMPLES)) -#development -#include: config["epigenetics"] -#include: config["variants"] -#include: config["development"] +if "plot_functional_genomics" in config: + import VIS_plot_functional_genomics_helper_functions as vhf_pfg + include: config["plot_functional_genomics"] + conditional_output.append(expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES)) + conditional_output.append(expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES)) +if "base_modifications" in config: + include: config["base_modifications"] + 
conditional_output.append(expand(f"{outdir}/final/base_modifications/Isolated_Reads_{{sample}}.tsv", sample=SAMPLES)) + conditional_output.append(expand(f"{outdir}/final/base_modifications/Calls_Isolated_Reads_{{sample}}.tsv", sample=SAMPLES)) -#target rule rule all: - input: + input: #detection expand(f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed", sample=SAMPLES), f"{outdir}/final/localization/Heatmap_Insertion_Chr.png", f"{outdir}/final/localization/Insertion_length.png", - - #functional genomics - expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES), - expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES), - expand(f"{outdir}/intermediate/localization/annotation/Annotation_gene_Insertions_{{sample}}.bed", sample=SAMPLES), - expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES), #quality control expand(f"{outdir}/final/qc/mapq/{{sample}}_mapq_heatmap_image.png", sample=SAMPLES), expand(f"{outdir}/final/qc/Fragmentation/Insertions/insertions_{fragmentsize}_{{sample}}", sample=SAMPLES), @@ -49,8 +60,10 @@ rule all: f"{outdir}/final/qc/multiqc_report.html", # process f"{outdir}/config_settings.yml", - - # downstream + # other output + conditional_output + # downstream #expand(f"{outdir}/intermediate/blastn/Filtered_Annotated_{fragmentsize}_InsertionMatches_{{sample}}.gff", sample=SAMPLES), #expand(f"{outdir}/final/functional_genomics/localization/{fragmentsize}_{{sample}}", sample=SAMPLES), - + # for msa + #expand(f"{outdir}/intermediate/fasta/Inserted_sequence_{{sample}}.fa", sample=SAMPLES) \ No newline at end of file diff --git a/workflow/envs/VIS_modkit_env.yml b/workflow/envs/VIS_modkit_env.yml new file mode 100644 index 0000000..62c4a4d --- /dev/null +++ b/workflow/envs/VIS_modkit_env.yml @@ -0,0 +1,7 @@ +name: VIS_modkit_env +channels: + - conda-forge + - bioconda + - defaults 
+dependencies: + - ont-modkit diff --git a/workflow/rules/base_modifications.smk b/workflow/rules/base_modifications.smk new file mode 100644 index 0000000..1df65f4 --- /dev/null +++ b/workflow/rules/base_modifications.smk @@ -0,0 +1,111 @@ +###### +###### +###### Base modifications of reads with isnertions +###### +###### + +# Notes: Methylartist works in principle, but there is one very interesting problem: SInce the Insertion are usually cut out from the optimal alignment, their modifications cannot be seen anymore. +# This means that it is not possible to have an idea about what is going on at the exact insertiopn if we use the common tools! -> this makes manual labor necessary + +rule readnames_final: + input: + final_insertions=f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed" + log: + log=f"{outdir}/intermediate/log/base_modifications/readnames_final/{{sample}}.log" + output: + readnames=temp(f"{outdir}/intermediate/base_modifications/final_insertion_readnames_{{sample}}.txt") + conda: + "../envs/VIS_dummy_env.yml" + shell: + """ + ( + cut {input} -f 4 > {output} + ) > {log.log} 2>&1 + """ + +rule only_keep_filtered_insertions: + input: + isobam=f"{outdir}/intermediate/mapping/Precut_{{sample}}_sorted.bam", + readnames=f"{outdir}/intermediate/cmod/final_insertion_readnames_{{sample}}.txt" + log: + log=f"{outdir}/intermediate/log/base_modifications/only_keep_filtered_insertions/{{sample}}.log" + output: + bam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam" + conda: + "../envs/VIS_samtools_env.yml" + shell: + """ + ( + samtools view -h -b -N {input.readnames} {input.isobam} | samtools sort > {output.bam} + samtools index {output.bam} + ) > {log.log} 2>&1 + """ + +rule modkit: + input: + isobam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam" + log: + log=f"{outdir}/intermediate/log/base_modifications/modkit/{{sample}}.log" + output: + 
tsv=f"{outdir}/final/base_modifications/Isolated_Reads_{{sample}}.tsv" + conda: + "../envs/VIS_modkit_env.yml" + threads: config["threads"] + shell: + """ + ( + modkit extract full -t {threads} {input.isobam} {output.tsv} + ) > {log.log} 2>&1 + """ + +rule call_modkit: + input: + isobam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam" + log: + log=f"{outdir}/intermediate/log/base_modifications/call_modkit/{{sample}}.log" + output: + tsv=f"{outdir}/final/base_modifications/Calls_Isolated_Reads_{{sample}}.tsv" + conda: + "../envs/VIS_modkit_env.yml" + threads: config["threads"] + shell: + """ + ( + modkit extract calls -t {threads} {input.isobam} {output.tsv} + ) > {log.log} 2>&1 + """ + +''' +rule specific_methylartist: + input: + bam=f"{outdir}/intermediate/mapping/Precut_{{sample}}_sorted.bam", + ref=f"{outdir}/intermediate/mapping/insertion_ref_genome.fa" #must be indexed! + log: + log=f"{outdir}/intermediate/log/cmod/specific_methylartist/{{sample}}.log" + output: + plot=f"{outdir}/intermediate/cmod/Region3_Reads_{{sample}}.png" + conda: + "../envs/VIS_methylartist_env.yml" + shell: + """ + ( + methylartist locus -b {input.bam} -i chr17:31124037-31180287 -n C -r {input.ref} -l 31154037-31159287 -o {output.plot} + ) > {log.log} 2>&1 + """ + + +rule inserted_seq: + input: + insertion=f"{outdir}/intermediate/fasta/Isolated_Reads_{{sample}}.fa", + coordinates=f"{outdir}/intermediate/blastn/Coordinates_{fragmentsize}_InsertionMatches_{{sample}}.blastn" + log: + log=f"{outdir}/intermediate/log/cmod/inserted_seq/{{sample}}.log" + output: + fasta=f"{outdir}/intermediate/fasta/Inserted_sequence_{{sample}}.fa" + run: + try: + vhf.get_inserted_fasta_seq(input.insertion, input.coordinates, output[0], log.log) + except Exception as e: + with open(log.log, "a") as log_file: + log_file.write(f"Error: {str(e)}\n") +''' \ No newline at end of file diff --git a/workflow/rules/detection.smk b/workflow/rules/detection.smk index 2f60a5c..4aaa489 100644 
--- a/workflow/rules/detection.smk +++ b/workflow/rules/detection.smk @@ -29,13 +29,13 @@ rule build_insertion_reference: log: log=f"{outdir}/intermediate/log/detection/build_insertion_reference/out.log" output: - temp(f"{outdir}/intermediate/mapping/insertion_ref_genome.fa") + f"{outdir}/intermediate/mapping/insertion_ref_genome.fa" conda: "../envs/VIS_dummy_env.yml" shell: """ ( - cat {input.ref} {input.insertion} > {output} + cat {input.ref} {input.insertion} | awk 'NF' > {output} ) > {log.log} 2>&1 """ @@ -60,7 +60,7 @@ rule minimap_index: rule make_fasta_without_tags: #fasta of raw data no trimming whatsoever input: - fq=lambda wildcards: config["samples"][wildcards.sample] + bam=lambda wildcards: config["samples"][wildcards.sample] log: log=f"{outdir}/intermediate/log/detection/make_fasta_without_tags/{{sample}}.log" output: @@ -68,11 +68,52 @@ rule make_fasta_without_tags: #fasta of raw data no trimming whatsoever conda: "../envs/VIS_samtools_env.yml" shell: + """ + ( + samtools fasta {input.bam} -o {output.fasta} > {output.fasta} + ) > {log.log} 2>&1 + """ + +###### +###### +###### Only use reads with insertions for speed +###### +###### + +rule insertion_reads: + input: + bam=lambda wildcards: config["samples"][wildcards.sample], + readnames=f"{outdir}/intermediate/blastn/Readnames_{fragmentsize}_InsertionMatches_{{sample}}.txt" + log: + log=f"{outdir}/intermediate/log/detection/insertion_reads_cmod/{{sample}}.log" + output: + isobam=f"{outdir}/intermediate/mapping/Isolated_Reads_{{sample}}.bam" + conda: + "../envs/VIS_samtools_env.yml" + shell: + """ + ( + samtools view -b -N {input.readnames} {input.bam} | samtools sort > {output.isobam} + samtools index {output.isobam} + ) > {log.log} 2>&1 + """ + +rule fasta_insertion_reads: + input: + f"{outdir}/intermediate/mapping/Isolated_Reads_{{sample}}.bam" + log: + log=f"{outdir}/intermediate/log/detection/fasta_insertion_reads_cmod/{{sample}}.log" + output: + 
f"{outdir}/intermediate/fasta/Isolated_Reads_{{sample}}.fa" + conda: + "../envs/VIS_samtools_env.yml" + shell: """ ( samtools fasta {input} -o {output} > {output} ) > {log.log} 2>&1 """ + ###### ###### ###### "Clean" BAM: Cut-out fasta to BAM via Mapping to reference @@ -90,19 +131,21 @@ rule Non_insertion_mapping: #mapping against the unaltered referenc egenome log=f"{outdir}/intermediate/log/detection/Non_insertion_mapping/{{sample}}.log" resources: mem_mb=5000 + threads: config["threads"] conda: "../envs/VIS_minimap_env.yml" shell: #N=0 instead of default N=1 """ ( - minimap2 -y -ax map-ont --score-N 0 {input.genome} {input.fasta} | samtools sort | samtools view -F 2304 -o {output} + minimap2 -t {threads} -y -ax map-ont --score-N 0 {input.genome} {input.fasta} | samtools sort | samtools view -F 2304 -o {output} samtools index {output} ) > {log.log} 2>&1 """ rule insertion_mapping: #conserves tags! input: - bam=lambda wildcards: config["samples"][wildcards.sample], + #bam=lambda wildcards: config["samples"][wildcards.sample], #full data + bam=f"{outdir}/intermediate/mapping/Isolated_Reads_{{sample}}.bam", #only reads with insertions minimapref=f"{outdir}/intermediate/mapping/ref_genome_index.mmi", ref=f"{outdir}/intermediate/mapping/insertion_ref_genome.fa" output: @@ -111,12 +154,13 @@ rule insertion_mapping: #conserves tags! 
log=f"{outdir}/intermediate/log/detection/insertion_mapping/{{sample}}.log" resources: mem_mb=5000 + threads: config["threads"] conda: "../envs/VIS_minimap_env.yml" shell: """ ( - samtools bam2fq -T '*' {input.bam}| minimap2 -y -ax map-ont {input.minimapref} - | samtools sort | samtools view -F 2304 -o {output} + samtools bam2fq -T '*' {input.bam}| minimap2 -t {threads} -y -ax map-ont {input.minimapref} - | samtools sort | samtools view -F 2304 -o {output} samtools index {output} ) > {log.log} 2>&1 """ @@ -196,7 +240,8 @@ rule get_coordinates_for_fasta: #filters and combines matches rule split_fasta: input: breakpoints=f"{outdir}/intermediate/blastn/Coordinates_{fragmentsize}_InsertionMatches_{{sample}}.blastn", - fasta=f"{outdir}/intermediate/fasta/Full_{{sample}}.fa" + #fasta=f"{outdir}/intermediate/fasta/Full_{{sample}}.fa" + fasta=f"{outdir}/intermediate/fasta/Isolated_Reads_{{sample}}.fa" params: mode=config["splitmode"] #if each split fasta substring should be used individually, use "Separated" Join, New mode: Buffer log: @@ -216,6 +261,7 @@ rule split_fasta: ###### Insertion preparation: Fragmentation ###### ###### + rule prepare_insertion: input: config["insertion_fasta"] #insertion fasta sequence @@ -287,6 +333,8 @@ rule find_insertion_BLASTn: log=f"{outdir}/intermediate/log/detection/find_insertion_BLASTn/{{sample}}.log" output: temp(f"{outdir}/intermediate/blastn/{fragmentsize}_InsertionMatches_{{sample}}.blastn") + threads: + config["threads"] conda: "../envs/VIS_blastn_env.yml" shell: @@ -295,6 +343,7 @@ rule find_insertion_BLASTn: mkdir {params.tempdir} blastn \ + -num_threads {threads} \ -query {input.fasta} \ -db {input.insertion} \ -out {params.tempdir}/temp_output.blastn \ @@ -315,6 +364,8 @@ rule find_insertion_BLASTn_in_Ref: fasta=f"{outdir}/intermediate/fasta/fragments/{fragmentsize}_Insertion_fragments.fa" params: refdb=config.get("blastn_db", "") # Optional blastn database path + threads: + config["threads"] log: 
log=f"{outdir}/intermediate/log/detection/find_insertion_BLASTn_in_Ref/{{sample}}.log" output: @@ -330,6 +381,7 @@ rule find_insertion_BLASTn_in_Ref: else # If blastn_db is provided, run the blastn command blastn \ + -num_threads {threads} \ -query {input.fasta} \ -db {params.refdb} \ -out {output} \ @@ -380,13 +432,12 @@ rule extract_by_length: ) > {log.log} 2>&1 """ + ###### ###### ###### Visualisation of insertions ###### - - rule basic_insertion_plots: input: expand(f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed", sample=SAMPLES) @@ -433,7 +484,23 @@ rule calculate_exact_insertion_coordinates: except Exception as e: with open(log.log, "a") as log_file: log_file.write(f"Error: {str(e)}\n") - + +rule insertion_points: + input: + f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed" + output: + f"{outdir}/final/localization/InsertionPoints_{{sample}}.bed" + log: + f"{outdir}/intermediate/log/detection/insertion_points/{{sample}}.log" + conda: + "../envs/VIS_dummy_env.yml" + shell: + """ + ( + awk '{{OFS="\t"; $3 = $2 + 1; print $0}}' {input} > {output} + ) > {log} 2>&1 + """ + rule collect_outputs: input: coordinates=f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed", diff --git a/workflow/rules/functional_genomics.smk b/workflow/rules/functional_genomics.smk index 855bc1f..4b11aee 100644 --- a/workflow/rules/functional_genomics.smk +++ b/workflow/rules/functional_genomics.smk @@ -2,128 +2,52 @@ #sort insertion file rule sort_insertion_file: - input: - exact=f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed", - point=f"{outdir}/intermediate/localization/annotation/temp_Insertions_{{sample}}.bed" - log: - log=f"{outdir}/intermediate/log/functional_genomics/sort_insertion_file/{{sample}}.log", - log2=f"{outdir}/intermediate/log/functional_genomics/sort_insertion_file/temp_{{sample}}.log" - output: - exact=temp(f"{outdir}/intermediate/localization/Sorted_ExactInsertions_{{sample}}.bed"), - 
point=f"{outdir}/intermediate/localization/temp_Sorted_ExactInsertions_{{sample}}.bed" - conda: - "../envs/VIS_dummy_env.yml" - shell: - """ - ( - sort -k1,1 -k2,2n {input.exact} > {output.exact} - ) > {log.log} 2>&1 - ( - sort -k1,1 -k2,2n {input.point} > {output.point} - ) > {log.log2} 2>&1 - """ + input: + exact=f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed", + point=f"{outdir}/final/localization/InsertionPoints_{{sample}}.bed" + log: + log=f"{outdir}/intermediate/log/functional_genomics/sort_insertion_file/{{sample}}.log", + log2=f"{outdir}/intermediate/log/functional_genomics/sort_insertion_file/points_{{sample}}.log" + output: + exact=temp(f"{outdir}/intermediate/localization/Sorted_ExactInsertions_{{sample}}.bed"), + point=f"{outdir}/intermediate/localization/Sorted_InsertionPoints_{{sample}}.bed" + conda: + "../envs/VIS_dummy_env.yml" + shell: + """ + ( + sort -k1,1 -k2,2n {input.exact} > {output.exact} + ) > {log.log} 2>&1 + ( + sort -k1,1 -k2,2n {input.point} > {output.point} + ) > {log.log2} 2>&1 + """ -### Distance of VIS to genes, TSS, miRNAs, and TF: Annotation of the insertions rule calc_distance_to_elements: input: #insertions=f"{outdir}/intermediate/localization/Sorted_ExactInsertions_{{sample}}.bed", - insertions=f"{outdir}/intermediate/localization/temp_Sorted_ExactInsertions_{{sample}}.bed", + insertions=f"{outdir}/intermediate/localization/Sorted_InsertionPoints_{{sample}}.bed", ref=config["ref_genome_ctrl"] params: - genes=config["ucsc_introns"], - tf=config.get("ucsc_TF",""), - tss=config.get("encode_hic", ""), - mirna=config.get("cosmic_genes", ""), - exons=config.get("ucsc_exons", "") + annotation_files={k: v for k, v in config.items() if k.startswith("annotate_")} log: log=f"{outdir}/intermediate/log/functional_genomics/calc_distance_to_elements/{{sample}}.log" output: f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed" run: - vhf.calculate_element_distance(input.insertions, 
output[0], log.log, params[0:5]) - -rule plot_distance_to_elements: - input: - distancetable=f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed" - params: - distances=list(range(-10000, 10001, 2000)), - threshold=10000 - log: - log1=f"{outdir}/intermediate/log/functional_genomics/plot_distance_to_elements/scatter_{{sample}}.log", - log2=f"{outdir}/intermediate/log/functional_genomics/plot_distance_to_elements/violin_{{sample}}.log" - output: - scatter=report(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png"), - run: - try: - vhf.plot_element_distance(input.distancetable, params.distances, params.threshold, output.scatter, log.log1) - except Exception as e: - with open(log.log1, "a") as log_file: - log_file.write(f"Error: {str(e)}\n") - -rule plot_scoring: + vhf.calculate_element_distance(input.insertions, output[0], log.log, params.annotation_files) + +#kinda 'hacky' solution for flexibility: Now it does not matter if one annotation or 20 are defined. 
+rule annotation_overlap_insertion: input: - f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed" + insertions_bed=f"{outdir}/intermediate/localization/Sorted_InsertionPoints_{{sample}}.bed" + params: + annotation_files={k.replace("annotate_", ""): v for k, v in config.items() if k.startswith("annotate_")} log: - log=f"{outdir}/intermediate/log/functional_genomics/plot_scoring/{{sample}}.log" + log=f"{outdir}/intermediate/log/functional_genomics/annotation_overlap_insertion/{{sample}}.log" output: - plot=report(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg"), - data=(f"{outdir}/final/functional_genomics/Insertion_Scoring_Data_{{sample}}.txt") + **{k.replace("annotate_", ""): f"{outdir}/intermediate/localization/annotation/Annotation_{k.replace('annotate_', '')}_Insertions_{{sample}}.bed" + for k in config if k.startswith("annotate_")} run: - try: - vhf.scoring_insertions(input[0], output.plot, output.data, log.log) - except Exception as e: - with open(log.log, "a") as log_file: - log_file.write(f"Error: {str(e)}\n") - -rule modify_insertions: - input: - f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed" - output: - temp(f"{outdir}/intermediate/localization/annotation/temp_Insertions_{{sample}}.bed") - log: - f"{outdir}/intermediate/log/functional_genomics/modify_insertions/{{sample}}.log" - conda: - "../envs/VIS_dummy_env.yml" - shell: - """ - ( - awk '{{OFS="\t"; $3 = $2 + 1; print $0}}' {input} > {output} - ) > {log} 2>&1 - """ - -rule annotation_overlap_insertion: - input: - f"{outdir}/intermediate/localization/temp_Sorted_ExactInsertions_{{sample}}.bed" - #f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed" - params: - exons=config.get("ucsc_exons"), - introns=config.get("ucsc_introns"), - promoter=config.get("ucsc_promoter"), - gene=config.get("annotation_1"), - log: - 
log=f"{outdir}/intermediate/log/functional_genomics/annotation_overlap_insertion/introns_{{sample}}.log", - log2=f"{outdir}/intermediate/log/functional_genomics/annotation_overlap_insertion/exons_{{sample}}.log", - log3=f"{outdir}/intermediate/log/functional_genomics/annotation_overlap_insertion/promoter_{{sample}}.log", - log4=f"{outdir}/intermediate/log/functional_genomics/annotation_overlap_insertion/gene_{{sample}}.log" - output: - introns=f"{outdir}/intermediate/localization/annotation/Annotation_introns_Insertions_{{sample}}.bed", - exons=f"{outdir}/intermediate/localization/annotation/Annotation_exons_Insertions_{{sample}}.bed", - promoter=f"{outdir}/intermediate/localization/annotation/Annotation_promoter_Insertions_{{sample}}.bed", - gene=f"{outdir}/intermediate/localization/annotation/Annotation_gene_Insertions_{{sample}}.bed" - conda: - "../envs/VIS_bedtools_env.yml" - shell: #1 base is enough - """ - ( - bedtools intersect -a {input} -b {params.introns} -wb > {output.introns} - ) > {log.log} 2>&1 - ( - bedtools intersect -a {input} -b {params.exons} -wb > {output.exons} - ) > {log.log2} 2>&1 - ( - bedtools intersect -a {input} -b {params.promoter} -wb > {output.promoter} - ) > {log.log3} 2>&1 - ( - bedtools intersect -a {input} -b {params.gene} -wb > {output.gene} - ) > {log.log4} 2>&1 - """ + vhf_fg.run_bedtools_intersect(input.insertions_bed, output, log.log, params.annotation_files) + diff --git a/workflow/rules/plot_functional_genomics.smk b/workflow/rules/plot_functional_genomics.smk new file mode 100644 index 0000000..02053ad --- /dev/null +++ b/workflow/rules/plot_functional_genomics.smk @@ -0,0 +1,32 @@ +rule plot_distance_to_elements: + input: + distancetable=f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed" + params: + distances=list(range(-10000, 10001, 2000)), + threshold=10000 + log: + log1=f"{outdir}/intermediate/log/functional_genomics/plot_distance_to_elements/scatter_{{sample}}.log", + 
log2=f"{outdir}/intermediate/log/functional_genomics/plot_distance_to_elements/violin_{{sample}}.log" + output: + scatter=report(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png"), + run: + try: + vhf_pfg.plot_element_distance(input.distancetable, params.distances, params.threshold, output.scatter, log.log1) + except Exception as e: + with open(log.log1, "a") as log_file: + log_file.write(f"Error: {str(e)}\n") + +rule plot_scoring: + input: + f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed" + log: + log=f"{outdir}/intermediate/log/functional_genomics/plot_scoring/{{sample}}.log" + output: + plot=report(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg"), + data=(f"{outdir}/final/functional_genomics/Insertion_Scoring_Data_{{sample}}.txt") + run: + try: + vhf_pfg.scoring_insertions(input[0], output.plot, output.data, log.log) + except Exception as e: + with open(log.log, "a") as log_file: + log_file.write(f"Error: {str(e)}\n") \ No newline at end of file diff --git a/workflow/scripts/VIS_functional_genomics_helper_functions.py b/workflow/scripts/VIS_functional_genomics_helper_functions.py new file mode 100644 index 0000000..025e41c --- /dev/null +++ b/workflow/scripts/VIS_functional_genomics_helper_functions.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +import pandas as pd +import pybedtools + +#wrapper +from VIS_helper_functions import redirect_logging + +#functional +@redirect_logging(logfile_param="logfile") +def calculate_element_distance(insertions_bed, output_bed, logfile, annotation_files): + """ + Calculates distances between insertion sites and genomic annotations using bedtools closest. + + Parameters: + - insertions_bed (str): Path to the BED file containing the insertion sites. + - output_bed (str): Path to save the output BED file. + - annotation_files (dict): Dict with config_entry as keys and pathways as values. 
+ """ + + # At least one annotation input necessary + if not annotation_files: + raise ValueError("At least one annotation file must be provided.") + + # Create DataFrame for combined annotations + combined_df = pd.DataFrame() + + for tag, file in annotation_files.items(): + try: + df = pd.read_csv(file, sep="\t", header=None, usecols=[0, 1, 2, 3, 4, 5]) + if df.iloc[0, 0].startswith("chr"): + df["source"] = tag # Add source column + print(f"Loaded {tag}: {df.head()}") + combined_df = pd.concat([combined_df, df], ignore_index=True) + except: + print(f"Error reading {file})") + print("BED files are expected to follow BED6 format convention. If more than 6 columns are provided, the first 6 will be used.") + continue + + # Convert combined DataFrame bed object + combined_bed = pybedtools.BedTool.from_dataframe(combined_df) + sorted_annotations = combined_bed.sort() + + insertions = pybedtools.BedTool(insertions_bed) + + #bedtools closest operation + closest = insertions.closest(sorted_annotations, D="a", filenames=True) + + print(type(closest)) + print(closest) + closest.saveas(output_bed) + print(f"Distances calculated and saved to {output_bed}") + +@redirect_logging(logfile_param="logfile") +def run_bedtools_intersect(insertions_bed, output_files, logfile, annotations): + """ + Runs bedtools intersect for each annotation using pybedtools. 
+ """ + insertions = pybedtools.BedTool(insertions_bed) + + for key, annotation_file in annotations.items(): + output_file = output_files[key] + annotation = pybedtools.BedTool(annotation_file) + + intersected = insertions.intersect(annotation, wb=True) + + intersected.saveas(output_file) + print(f"Completed: {key}") diff --git a/workflow/scripts/VIS_helper_functions.py b/workflow/scripts/VIS_helper_functions.py index 2ff9545..a2b9281 100755 --- a/workflow/scripts/VIS_helper_functions.py +++ b/workflow/scripts/VIS_helper_functions.py @@ -15,11 +15,7 @@ import matplotlib.pyplot as plt import collections import seaborn as sns -import pybedtools import json -import subprocess -#from Bio.Align.Applications import ClustalOmegaCommandline -from matplotlib.patches import Patch #wrapper from functools import wraps @@ -594,7 +590,6 @@ def plot_longest_interval(matches, longest_start, longest_end, longest_subject_i rotation=90 ) - #invisible y axis plt.box(False) plt1 = plt.gca() @@ -645,318 +640,6 @@ def find_and_plot_longest_blast_interval(blastn, buffer, threshold, output_dir, else: print(f"Not enough matches for {query_id}.") -#functional -@redirect_logging(logfile_param="logfile") -def calculate_element_distance(insertions_bed, output_bed, logfile, annotation_files): - """ - Calculates distances between insertion sites and genomic annotations using bedtools closest. - - Parameters: - - insertions_bed (str): Path to the BED file containing the insertion sites. - - output_bed (str): Path to save the output BED file. - - annotation_files (str): List of paths to annotation BED files. 
- """ - - # At least one annotation input necessary - if not annotation_files: - raise ValueError("At least one annotation file must be provided.") - # Create DataFrame for combined annotations - combined_df = pd.DataFrame() - - for file in annotation_files: - try: - tag = os.path.basename(file).split(".")[0] # Use file name without extension as tag - df = pd.read_csv(file, sep="\t", header=None) # Load as DataFrame - df["source"] = tag # Add source column - combined_df = pd.concat([combined_df, df], ignore_index=True) - except: - continue - - # Convert combined DataFrame bed object - combined_bed = pybedtools.BedTool.from_dataframe(combined_df) - sorted_annotations = combined_bed.sort() - - insertions = pybedtools.BedTool(insertions_bed) - #bedtools closest operation - closest = insertions.closest(sorted_annotations, D="a", filenames=True) - - print(type(closest)) - print(closest) - # Convert BedTool output to DataFrame - closest_df = closest.to_dataframe( - names=[ - "InsertionChromosome", - "InsertionStart", - "InsertionEnd", - "InsertionRead", - "InsertionOrig", - "InsertionOrig2", - "InsertionStrand", - "AnnotationChromosome", - "AnnotationStart", - "AnnotationEnd", - "AnnotationID", - "AnnotationScore", - "AnnotationStrand", - "AnnotationSource", - "Distance", - ] - ) - - print(closest_df.head()) - closest_df = closest_df.drop(columns=["InsertionOrig","InsertionOrig2"]) - print(closest_df.head()) - # Save DataFrame to a file with headers - closest_df.to_csv(output_bed, sep="\t", index=False) - print(f"Distances calculated and saved to {output_bed}") - -@redirect_logging(logfile_param="logfile") -def plot_element_distance(bed, distances, distance_threshold, output_path, logfile): - """ - Uses the bed file from the distance calculations and provides a plot to visualize the respective elements with their distance. Entries that are further away than the defined threshold value are excluded. 
- """ - # Read the table - df = pd.read_csv( - bed, - sep='\t', - ) - - print(df.head()) - # Apply threshold filtering if provided - if distance_threshold is not None: - df = df[df['Distance'].abs() <= int(distance_threshold)] - - # Ensure absolute distance and sort by absolute distance within groups - df['abs_distance'] = df['Distance'].abs() - df = df.sort_values(by=['InsertionRead', 'abs_distance']).drop_duplicates(subset=['InsertionRead', 'AnnotationID', "AnnotationSource"], keep='first').reset_index() - - # Create scatter plot - sns.scatterplot( - data=df, - x='Distance', - y='AnnotationID', - hue='InsertionRead', - palette='tab10', - s=100, - style='AnnotationSource' - ) - - # Binned rugplot for distances - bin_size = 100 # Bin size grouping distances - df['distance_bin'] = (df['Distance'] // bin_size) * bin_size - sns.rugplot(x=df['distance_bin'], color='black', height=0.05, linewidth=1) - - # Configure plot aesthetics - plt.xticks(sorted({x for n in distances for x in (n, -n)}), rotation=45) - plt.xlabel("Distance (bp)") - plt.ylabel("Element Name") - plt.title("Distance Distribution to Elements") - sns.despine() - plt.legend(title="", fontsize=8) - - # Save plot - plt.savefig(output_path, dpi=300) - plt.close() - print(f"Plot saved to {output_path}") - -@redirect_logging(logfile_param="logfile") -def plot_element_distance_violin(bed, distances, distance_threshold, output_path, logfile): - # Read the table - df = pd.read_csv( - bed, - sep='\t', - ) - - # Apply threshold filtering if provided - if distance_threshold is not None: - df = df[df['Distance'].abs() <= int(distance_threshold)] - - # Create a count of how many times each source appears at each distance - distance_counts = df.groupby(['Distance', 'AnnotationSource', 'InsertionRead']).size().reset_index(name='count') - - # Create the bar plot - print(distance_counts.head()) - plt.figure(figsize=(10, 6)) - sns.displot( - data=distance_counts, - x='Distance', y='count', hue='InsertionRead', 
col='AnnotationSource', - palette='Set2' - ) - - # Customize the plot - plt.xticks(rotation=45) - plt.xlabel("Distance (bp)") - plt.ylabel("Count of Sources") - plt.title("Distribution of Sources at Different Distances") - sns.despine() - - # Save the plot - plt.tight_layout() # To ensure everything fits without overlap - plt.savefig(output_path, dpi=300) - plt.close() - - print(f"Plot saved to {output_path}") - - -@redirect_logging(logfile_param="logfile") -def scoring_insertions(data, output_plot, output_file, logfile): - """ - Uses custom conditions to visualize the entries of the annotated insertion summary table. - """ - colnames = names=[ - "InsertionChromosome", - "InsertionStart", - "InsertionEnd", - "InsertionRead", - "InsertionStrand", - "AnnotationChromosome", - "AnnotationStart", - "AnnotationEnd", - "AnnotationID", - "AnnotationScore", - "AnnotationStrand", - "AnnotationSource", - "Distance", - ] - df = pd.read_csv( - data, - sep='\t', - ) - - # Drop duplicate entries - df = df.drop_duplicates().reset_index(drop=True) - - # Only distance = 0 matters here - #df = df[df["Distance"] == 0] - - # Define conditions - conditions = [ - ("Cosmic", 0), ("TF", 0), ("Intron", 0), ("HiC", 0), ("Exon", 0), ("Promoter", 0) - ] - - for source, distance in conditions: - df[f"{source}_0"] = df["AnnotationSource"].str.contains(source) & (df["Distance"] == 0) - - # Aggregate data - heatmap_data = df.groupby(["InsertionRead", "InsertionChromosome", "InsertionStart", "InsertionEnd"]).agg("sum").reset_index() - - # Select numeric columns - heatmap_matrix = heatmap_data.drop(columns=colnames) - heatmap_matrix.index = heatmap_data["InsertionChromosome"] + "_" + \ - heatmap_data["InsertionStart"].astype(str) + "_" + \ - heatmap_data["InsertionEnd"].astype(str) - - - # Calculate Final Score - def calculate_score(row): - if row.loc["Intron_0"] < 1 and row.loc["Exon_0"] < 1 and row.loc["Promoter_0"] < 1: - if row.loc["TF_0"] < 1 and row.loc["HiC_0"] < 1: - return "0" - else: - 
return "1" - elif row.loc["Intron_0"] >= 1 and row.loc["Exon_0"] < 1: - if row.loc["TF_0"] > 1 or row.loc["HiC_0"] > 1: - if row.loc["Cosmic_0"] < 1: - return "2" - else: - return "4" - else: - if row.loc["Cosmic_0"] < 1: - return "1" - else: - return "4" - elif row.loc["Exon_0"] >= 1: - if row.loc["TF_0"] > 1 or row.loc["HiC_0"] > 1: - if row.loc["Cosmic_0"] < 1: - return "3" - else: - return "4" - else: - if row.loc["Cosmic_0"] < 1: - return "2" - else: - return "4" - elif row.loc["Promoter_0"] >= 1: - if row.loc["TF_0"] > 1 or row.loc["HiC_0"] > 1: - if row.loc["Cosmic_0"] < 1: - return "3" - else: - return "4" - else: - if row.loc["Cosmic_0"] < 1: - return "3" - else: - return "4" - else: - return "Undefined" - - heatmap_matrix["Risk"] = heatmap_matrix.apply(calculate_score, axis=1) - - # Map scores to colors - score_colors = { - "4": "red", - "3": "orange", - "2": "yellow", - "1": "lightgrey", - "0": "green" - } - row_colors = heatmap_matrix["Risk"].map(score_colors) - - # Prepare row color map - row_color_cmap = pd.DataFrame({ - "Risk": row_colors - }) - - #pretty names - heatmap_matrix.columns = ["Cancer gene", "TF", "Intron", "HiC", "Exon", "Promoter", "Risk"] - - # Plot clustermap with annotated row colors and custom colormap - cluster_grid = sns.clustermap( - heatmap_matrix.drop(columns=["Risk"]), - cmap="Greys", - row_colors=row_colors, - figsize=(10,10), - dendrogram_ratio=(0.1, 0.1), - linewidths=0.5, - linecolor="grey", - annot=True, - col_cluster=False, - row_cluster=False, - clip_on=False, - label=False, - cbar_kws={ - "label": "Counts", - "ticks": [0, 1, 2, 3, 4, 5], - "shrink": 0.5, - "orientation": "horizontal", - }, - vmin=0, vmax=5 - ) - - # Add a "5+" label to the last tick - cbar = cluster_grid.ax_cbar # Access the color bar - cbar.set_xticks([0, 1, 2, 3, 4, 5]) # Set tick positions - cbar.set_xticklabels(["0", "1", "2", "3", "4", "5+"]) - - - # Add a custom legend - legend_handles = [Patch(color=color, label=label) for label, color in 
score_colors.items()] - cluster_grid.ax_heatmap.legend( - handles=legend_handles, - title="Risk Assessment", - loc="upper center", - bbox_to_anchor=(0.5, 1.2), # Position the legend below the plot - ncol=5, # Number of columns in the legend - frameon=True - ) - - # Save the plot - plt.savefig(output_plot, format="svg", bbox_inches="tight") - - # Heatmap matrix with Final Scores - print(heatmap_matrix) - heatmap_matrix.to_csv(output_file, sep="\t") - # qc @redirect_logging(logfile_param="logfile") def join_read_mapq(file_list, prefixes, output_file, logfile): @@ -1104,3 +787,28 @@ def fragmentation_read_match_distribution(data, fragment_specifier, outpath, log outfile = outpath + str("/") + f'{fragment_specifier}_read_match_fragmentation_distribution.png' plt.savefig(outfile, bbox_inches='tight', dpi=600) plt.close() + +#cmod +@redirect_logging(logfile_param="logfile") +def get_inserted_fasta_seq(fasta, coordinates, output_file, logfile): + """ + Uses FASTA with insertion carrying reads and extracts actual insertion sequence. 
Used for subsequent multiple sequence alignment + """ + print(fasta) + print(coordinates) + print("files") + border_dict = json.load(open(coordinates)) + print(border_dict) + print(fasta) + with open(output_file, "w") as out_f: + with open(fasta, "r") as f: + fasta_dict = SeqIO.to_dict(SeqIO.parse(f, "fasta")) + # Create a new dictionary without the suffix + fasta_dict_no_suffix = {seq_id.split('_')[0]: record for seq_id, record in fasta_dict.items()} + for seq_id, borders in border_dict.items(): + if seq_id in fasta_dict_no_suffix: + for start, end in zip(borders[::2], borders[1::2]): + sequence = fasta_dict_no_suffix[seq_id].seq[start:end] + out_f.write(">"+str(seq_id)+"\n"+str(sequence) + "\n") + f.close() + out_f.close() diff --git a/workflow/scripts/VIS_plot_functional_genomics_helper_functions.py b/workflow/scripts/VIS_plot_functional_genomics_helper_functions.py new file mode 100644 index 0000000..6216545 --- /dev/null +++ b/workflow/scripts/VIS_plot_functional_genomics_helper_functions.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 + +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.patches import Patch + +#wrapper +from VIS_helper_functions import redirect_logging + +@redirect_logging(logfile_param="logfile") +def plot_element_distance(bed, distances, distance_threshold, output_path, logfile): + """ + Uses the bed file from the distance calculations and provides a plot to visualize the respective elements with their distance. Entries that are further away than the defined threshold value are excluded. 
+ """ + # Read the table + df = pd.read_csv( + bed, + sep='\t', + header=None + ) + + print(df.head()) + df.columns = [ + "InsertionChromosome", + "InsertionStart", + "InsertionEnd", + "InsertionRead", + "InsertionOrigStart", + "InsertionOrigEnd", + "InsertionStrand", + "AnnotationChromosome", + "AnnotationStart", + "AnnotationEnd", + "AnnotationID", + "AnnotationScore", + "AnnotationStrand", + "AnnotationSource", + "Distance" + ] + + print(df.head()) + # Apply threshold filtering if provided + if distance_threshold is not None: + df = df[df['Distance'].abs() <= int(distance_threshold)] + + # Ensure absolute distance and sort by absolute distance within groups + df['abs_distance'] = df['Distance'].abs() + df = df.sort_values(by=['InsertionRead', 'abs_distance']).drop_duplicates(subset=['InsertionRead', 'AnnotationID', "AnnotationSource"], keep='first').reset_index() + + # Create scatter plot + sns.scatterplot( + data=df, + x='Distance', + y='AnnotationID', + hue='InsertionRead', + palette='tab10', + s=100, + style='AnnotationSource' + ) + + # Binned rugplot for distances + bin_size = 100 # Bin size grouping distances + df['distance_bin'] = (df['Distance'] // bin_size) * bin_size + sns.rugplot(x=df['distance_bin'], color='black', height=0.05, linewidth=1) + + # Configure plot aesthetics + plt.xticks(sorted({x for n in distances for x in (n, -n)}), rotation=45) + plt.xlabel("Distance (bp)") + plt.ylabel("Element Name") + plt.title("Distance Distribution to Elements") + sns.despine() + plt.legend(title="", fontsize=8) + + # Save plot + plt.savefig(output_path, dpi=300) + plt.close() + print(f"Plot saved to {output_path}") + +@redirect_logging(logfile_param="logfile") +def plot_element_distance_violin(bed, distances, distance_threshold, output_path, logfile): + # Read the table + df = pd.read_csv( + bed, + sep='\t', header=None, + ) + + print(df.head()) + df.columns = [ + "InsertionChromosome", + "InsertionStart", + "InsertionEnd", + "InsertionRead", + "InsertionOrigStart", + 
"InsertionOrigEnd", + "InsertionStrand", + "AnnotationChromosome", + "AnnotationStart", + "AnnotationEnd", + "AnnotationID", + "AnnotationScore", + "AnnotationStrand", + "AnnotationSource", + "Distance" + ] + + # Apply threshold filtering if provided + if distance_threshold is not None: + df = df[df['Distance'].abs() <= int(distance_threshold)] + + # Create a count of how many times each source appears at each distance + distance_counts = df.groupby(['Distance', 'AnnotationSource', 'InsertionRead']).size().reset_index(name='count') + + # Create the bar plot + print(distance_counts.head()) + plt.figure(figsize=(10, 6)) + sns.displot( + data=distance_counts, + x='Distance', y='count', hue='InsertionRead', col='AnnotationSource', + palette='Set2' + ) + + # Customize the plot + plt.xticks(rotation=45) + plt.xlabel("Distance (bp)") + plt.ylabel("Count of Sources") + plt.title("Distribution of Sources at Different Distances") + sns.despine() + + # Save the plot + plt.tight_layout() # To ensure everything fits without overlap + plt.savefig(output_path, dpi=300) + plt.close() + + print(f"Plot saved to {output_path}") + + +@redirect_logging(logfile_param="logfile") +def scoring_insertions(data, output_plot, output_file, logfile): + """ + Uses custom conditions to visualize the entries of the annotated insertion summary table. 
+ """ + colnames =[ + "InsertionChromosome", + "InsertionStart", + "InsertionEnd", + "InsertionRead", + "InsertionOrigStart", + "InsertionOrigEnd", + "InsertionStrand", + "AnnotationChromosome", + "AnnotationStart", + "AnnotationEnd", + "AnnotationID", + "AnnotationScore", + "AnnotationStrand", + "AnnotationSource", + "Distance" + ] + + df = pd.read_csv( + data, + sep='\t', + names=colnames + ) + # Drop duplicate entries + df = df.drop_duplicates().reset_index(drop=True) + + # Only distance = 0 matters here + #df = df[df["Distance"] == 0] + + # Define conditions + conditions = [ + ("cosmic", 0), ("tf", 0), ("intron", 0), ("hic", 0), ("exon", 0), ("promoter", 0) + ] + + for source, distance in conditions: + df[f"{source}_0"] = df["AnnotationSource"].str.contains(source) & (df["Distance"] == 0) + + # Aggregate data + heatmap_data = df.groupby(["InsertionRead", "InsertionChromosome", "InsertionStart", "InsertionEnd"]).agg("sum").reset_index() + + print(heatmap_data) + # Select numeric columns + heatmap_matrix = heatmap_data.drop(columns=colnames) + heatmap_matrix.index = heatmap_data["InsertionChromosome"] + "_" + \ + heatmap_data["InsertionStart"].astype(str) + "_" + \ + heatmap_data["InsertionEnd"].astype(str) + + + # Calculate Final Score + def calculate_score(row): + if row.loc["intron_0"] < 1 and row.loc["exon_0"] < 1 and row.loc["promoter_0"] < 1: + if row.loc["tf_0"] < 1 and row.loc["hic_0"] < 1: + return "0" + else: + return "1" + elif row.loc["intron_0"] >= 1 and row.loc["exon_0"] < 1: + if row.loc["tf_0"] >= 1 or row.loc["hic_0"] >= 1: + if row.loc["cosmic_0"] < 1: + return "2" + else: + return "4" + else: + if row.loc["cosmic_0"] < 1: + return "1" + else: + return "4" + elif row.loc["exon_0"] >= 1: + if row.loc["tf_0"] > 1 or row.loc["hic_0"] > 1: + if row.loc["cosmic_0"] < 1: + return "3" + else: + return "4" + else: + if row.loc["cosmic_0"] < 1: + return "2" + else: + return "4" + elif row.loc["promoter_0"] >= 1: + if row.loc["tf_0"] > 1 or 
row.loc["hic_0"] > 1: + if row.loc["cosmic_0"] < 1: + return "3" + else: + return "4" + else: + if row.loc["cosmic_0"] < 1: + return "3" + else: + return "4" + else: + return "undefined" + + heatmap_matrix["Risk"] = heatmap_matrix.apply(calculate_score, axis=1) + + # Map scores to colors + score_colors = { + "4": "red", + "3": "orange", + "2": "yellow", + "1": "lightgrey", + "0": "green" + } + row_colors = heatmap_matrix["Risk"].map(score_colors) + + # Prepare row color map + row_color_cmap = pd.DataFrame({ + "Risk": row_colors + }) + + #pretty names + heatmap_matrix.columns = ["Cancer gene", "TF", "Intron", "HiC", "Exon", "Promoter", "Risk"] + + # Plot clustermap with annotated row colors and custom colormap + cluster_grid = sns.clustermap( + heatmap_matrix.drop(columns=["Risk"]), + cmap="Greys", + row_colors=row_colors, + figsize=(10,10), + dendrogram_ratio=(0.1, 0.1), + linewidths=0.5, + linecolor="grey", + annot=True, + col_cluster=False, + row_cluster=False, + clip_on=False, + label=False, + cbar_kws={ + "label": "Counts", + "ticks": [0, 1, 2, 3, 4, 5], + "shrink": 0.5, + "orientation": "horizontal", + }, + vmin=0, vmax=5 + ) + + # Add a "5+" label to the last tick + cbar = cluster_grid.ax_cbar # Access the color bar + cbar.set_xticks([0, 1, 2, 3, 4, 5]) # Set tick positions + cbar.set_xticklabels(["0", "1", "2", "3", "4", "5+"]) + + + # Add a custom legend + legend_handles = [Patch(color=color, label=label) for label, color in score_colors.items()] + cluster_grid.ax_heatmap.legend( + handles=legend_handles, + title="Risk Assessment", + loc="upper center", + bbox_to_anchor=(0.5, 1.2), # Position the legend below the plot + ncol=5, # Number of columns in the legend + frameon=True + ) + + # Save the plot + plt.savefig(output_plot, format="svg", bbox_inches="tight") + + # Heatmap matrix with Final Scores + print(heatmap_matrix) + heatmap_matrix.to_csv(output_file, sep="\t")