Skip to content

Commit f44dc7b

Browse files
ajlee21dongbohuJake Crawford
authored
Add enrichment analyses (#54)
* create new directory for enrichment analysis * move files * update env to include gsva * add GSVA method * update env to run ROAST * update scripts and notebooks to run ROAST * update scripts to use multiple enrichment methods and update test notebook * fix error in test * fix assert statments * fix file path for test * update gsa param based on updated scripts * add CAMERA method and start formating * Update README.md (#56) * add clusterProfiler to env * update function and nb to run ORA * update comment about ORA output * Compare generic genes across two template experiments (#53) * update recount2 volcano plot * add new template files for each recount2 template id * add new notebook to compare trends generated using two different template experiments * update figure after rerun simulation * clean up notebook comments * update fig generation based on comments * update comment about custom legend in plotting * update notebook based on PR * update enrichment scripts that were causing output errors * update analysis notebook * format enrichment outputs * plot summary ranking trend * run enrichment analyses and add result files * update comments * update comments about takeaway and methods * Update generic_expression_patterns_modules/ranking.py Co-authored-by: Jake Crawford <kitttttens@yahoo.com> * add description to enrichment method functions * update notebooks based on comments Co-authored-by: dongbohu <dongbo.hu@gmail.com> Co-authored-by: Jake Crawford <kitttttens@yahoo.com>
1 parent c7f8e8e commit f44dc7b

30 files changed

Lines changed: 13996 additions & 339 deletions

configs/config_human_general.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ reference_rank_col "DE_Prior_Rank"
1414
rank_genes_by "log2FoldChange"
1515
pathway_DB_filename "/home/alexandra/Documents/Data/Generic_expression_patterns/hallmark_DB.gmt"
1616
gsea_statistic 'log2FoldChange'
17-
rank_pathways_by "padj"
17+
rank_pathways_by "FDR"
1818
NN_architecture "NN_2500_30"
1919
learning_rate 0.001
2020
batch_size 10

environment.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ dependencies:
1212
- bioconda::bioconductor-deseq2
1313
- bioconda::bioconductor-recount
1414
- bioconda::bioconductor-fgsea
15+
- bioconda::bioconductor-gsva
16+
- bioconda::bioconductor-DEFormats
17+
- bioconda::bioconductor-clusterprofiler
1518
- conda-forge::python=3.7
1619
- conda-forge::scikit-learn
1720
- conda-forge::pandas=0.23.4

generic_expression_patterns_modules/DE_analysis.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Functions to perform DE analysis using
2+
# Limma (microarray) and DESeq2 (RNA-seq)
3+
14
# Load libraries
25
library("limma")
36
library("DESeq2")

generic_expression_patterns_modules/GSEA_analysis.R

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# GSEA function
2+
13
# Load libraries
24
library("GSA")
35
library("fgsea")
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
# Alternative enrichment analyses to GSEA
2+
# These are methods were implemented, expecting
3+
# RNA-seq input. These will need to be adjusted if
4+
# using microarray data
5+
6+
# Load libraries
7+
library("fgsea")
8+
library("limma")
9+
library("GSVA")
10+
library("DEFormats")
11+
library("edgeR")
12+
library("DESeq2")
13+
library("clusterProfiler")
14+
15+
find_enriched_pathways_ROAST <- function(expression_filename,
16+
metadata_filename,
17+
pathway_DB_filename) {
18+
19+
# ---------------------------------------------------------
20+
# ROAST (rotation gene set tests) performs a focused gene set
21+
# testing, in which interest focuses on a few gene sets as opposed
22+
# to a large dataset. (available in limma).
23+
# * Self contained gene set test
24+
# * Instead of permutations they use rotations
25+
# (i.e. fractional permutation) in order to allow for more
26+
# complex experimental designs than binary experiments
27+
# (i.e. time-course, more than 2 groups)
28+
# * Also esimates correlation between genes
29+
# * Best power to detect modest fold changes with minority of genes changed
30+
# (https://pubmed.ncbi.nlm.nih.gov/20610611/)
31+
# ---------------------------------------------------------
32+
33+
# Read data
34+
expression_data <- t(as.matrix(read.csv(expression_filename, sep="\t", header=TRUE, row.names=1)))
35+
metadata <- as.matrix(read.csv(metadata_filename, sep="\t", header=TRUE, row.names=1))
36+
pathway_DB_data <- gmtPathways(pathway_DB_filename)
37+
38+
group <- interaction(metadata[,1])
39+
40+
# Create DEGList based on counts
41+
#dge = DGEList(expression_data, group=group)
42+
43+
# Design matrix
44+
design <- model.matrix(~0 + group)
45+
46+
expression_data_voom <- voom(expression_data, design)
47+
#print(expression_data_voom)
48+
49+
# Estimate dispersions
50+
#y <- estimateDisp(dge, design)
51+
52+
# Format index
53+
gene_ids <- row.names(expression_data_voom)
54+
pathway_ind <- ids2indices(pathway_DB_data, gene_ids)
55+
56+
print("Checking sample ordering...")
57+
print(all.equal(colnames(expression_data_voom), rownames(metadata)))
58+
59+
# Call ROAST
60+
enrich_pathways <- mroast(expression_data_voom, index=pathway_ind, design, contrast=ncol(design), nrot=10000, adjust.method="BH")
61+
62+
return(as.data.frame(enrich_pathways))
63+
}
64+
65+
find_enriched_pathways_CAMERA <- function(expression_filename,
66+
metadata_filename,
67+
pathway_DB_filename) {
68+
69+
# ---------------------------------------------------------
70+
# CAMERA (Correlation Adjusted MEan RAnk gene set test)
71+
# is based on the idea of estimating the variance inflation
72+
# factor associated with inter-gene correlation, and
73+
# incorporating this into parametric or rank-based
74+
# test procedures. (available in limma)
75+
# * Competitive gene set test
76+
# * Performs the same rank-based test procedure as GSEA,
77+
# but also estimates the correlation between genes,
78+
# instead of treating genes as independent
79+
# * Recall GSEA: 1) Rank all genes using DE association statistics.
80+
# 2) An enrichment score (ES) is defined as the maximum distance
81+
# from the middle of the ranked list. Thus, the enrichment score
82+
# indicates whether the genes contained in a gene set are clustered
83+
# towards the beginning or the end of the ranked list
84+
# (indicating a correlation with change in expression).
85+
# 3) Estimate the statistical significance of the ES by a
86+
# phenotypic-based permutation (permute samples assigned to label)
87+
# test in order to produce a null distribution for the ES
88+
# (i.e. scores based on permuted phenotype)
89+
# * Appropriate for small and large fold changes
90+
# (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3458527/)
91+
# ---------------------------------------------------------
92+
93+
# Read data
94+
expression_data <- t(as.matrix(read.csv(expression_filename, sep="\t", header=TRUE, row.names=1)))
95+
metadata <- as.matrix(read.csv(metadata_filename, sep="\t", header=TRUE, row.names=1))
96+
pathway_DB_data <- gmtPathways(pathway_DB_filename)
97+
98+
print("Checking sample ordering...")
99+
print(all.equal(colnames(expression_data), rownames(metadata)))
100+
101+
group <- interaction(metadata[,1])
102+
103+
# Create DEGList based on counts
104+
dge = DGEList(expression_data, group=group)
105+
106+
# Design matrix
107+
design <- model.matrix(~0 + group)
108+
109+
# Estimate dispersions
110+
y <- estimateDisp(dge, design)
111+
112+
# Call CAMERA
113+
enrich_pathways <- camera(y, pathway_DB_data, design, contrast=ncol(design), nrot=10000)
114+
115+
return(as.data.frame(enrich_pathways))
116+
}
117+
118+
find_enriched_pathways_GSVA <- function(expression_filename,
119+
pathway_DB_filename) {
120+
121+
# ---------------------------------------------------------
122+
# GSVA(Gene Set Variation Analysis) calculates sample-wise
123+
# gene set enrichment scores as a function of genes inside
124+
# and outside the gene set. This method is well-suited for
125+
# assessing gene set variation across a dichotomous phenotype.
126+
# (biocontuctor package GSVA)
127+
# * Competitive gene set test
128+
# * Estimates variation of gene set enrichment over the samples
129+
# independently of any class label
130+
# (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3618321/)
131+
# ---------------------------------------------------------
132+
133+
# Read in expression data
134+
# Transpose to gene x sample
135+
expression_data <- t(read.table(expression_filename,
136+
sep = "\t",
137+
header = TRUE,
138+
row.names = NULL))
139+
pathway_DB_data <- gmtPathways(pathway_DB_filename)
140+
141+
enrich_pathways <- gsva(expression_data,
142+
pathway_DB_data,
143+
kcdf="Poisson",
144+
parallel.sz=1,
145+
verbose=TRUE
146+
)
147+
return(as.data.frame(enrich_pathways))
148+
}
149+
150+
find_enriched_pathways_ORA <- function(expression_filename,
151+
metadata_filename,
152+
pathway_DB_filename
153+
) {
154+
# ---------------------------------------------------------
155+
# ORA (over-representation analysis) uses the hypergeometric
156+
# test to determine if there a significant over-representation
157+
# of pathway in the selected set of DEGs. Here we're using
158+
# clusterProfiler library but there are multiple options for this analysis.
159+
# (https://www.rdocumentation.org/packages/clusterProfiler/versions/3.0.4/topics/enricher)
160+
# ---------------------------------------------------------
161+
162+
# Read data
163+
expression_data <- t(as.matrix(read.csv(expression_filename, sep="\t", header=TRUE, row.names=1)))
164+
metadata <- as.matrix(read.csv(metadata_filename, sep="\t", header=TRUE, row.names=1))
165+
166+
print("Checking sample ordering...")
167+
print(all.equal(colnames(expression_data), rownames(metadata)))
168+
169+
group <- interaction(metadata[,1])
170+
171+
mm <- model.matrix(~0 + group)
172+
173+
ddset <- DESeqDataSetFromMatrix(expression_data, colData=metadata, design = ~group)
174+
175+
deseq_object <- DESeq(ddset)
176+
177+
# Note parameter settings:
178+
# `independentFilter=False`: We have turned off the automatic filtering, which
179+
# filter filter out those tests from the procedure that have no, or little
180+
# chance of showing significant evidence, without even looking at their test statistic.
181+
# Typically, this results in increased detection power at the same experiment-wide
182+
# type I error, as measured in terms of the false discovery rate.
183+
# cooksCutoff=True (default): Cook's distance as a diagnostic to tell if a single sample
184+
# has a count which has a disproportionate impact on the log fold change and p-values.
185+
# These genes are flagged with an NA in the pvalue and padj columns
186+
deseq_results <- results(deseq_object, independentFiltering=FALSE)
187+
188+
deseq_results_df <- as.data.frame(deseq_results)
189+
190+
# Get DEGs
191+
threshold=0.05
192+
backgrd_genes <- row.names(deseq_results_df)
193+
degs <- deseq_results_df[deseq_results_df[,'padj']<threshold & abs(deseq_results_df[,'log2FoldChange'])>1,]
194+
degs_name <- row.names(degs)
195+
196+
# Get over-represented pathways
197+
pathway_DB_data <- read.gmt(pathway_DB_filename)
198+
199+
enrich_pathways <- enricher(degs_name,
200+
universe=backgrd_genes,
201+
pvalueCutoff=0.05,
202+
pAdjustMethod="BH",
203+
TERM2GENE=pathway_DB_data[, c("ont", "gene")]
204+
)
205+
return(as.data.frame(summary(enrich_pathways)))
206+
}

0 commit comments

Comments
 (0)