-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript7_annotateFeatures.r
More file actions
2928 lines (2327 loc) · 125 KB
/
script7_annotateFeatures.r
File metadata and controls
2928 lines (2327 loc) · 125 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#####======================================================================#####
### Create feature annotation object(s) for 450K platform
#####======================================================================#####
##Author: Mattias Aine (mattias.aine@med.lu.se)
##Affiliation: Lund University / Oncoloy and Pathology
################################################################################
##Set home directory
##set/create own home directory below:
##work
HOME<-"~/hdd1/tcgaBrca"
MANIFEST<-"~/Documents/tcgaBrca/manifest"
##home
HOME<-"I:/data/tcgaBrca"
MANIFEST<-"F:/gitProjects/tcgaBrca/manifest"
##tumor type
TUMOR_TYPE<-"brca"
list.files(MANIFEST,full.names=T)
#[1] "/home/med-mai/Documents/tcgaBrca/manifest/atac"
#[2] "/home/med-mai/Documents/tcgaBrca/manifest/brca"
#[5] "/home/med-mai/Documents/tcgaBrca/manifest/pancan"
##create data directories
if( !file.exists( paste0(HOME,"/","annotateFeatures") ) ) {
dir.create(paste0(HOME,"/","annotateFeatures"),recursive=TRUE)
}
################################################################################
##load required packages
if(!requireNamespace("RColorBrewer", quietly = TRUE)) {
install.packages("RColorBrewer") }
library("RColorBrewer")
if(!requireNamespace("pheatmap", quietly = TRUE)) {
install.packages("pheatmap") }
library("pheatmap")
if(!requireNamespace("dplyr", quietly = TRUE)) {
install.packages("dplyr") }
library("dplyr")
if (!requireNamespace("GenomicRanges", quietly = TRUE)) {
BiocManager::install("GenomicRanges") }
library("GenomicRanges")
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
BiocManager::install("org.Hs.eg.db") }
library("org.Hs.eg.db")
writeLines(apply(metadata(org.Hs.eg.db),1,paste,collapse=": "),con=paste0(HOME,"/annotateFeatures/org.Hs.eg.db_versionInfo.txt"))
if (!requireNamespace("TxDb.Hsapiens.UCSC.hg38.knownGene", quietly = TRUE)) {
BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene") }
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
writeLines(apply(metadata(TxDb.Hsapiens.UCSC.hg38.knownGene),1,paste,collapse=": "),con=paste0(HOME,"/annotateFeatures/TxDb.Hsapiens.UCSC.hg38.knownGene_versionInfo.txt"))
##md5
if(!requireNamespace("tools", quietly = TRUE)) {
install.packages("tools") }
library(tools)
if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly = TRUE)) {
BiocManager::install("BSgenome.Hsapiens.UCSC.hg38") }
library("BSgenome.Hsapiens.UCSC.hg38")
sink(file=paste0(HOME,"/annotateFeatures/BSgenome.Hsapiens.UCSC.hg38_versionInfo.txt"))
BSgenome.Hsapiens.UCSC.hg38
sink()
if (!requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings") }
library("Biostrings")
################################################################################
##Get core tumor set from methylation data
load(file=paste0(HOME,"/","workspace_blacklistFiltered_atacCnGexMeWes_withSampleAnnotations.RData"))
ls()
# [1] "betaOrig" "dataAtac" "dataCn" "dataMut" "dataSeg"
# [6] "gexCounts" "gexFpkm" "gexUq" "HOME" "MANIFEST"
# [11] "sampleAnno" "TUMOR_TYPE"
#object - "betaAdj" - purity adjusted betas
load(file=paste0(HOME,"/me/data450k_",nrow(betaOrig),"x",ncol(betaOrig),"_minfiNormalized_ringnerAdjusted_purityAdjusted_tumorBetaValues.RData"))
str(betaAdj)
# num [1:421368, 1:630] 0.742 0.756 0.118 0.967 0.904 0.7 0.966 1 0.985 1 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:421368] "cg21870274" "cg08258224" "cg16619049" "cg18147296" ...
# ..$ : chr [1:630] "TCGA-3C-AAAU-01A" "TCGA-3C-AALI-01A" "TCGA-3C-AALJ-01A" "TCGA-3C-AALK-01A" ...
#object - "betaNorm" - inferred normal methylation
load(file=paste0(HOME,"/me/data450k_",nrow(betaOrig),"x",ncol(betaOrig),"_minfiNormalized_ringnerAdjusted_purityAdjusted_normalBetaValues.RData"))
str(betaNorm)
# num [1:421368, 1:630] 0.584 0.824 0.59 0.686 0.662 0.313 0.861 0.997 0.992 0.996 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:421368] "cg21870274" "cg08258224" "cg16619049" "cg18147296" ...
# ..$ : chr [1:630] "TCGA-3C-AAAU-01A" "TCGA-3C-AALI-01A" "TCGA-3C-AALJ-01A" "TCGA-3C-AALK-01A" ...
##get probesKeep
load(file=paste0(HOME,"/me/object_450k_probesKeep.RData"))
################################################################################
###Get transcript/gene data for TCGA GEX
str(gexFpkm)
# num [1:60483, 1:630] 0 0 3.68 0 1.83 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:60483] "ENSG00000242268.2" "ENSG00000270112.3" "ENSG00000167578.15" "ENSG00000273842.1" ...
# ..$ : chr [1:630] "TCGA-3C-AAAU-01A" "TCGA-3C-AALI-01A" "TCGA-3C-AALJ-01A" "TCGA-3C-AALK-01A" ...
identical(rownames(gexFpkm),rownames(gexCounts))
#[1] TRUE
identical(rownames(gexFpkm),rownames(gexUq))
#[1] TRUE
##https://gdc.cancer.gov/about-data/gdc-data-processing/gdc-reference-files
##Use GDC files - GDC.h38 GENCODE v22 GTF (used in RNA-Seq alignment and by HTSeq)
# tmp<-tempfile()
# download.file("https://api.gdc.cancer.gov/data/25aa497c-e615-4cb7-8751-71f744f9691f",tmp)
#as.vector(md5sum(tmp))
# genes.gtf<-readLines(tmp)
##Use GDC files - GDC.h38 GENCODE TSV
tmp<-tempfile()
download.file("https://api.gdc.cancer.gov/data/b011ee3e-14d8-4a97-aed4-e0b10f6bbe82",tmp)
as.vector(md5sum(tmp))
#[1] "3c1b05ce5c978370ed5e1c2f074f44b0"
genes.csv<-read.table(tmp,header=T,sep="\t",as.is=TRUE)
str(genes.csv)
# 'data.frame': 60483 obs. of 12 variables:
# $ gene_id : chr "ENSG00000223972.5" "ENSG00000238009.5" "ENSG00000230415.1" "ENSG00000236335.1" ...
# $ gene_name : chr "DDX11L1" "RP11-34P13.7" "RP5-902P8.10" "RP4-591L5.1" ...
# $ seqname : chr "chr1" "chr1" "chr1" "chr1" ...
# $ start : int 11869 89295 1275223 30409560 32752910 32785646 32818018 32951574 32996608 33033863 ...
# $ end : int 14409 133723 1280420 30411638 32753901 32786116 32897826 32956349 33097230 33034810 ...
# $ strand : chr "+" "-" "+" "-" ...
# $ gene_type : chr "transcribed_unprocessed_pseudogene" "lincRNA" "lincRNA" "lincRNA" ...
# $ gene_status: chr "KNOWN" "NOVEL" "NOVEL" "NOVEL" ...
# $ havana_gene: chr "OTTHUMG00000000961.2" "OTTHUMG00000001096.2" "OTTHUMG00000002234.2" "OTTHUMG00000003682.1" ...
# $ full_length: int 2541 44429 5198 2079 992 471 79809 4776 100623 948 ...
# $ exon_length: int 1735 3726 513 507 992 471 8685 3095 4364 948 ...
# $ exon_num : int 9 17 5 3 1 1 4 2 123 1 ...
length(intersect(rownames(gexFpkm),genes.csv$gene_id))
#[1] 60483
all(genes.csv$gene_id %in% rownames(gexFpkm) )
#[1] TRUE
all(rownames(gexFpkm) %in% genes.csv$gene_id )
#[1] TRUE
length(unique(genes.csv$gene_id))
#[1] 60483
length(unique(sub("\\..*","",genes.csv$gene_id)))
#[1] 60483
head(unique(sub("\\..*","",genes.csv$gene_id)))
#[1] "ENSG00000223972" "ENSG00000238009" "ENSG00000230415" "ENSG00000236335" "ENSG00000213842" "ENSG00000227337"
##every ensg is unique
rownames(genes.csv)<-sub("\\..*","",genes.csv$gene_id)
genes.csv$gene_id2<-sub("\\..*","",genes.csv$gene_id)
length(unique(genes.csv$gene_name))
#[1] 58387
table(genes.csv$gene_type)
# 3prime_overlapping_ncrna antisense IG_C_gene IG_C_pseudogene
# 29 5565 14 9
# IG_D_gene IG_J_gene IG_J_pseudogene IG_V_gene
# 37 18 3 142
# IG_V_pseudogene lincRNA macro_lncRNA miRNA
# 180 7656 1 4093
# misc_RNA Mt_rRNA Mt_tRNA non_coding
# 2298 2 22 3
# polymorphic_pseudogene processed_pseudogene processed_transcript protein_coding
# 59 10304 484 19814
# pseudogene ribozyme rRNA scaRNA
# 36 8 544 49
# sense_intronic sense_overlapping snoRNA snRNA
# 920 197 961 1896
# sRNA TEC TR_C_gene TR_D_gene
# 20 1045 5 3
# TR_J_gene TR_J_pseudogene TR_V_gene TR_V_pseudogene
# 73 4 106 30
# transcribed_processed_pseudogene transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene translated_processed_pseudogene
# 443 1 663 1
# translated_unprocessed_pseudogene unitary_pseudogene unprocessed_pseudogene vaultRNA
# 1 169 2574 1
table(genes.csv$gene_status)
# KNOWN NOVEL PUTATIVE
# 41346 19008 129
table(genes.csv$gene_type,genes.csv$gene_status)
# KNOWN NOVEL PUTATIVE
# 3prime_overlapping_ncrna 1 28 0
# antisense 244 5321 0
# IG_C_gene 14 0 0
# IG_C_pseudogene 9 0 0
# IG_D_gene 37 0 0
# IG_J_gene 18 0 0
# IG_J_pseudogene 3 0 0
# IG_V_gene 137 5 0
# IG_V_pseudogene 180 0 0
# lincRNA 413 7231 12
# macro_lncRNA 1 0 0
# miRNA 1707 2386 0
# misc_RNA 948 1350 0
# Mt_rRNA 2 0 0
# Mt_tRNA 22 0 0
# non_coding 3 0 0
# polymorphic_pseudogene 59 0 0
# processed_pseudogene 10302 2 0
# processed_transcript 66 415 3
# protein_coding 19308 392 114
# pseudogene 6 30 0
# ribozyme 3 5 0
# rRNA 516 28 0
# scaRNA 18 31 0
# sense_intronic 25 895 0
# sense_overlapping 7 190 0
# snoRNA 382 579 0
# snRNA 1798 98 0
# sRNA 0 20 0
# TEC 1045 0 0
# TR_C_gene 5 0 0
# TR_D_gene 3 0 0
# TR_J_gene 72 1 0
# TR_J_pseudogene 4 0 0
# TR_V_gene 105 1 0
# TR_V_pseudogene 30 0 0
# transcribed_processed_pseudogene 443 0 0
# transcribed_unitary_pseudogene 1 0 0
# transcribed_unprocessed_pseudogene 663 0 0
# translated_processed_pseudogene 1 0 0
# translated_unprocessed_pseudogene 1 0 0
# unitary_pseudogene 169 0 0
# unprocessed_pseudogene 2574 0 0
# vaultRNA 1 0 0
##use all - can select subsets for analysis if needed
##add ENTREZ as well for those with available
##With entrez and using fpkm-rownames
db<-org.Hs.eg.db
id<-sub("\\..*","",rownames(gexFpkm))
head(id)
# [1] "ENSG00000242268" "ENSG00000270112" "ENSG00000167578" "ENSG00000273842"
# [5] "ENSG00000078237" "ENSG00000146083"
keytypes(org.Hs.eg.db)
# [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
# [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
# [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
# [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
# [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG" "UNIGENE"
# [26] "UNIPROT"
#sym<-mapIds(db, keys=id, column=c("SYMBOL"), keytype="ENSEMBL",multiVals="first")
sym<-mapIds(db, keys=id, column=c("ENTREZID"), keytype="ENSEMBL",multiVals="first")
str(sym)
# Named chr [1:60483] "100507661" NA "53916" NA "57103" "22838" NA "55567" ...
# - attr(*, "names")= chr [1:60483] "ENSG00000242268" "ENSG00000270112" "ENSG00000167578" "ENSG00000273842" ...
length(sym)
#[1] 60483
length(unique(sym))
#[1] 25555
table(is.na(sym))
# FALSE TRUE
# 25629 34854
length(unique(names(sym)))
#[1] 60483
sym[is.na(sym)]<-""
table(sym=="")
# FALSE TRUE
# 25629 34854
all(names(sym) %in% genes.csv$gene_id2)
#[1] TRUE
all(genes.csv$gene_id2 %in% names(sym))
#[1] TRUE
length(intersect(genes.csv$gene_id2 , names(sym))) == length(sym)
#[1] TRUE
sym<-sym[genes.csv$gene_id2]
all(names(sym) == genes.csv$gene_id2)
#[1] TRUE
genes.csv$orgHs_entrez<-as.vector(sym)
##Add orgHs - symbol
sym<-mapIds(db, keys=id, column=c("SYMBOL"), keytype="ENSEMBL",multiVals="first")
str(sym)
# Named chr [1:60483] "LINC02082" NA "RAB4B" NA "TIGAR" "RNF44" NA "DNAH3" "RPL23A" "EHD4-AS1" NA NA NA "ARL8B" NA "CALB2" "MFSD3" NA "LINC00636" "PIGV" ...
# - attr(*, "names")= chr [1:60483] "ENSG00000242268" "ENSG00000270112" "ENSG00000167578" "ENSG00000273842" ...
length(sym)
#[1] 60483
length(unique(sym))
#[1] 25555
table(is.na(sym))
# FALSE TRUE
# 25629 34854
length(unique(names(sym)))
#[1] 60483
sym[is.na(sym)]<-""
table(sym=="")
# FALSE TRUE
# 25629 34854
all(names(sym) %in% genes.csv$gene_id2)
#[1] TRUE
all(genes.csv$gene_id2 %in% names(sym))
#[1] TRUE
length(intersect(genes.csv$gene_id2 , names(sym))) == length(sym)
#[1] TRUE
sym<-sym[genes.csv$gene_id2]
all(names(sym) == genes.csv$gene_id2)
#[1] TRUE
genes.csv$orgHs_symbol<-as.vector(sym)
table(genes.csv$gene_name==genes.csv$orgHs_symbol)
# FALSE TRUE
# 38377 22106
table(genes.csv$orgHs_entrez!="",genes.csv$orgHs_symbol!="")
# FALSE TRUE
# FALSE 34854 0
# TRUE 0 25629
table(genes.csv$orgHs_entrez!="",genes.csv$gene_status)
# KNOWN NOVEL PUTATIVE
# FALSE 19313 15440 101
# TRUE 22033 3568 28
table(genes.csv$gene_type,genes.csv$orgHs_entrez!="")
# FALSE TRUE
# 3prime_overlapping_ncrna 28 1
# antisense 4137 1428
# IG_C_gene 14 0
# IG_C_pseudogene 9 0
# IG_D_gene 37 0
# IG_J_gene 18 0
# IG_J_pseudogene 3 0
# IG_V_gene 142 0
# IG_V_pseudogene 180 0
# lincRNA 5500 2156
# macro_lncRNA 1 0
# miRNA 2675 1418
# misc_RNA 2286 12
# Mt_rRNA 2 0
# Mt_tRNA 21 1
# non_coding 3 0
# polymorphic_pseudogene 14 45
# processed_pseudogene 10203 101
# processed_transcript 257 227
# protein_coding 706 19108
# pseudogene 36 0
# ribozyme 7 1
# rRNA 524 20
# scaRNA 31 18
# sense_intronic 842 78
# sense_overlapping 172 25
# snoRNA 540 421
# snRNA 1853 43
# sRNA 19 1
# TEC 1022 23
# TR_C_gene 4 1
# TR_D_gene 3 0
# TR_J_gene 73 0
# TR_J_pseudogene 4 0
# TR_V_gene 106 0
# TR_V_pseudogene 30 0
# transcribed_processed_pseudogene 350 93
# transcribed_unitary_pseudogene 0 1
# transcribed_unprocessed_pseudogene 362 301
# translated_processed_pseudogene 1 0
# translated_unprocessed_pseudogene 0 1
# unitary_pseudogene 118 51
# unprocessed_pseudogene 2520 54
# vaultRNA 1 0
str(genes.csv)
# 'data.frame': 60483 obs. of 15 variables:
# $ gene_id : chr "ENSG00000223972.5" "ENSG00000238009.5" "ENSG00000230415.1" "ENSG00000236335.1" ...
# $ gene_name : chr "DDX11L1" "RP11-34P13.7" "RP5-902P8.10" "RP4-591L5.1" ...
# $ seqname : chr "chr1" "chr1" "chr1" "chr1" ...
# $ start : int 11869 89295 1275223 30409560 32752910 32785646 32818018 32951574 32996608 33033863 ...
# $ end : int 14409 133723 1280420 30411638 32753901 32786116 32897826 32956349 33097230 33034810 ...
# $ strand : chr "+" "-" "+" "-" ...
# $ gene_type : chr "transcribed_unprocessed_pseudogene" "lincRNA" "lincRNA" "lincRNA" ...
# $ gene_status : chr "KNOWN" "NOVEL" "NOVEL" "NOVEL" ...
# $ havana_gene : chr "OTTHUMG00000000961.2" "OTTHUMG00000001096.2" "OTTHUMG00000002234.2" "OTTHUMG00000003682.1" ...
# $ full_length : int 2541 44429 5198 2079 992 471 79809 4776 100623 948 ...
# $ exon_length : int 1735 3726 513 507 992 471 8685 3095 4364 948 ...
# $ exon_num : int 9 17 5 3 1 1 4 2 123 1 ...
# $ gene_id2 : chr "ENSG00000223972" "ENSG00000238009" "ENSG00000230415" "ENSG00000236335" ...
# $ orgHs_entrez: chr "100287102" "100996442" "" "" ...
# $ orgHs_symbol: chr "DDX11L1" "LOC100996442" "" "" ...
##check chromosomes
table(genes.csv$seqname)
# chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr2 chr20 chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrM chrX chrY
# 5397 2306 3381 3047 1383 2289 2247 2597 3111 1206 2997 4150 1436 880 1385 3163 2633 2993 3001 2980 2444 2350 37 2476 594
##are coordinates BED?
all(genes.csv$end-genes.csv$start == genes.csv$full_length)
#[1] FALSE
##are coordinates actual?
all(1+(genes.csv$end-genes.csv$start) == genes.csv$full_length)
#[1] TRUE
##Make genes GRanges
geneCoords<-GRanges(seqnames=genes.csv$seqname,
ranges=IRanges(start=genes.csv$start,
end=genes.csv$end),
strand=genes.csv$strand
)
names(geneCoords)<-genes.csv$gene_id
geneCoords$gene_id<-genes.csv$gene_id
geneCoords$gene_id2<-genes.csv$gene_id2
geneCoords$gene_name<-genes.csv$gene_name
geneCoords$orgHs_entrez<-genes.csv$orgHs_entrez
geneCoords$orgHs_symbol<-genes.csv$orgHs_symbol
geneCoords$gene_type<-genes.csv$gene_type
geneCoords$gene_status<-genes.csv$gene_status
geneCoords
# GRanges object with 60483 ranges and 7 metadata columns:
# seqnames ranges strand | gene_id gene_id2 gene_name orgHs_entrez orgHs_symbol
# <Rle> <IRanges> <Rle> | <character> <character> <character> <character> <character>
# ENSG00000223972.5 chr1 11869-14409 + | ENSG00000223972.5 ENSG00000223972 DDX11L1 100287102 DDX11L1
# ENSG00000238009.5 chr1 89295-133723 - | ENSG00000238009.5 ENSG00000238009 RP11-34P13.7 100996442 LOC100996442
# ENSG00000230415.1 chr1 1275223-1280420 + | ENSG00000230415.1 ENSG00000230415 RP5-902P8.10
# ENSG00000236335.1 chr1 30409560-30411638 - | ENSG00000236335.1 ENSG00000236335 RP4-591L5.1
# ENSG00000213842.2 chr3 32752910-32753901 + | ENSG00000213842.2 ENSG00000213842 SUGT1P2
# ... ... ... ... . ... ... ... ... ...
# ENSG00000226955.1 chr3 32592972-32593312 - | ENSG00000226955.1 ENSG00000226955 AC104306.4
# ENSG00000231552.1 chr3 32620903-32621902 - | ENSG00000231552.1 ENSG00000231552 IGBP1P3
# ENSG00000237676.1 chr3 32635222-32635554 - | ENSG00000237676.1 ENSG00000237676 RPL30P4
# ENSG00000182973.17 chr3 32685145-32773875 + | ENSG00000182973.17 ENSG00000182973 CNOT10 25904 CNOT10
# ENSG00000251224.1 chr3 32730635-32737454 - | ENSG00000251224.1 ENSG00000251224 CNOT10-AS1
# gene_type gene_status
# <character> <character>
# ENSG00000223972.5 transcribed_unprocessed_pseudogene KNOWN
# ENSG00000238009.5 lincRNA NOVEL
# ENSG00000230415.1 lincRNA NOVEL
# ENSG00000236335.1 lincRNA NOVEL
# ENSG00000213842.2 processed_pseudogene KNOWN
# ... ... ...
# ENSG00000226955.1 processed_pseudogene KNOWN
# ENSG00000231552.1 processed_pseudogene KNOWN
# ENSG00000237676.1 processed_pseudogene KNOWN
# ENSG00000182973.17 protein_coding KNOWN
# ENSG00000251224.1 antisense NOVEL
# -------
# seqinfo: 25 sequences from an unspecified genome; no seqlengths
save(geneCoords,genes.csv,file=paste0(HOME,"/annotateFeatures/object_gdc_GENCODE22_gene_annotations.RData"))
################################################################################
###Make GRanges for TCGA ATAC
str(dataAtac)
# num [1:215920, 1:39] 1.031 1.667 1.839 1.375 0.769 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:215920] "chr1:17234-17733|BRCA_2" "chr1:180634-181133|BRCA_3" "chr1:181207-181706|BRCA_4" "chr1:183557-184056|BRCA_5" ...
# ..$ : chr [1:39] "TCGA-3C-AALJ-01A" "TCGA-4H-AAAK-01A" "TCGA-A2-A0SV-01A" "TCGA-A2-A0SW-01A" ...
table(sub(":.+","",rownames(dataAtac)))
# chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr2 chr20 chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX
# 21640 9733 11414 10140 4142 6186 6812 7042 13211 4038 9379 15782 7036 2942 4259 12401 7463 9337 15170 10545 15026 8240 3982
quantile(as.integer(sub("chr.+:(.+)-.+","\\1",rownames(dataAtac))))
# 0% 25% 50% 75% 100%
# 10123 33926919 65673646 114911183 248945392
quantile(as.integer(sub("chr.+:(.+)-(.+)\\|.+","\\2",rownames(dataAtac))))
# 0% 25% 50% 75% 100%
# 10622 33927418 65674145 114911682 248945891
##500 bp peaks
quantile(as.integer(sub("chr.+:(.+)-(.+)\\|.+","\\2",rownames(dataAtac))) -
as.integer(sub("chr.+:(.+)-.+","\\1",rownames(dataAtac)))
)
# 0% 25% 50% 75% 100%
# 499 499 499 499 499
atacCoords<-data.frame(seqnames=sub(":.+","",rownames(dataAtac)),
start=as.integer(sub("chr.+:(.+)-.+","\\1",rownames(dataAtac))),
end=as.integer(sub("chr.+:(.+)-(.+)\\|.+","\\2",rownames(dataAtac))),
stringsAsFactors=FALSE
)
atacCoords$name<-sub(".+\\|","",rownames(dataAtac))
names(atacCoords)<-rownames(dataAtac)
str(atacCoords)
# 'data.frame': 215920 obs. of 4 variables:
# $ seqnames: chr "chr1" "chr1" "chr1" "chr1" ...
# $ start : int 17234 180634 181207 183557 184247 191249 267757 778474 779599 827304 ...
# $ end : int 17733 181133 181706 184056 184746 191748 268256 778973 780098 827803 ...
# $ name : chr "BRCA_2" "BRCA_3" "BRCA_4" "BRCA_5" ...
atacRanges<-GRanges(seqnames=atacCoords$seqnames,
ranges=IRanges(start=atacCoords$start,
end=atacCoords$end)
)
names(atacRanges)<-paste(as.character(seqnames(atacRanges)),paste(start(atacRanges),end(atacRanges),sep="-"),sep=":")
#names(atacRanges)<-rownames(dataAtac)
quantile(width(atacRanges))
# 0% 25% 50% 75% 100%
# 500 500 500 500 500
##no overlaps
length(reduce(atacRanges))
#[1] 215920
length(atacRanges)
#[1] 215920
################################################################################
###Make GRanges for Illumina 450K
identical(rownames(betaAdj),rownames(probesKeep))
#[1] TRUE
head(probesKeep)
# id chr start end isBad probeType designType
# cg21870274 cg21870274 chr1 69591 69592 FALSE cg II
# cg08258224 cg08258224 chr1 864703 864704 FALSE cg II
# cg16619049 cg16619049 chr1 870161 870162 FALSE cg I
# cg18147296 cg18147296 chr1 877159 877160 FALSE cg II
# cg13938959 cg13938959 chr1 898803 898804 FALSE cg II
# cg12445832 cg12445832 chr1 898915 898916 FALSE cg II
table(probesKeep$chr)
# chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr2 chr20 chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX chrY
# 41098 21206 25633 21624 10540 13204 13119 18627 24356 5280 21984 30550 9354 3378 7222 22315 17646 21279 31170 25024 18194 8605 9718 242
pAnno<-GRanges(seqnames=probesKeep$chr,
ranges=IRanges(start=probesKeep$start,
end=probesKeep$end)
)
names(pAnno)<-rownames(probesKeep)
##give CpGs single coordinate - use 3' in CG - otherwise more complicated overlap calculations
start(pAnno)<-end(pAnno)
pAnno
# GRanges object with 421368 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# cg21870274 chr1 69592 *
# cg08258224 chr1 864704 *
# cg16619049 chr1 870162 *
# cg18147296 chr1 877160 *
# cg13938959 chr1 898804 *
# ... ... ... ...
# cg11021362 chrY 21594791 *
# cg25914522 chrY 22306397 *
# cg21106100 chrY 26409390 *
# cg08265308 chrY 26409404 *
# cg14273923 chrY 26409766 *
# -------
# seqinfo: 24 sequences from an unspecified genome; no seqlengths
################################################################################
##create simple annotation matrix
identical(rownames(betaAdj),rownames(probesKeep))
#[1] TRUE
identical(rownames(betaAdj),names(pAnno))
#[1] TRUE
annoObj<-data.frame(illuminaID=rownames(probesKeep),stringsAsFactors=FALSE)
rownames(annoObj)<-rownames(probesKeep)
annoObj$chr<-""
annoObj[rownames(annoObj),"chr"]<-probesKeep$chr
annoObj$start<-0
annoObj[rownames(annoObj),"start"]<-probesKeep$start
annoObj$end<-0
annoObj[rownames(annoObj),"end"]<-probesKeep$end
################################################################################
##add "knowngene" mask - rest are for genes with gex
db<-org.Hs.eg.db
p0<-genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
p0<-p0[as.character(seqnames(p0)) %in% paste0("chr",c(1:22,"X"))]
seqlevels(p0, pruning.mode="coarse") <- paste("chr",c(1:22,"X"),sep="")
#sym<-select(db, keys=p0$gene_id, columns="SYMBOL", keytype="ENTREZID",multiVals="first")
sym<-as.character( mapIds(db, keys=p0$gene_id, column=c("ENTREZID"), keytype="ENTREZID",multiVals="first") )
sym2<-as.character( mapIds(db, keys=p0$gene_id, column=c("SYMBOL"), keytype="ENTREZID",multiVals="first") )
sym2[is.na(sym2)]<-""
sym<-as.data.frame(cbind(ENTREZID=sym,SYMBOL=sym2),stringsAsFactors=FALSE)
str(sym)
# 'data.frame': 25668 obs. of 2 variables:
# $ ENTREZID: chr "1" "10" "100" "1000" ...
# $ SYMBOL : chr "A1BG" "NAT2" "ADA" "CDH2" ...
all(sym$ENTREZID==names(p0))
#[1] TRUE
p0$symbol<-sym$SYMBOL
##Gb in genic regions
sum(width(reduce(p0,ignore.strand=TRUE)))/1e9
#[1] 1.515932
p1<-p0
p1<-p1[as.character(seqnames(p1)) %in% paste0("chr",c(1:22,"X"))]
seqlevels(p1, pruning.mode="coarse") <- paste("chr",c(1:22,"X"),sep="")
p2<-promoters(p1,upstream=5000,downstream=0) ###Need to take orientation into account!!
end(p2)[as.vector(strand(p2))=="+"]<-end(p2)[as.vector(strand(p2))=="+"]-500
start(p2)[as.vector(strand(p2))=="-"]<-start(p2)[as.vector(strand(p2))=="-"]+500
p3<-promoters(p1,upstream=0,downstream=5000)
end(p3)[as.vector(strand(p3))=="-"]<-end(p3)[as.vector(strand(p3))=="-"]-500
start(p3)[as.vector(strand(p3))=="+"]<-start(p3)[as.vector(strand(p3))=="+"]+500
p4<-promoters(p1,upstream=500,downstream=500)
##any knownGene overlap?
ol1<-findOverlaps(p1,pAnno)
ol1<-unlist(lapply(split(p1$symbol[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasUCSCknownGeneOverlap<-0
annoObj[names(ol1),"hasUCSCknownGeneOverlap"]<-as.integer(ol1!="")
annoObj$nameUCSCknownGeneOverlap<-""
annoObj[names(ol1),"nameUCSCknownGeneOverlap"]<-ol1
annoObj$numberUCSCknownGeneOverlap<-0
annoObj[names(ol1),"numberUCSCknownGeneOverlap"]<-unlist(lapply(ol1,function(x) length(unlist(strsplit(x,"_")))))
##promoter overlap
ol1<-findOverlaps(p4,pAnno)
ol1<-unlist(lapply(split(names(p4)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasUCSCknownGenePromOverlap<-0
annoObj[names(ol1),"hasUCSCknownGenePromOverlap"]<-as.integer(ol1!="")
##up5000 gene overlap
ol1<-findOverlaps(p2,pAnno)
ol1<-unlist(lapply(split(names(p2)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasUCSCknownGeneUp5kbOverlap<-0
annoObj[names(ol1),"hasUCSCknownGeneUp5kbOverlap"]<-as.integer(ol1!="")
##dn5000 gene overlap
ol1<-findOverlaps(p3,pAnno)
ol1<-unlist(lapply(split(names(p3)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasUCSCknownGeneDn5kbOverlap<-0
annoObj[names(ol1),"hasUCSCknownGeneDn5kbOverlap"]<-as.integer(ol1!="")
##any knownTx promoter overlap
p0<-transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
p0<-p0[as.character(seqnames(p0)) %in% paste0("chr",c(1:22,"X"))]
seqlevels(p0, pruning.mode="coarse") <- paste("chr",c(1:22,"X"),sep="")
p1<-p0
p1<-p1[as.character(seqnames(p1)) %in% paste0("chr",c(1:22,"X"))]
seqlevels(p1, pruning.mode="coarse") <- paste("chr",c(1:22,"X"),sep="")
p2<-promoters(p1,upstream=5000,downstream=0) ###Need to take orientation into account!!
end(p2)[as.vector(strand(p2))=="+"]<-end(p2)[as.vector(strand(p2))=="+"]-500
start(p2)[as.vector(strand(p2))=="-"]<-start(p2)[as.vector(strand(p2))=="-"]+500
names(p2)<-1:length(p2)
p3<-promoters(p1,upstream=0,downstream=5000)
end(p3)[as.vector(strand(p3))=="-"]<-end(p3)[as.vector(strand(p3))=="-"]-500
start(p3)[as.vector(strand(p3))=="+"]<-start(p3)[as.vector(strand(p3))=="+"]+500
names(p3)<-1:length(p3)
p4<-promoters(p1,upstream=500,downstream=500)
names(p4)<-1:length(p4)
ol1<-findOverlaps(p4,pAnno)
ol1<-unlist(lapply(split(names(p4)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasUCSCknownTxPromOverlap<-0
annoObj[names(ol1),"hasUCSCknownTxPromOverlap"]<-as.integer(ol1!="")
ol1<-findOverlaps(p2,pAnno)
ol1<-unlist(lapply(split(names(p2)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasUCSCknownTxUp5kbOverlap<-0
annoObj[names(ol1),"hasUCSCknownTxUp5kbOverlap"]<-as.integer(ol1!="")
##dn5000 gene overlap
ol1<-findOverlaps(p3,pAnno)
ol1<-unlist(lapply(split(names(p3)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasUCSCknownTxDn5kbOverlap<-0
annoObj[names(ol1),"hasUCSCknownTxDn5kbOverlap"]<-as.integer(ol1!="")
rm(sym,p0,p1,p2,p3,p4)
##may have some contradictory annotations b/c multiple transcripts may overlap for same or different genes
##add annotation "distal"
annoObj$ucscKnownGeneIsDistal<-0
annoObj[ annoObj$hasUCSCknownGeneUp5kbOverlap==0 &
annoObj$hasUCSCknownGeneDn5kbOverlap==0 &
annoObj$hasUCSCknownGenePromOverlap==0
,"ucscKnownGeneIsDistal"]<-1
annoObj$ucscKnownGeneIsGenic<-0
annoObj[ annoObj$hasUCSCknownGeneOverlap!=0
,"ucscKnownGeneIsGenic"]<-1
annoObj$ucscKnownGeneIsDistalNonGenic<-0
annoObj[ annoObj$hasUCSCknownGeneUp5kbOverlap==0 &
annoObj$hasUCSCknownGeneDn5kbOverlap==0 &
annoObj$hasUCSCknownGenePromOverlap==0 &
annoObj$hasUCSCknownGeneOverlap==0
,"ucscKnownGeneIsDistalNonGenic"]<-1
table(annoObj$ucscKnownGeneIsDistal,annoObj$ucscKnownGeneIsDistalNonGenic)
# 0 1
# 0 206341 0
# 1 138020 77007
annoObj$ucscKnownGeneIsGeneBody<-0
annoObj[ annoObj$hasUCSCknownGenePromOverlap==0 &
annoObj$hasUCSCknownGeneOverlap==1
,"ucscKnownGeneIsGeneBody"]<-1
table(annoObj$ucscKnownGeneIsDistal,annoObj$ucscKnownGeneIsGeneBody)
# 0 1
# 0 132249 74092
# 1 77007 138020
table(annoObj$ucscKnownGeneIsDistalNonGenic,annoObj$ucscKnownGeneIsGeneBody)
# 0 1
# 0 132249 212112
# 1 77007 0
##annotations do not strictly take into account that catogories are fulfulled for same gene -> but ok compromise still
################################################################################
##add more detailed annotations for genes with gex
##do overlaps
p1<-geneCoords
##use ensg+proper symbol as gene identiifer
names(p1)<-paste(geneCoords$gene_id2,geneCoords$gene_name,sep="|")
p1<-p1[as.character(seqnames(p1)) %in% paste0("chr",c(1:22,"X"))]
seqlevels(p1, pruning.mode="coarse") <- paste("chr",c(1:22,"X"),sep="")
p2<-promoters(p1,upstream=5000,downstream=0) ###Need to take orientation into account!!
end(p2)[as.vector(strand(p2))=="+"]<-end(p2)[as.vector(strand(p2))=="+"]-500
start(p2)[as.vector(strand(p2))=="-"]<-start(p2)[as.vector(strand(p2))=="-"]+500
p3<-promoters(p1,upstream=0,downstream=5000)
end(p3)[as.vector(strand(p3))=="-"]<-end(p3)[as.vector(strand(p3))=="-"]-500
start(p3)[as.vector(strand(p3))=="+"]<-start(p3)[as.vector(strand(p3))=="+"]+500
p4<-promoters(p1,upstream=500,downstream=500)
ol1<-findOverlaps(p1,pAnno)
ol1<-unlist(lapply(split(names(p1)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasGeneOverlap<-0
annoObj[names(ol1),"hasGeneOverlap"]<-as.integer(ol1!="")
annoObj$nameGeneOverlap<-""
annoObj[names(ol1),"nameGeneOverlap"]<-ol1
annoObj$numberGeneOverlap<-0
annoObj[names(ol1),"numberGeneOverlap"]<-unlist(lapply(ol1,function(x) length(unlist(strsplit(x,"_")))))
##up5000 gene overlap
ol1<-findOverlaps(p2,pAnno)
ol1<-unlist(lapply(split(names(p2)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasUp5kbOverlap<-0
annoObj[names(ol1),"hasUp5kbOverlap"]<-as.integer(ol1!="")
annoObj$nameUp5kbOverlap<-""
annoObj[names(ol1),"nameUp5kbOverlap"]<-ol1
annoObj$numberUp5kbOverlap<-0
annoObj[names(ol1),"numberUp5kbOverlap"]<-unlist(lapply(ol1,function(x) length(unlist(strsplit(x,"_")))))
##dn5000 gene overlap
ol1<-findOverlaps(p3,pAnno)
ol1<-unlist(lapply(split(names(p3)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasDn5kbOverlap<-0
annoObj[names(ol1),"hasDn5kbOverlap"]<-as.integer(ol1!="")
annoObj$nameDn5kbOverlap<-""
annoObj[names(ol1),"nameDn5kbOverlap"]<-ol1
annoObj$numberDn5kbOverlap<-0
annoObj[names(ol1),"numberDn5kbOverlap"]<-unlist(lapply(ol1,function(x) length(unlist(strsplit(x,"_")))))
##prom gene overlap
ol1<-findOverlaps(p4,pAnno)
ol1<-unlist(lapply(split(names(p4)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="_")))
annoObj$hasPromOverlap<-0
annoObj[names(ol1),"hasPromOverlap"]<-as.integer(ol1!="")
annoObj$namePromOverlap<-""
annoObj[names(ol1),"namePromOverlap"]<-ol1
annoObj$numberPromOverlap<-0
annoObj[names(ol1),"numberPromOverlap"]<-unlist(lapply(ol1,function(x) length(unlist(strsplit(x,"_")))))
##atac overlap
ol1<-findOverlaps(atacRanges,pAnno)
ol1<-unlist(lapply(split(names(atacRanges)[queryHits(ol1)],names(pAnno)[subjectHits(ol1)]),function(x) paste(unique(x),collapse="|")))
annoObj$hasAtacOverlap<-0
annoObj[names(ol1),"hasAtacOverlap"]<-as.integer(ol1!="")
annoObj$nameAtacOverlap<-""
annoObj[names(ol1),"nameAtacOverlap"]<-ol1
annoObj$numberAtacOverlap<-0
annoObj[names(ol1),"numberAtacOverlap"]<-unlist(lapply(ol1,function(x) length(unlist(strsplit(x,"\\|")))))
table(annoObj$hasAtacOverlap,annoObj$hasGeneOverlap)
# 0 1
# 0 75244 200010
# 1 42614 103500
table(annoObj$hasAtacOverlap,annoObj$hasUp5kbOverlap | annoObj$hasDn5kbOverlap)
# FALSE TRUE
# 0 170853 104401
# 1 81828 64286
table(annoObj$hasUp5kbOverlap , annoObj$hasDn5kbOverlap)
# 0 1
# 0 252681 73308
# 1 58080 37299 ##some overlap 2 proms or more
table(annoObj$hasDn5kbOverlap,annoObj$hasPromOverlap)
# 0 1
# 0 212469 98292
# 1 86190 24417
table(annoObj$hasUp5kbOverlap,annoObj$hasPromOverlap)
# 0 1
# 0 225043 100946
# 1 73616 21763
table(annoObj$hasAtacOverlap,annoObj$hasPromOverlap)
# 0 1
# 0 226106 49148
# 1 72553 73561
##add annotation "distal"
annoObj$isDistal<-0
annoObj[ annoObj$hasUp5kbOverlap==0 &
annoObj$hasDn5kbOverlap==0 &
annoObj$hasPromOverlap==0
,"isDistal"]<-1
annoObj$isGenic<-0
annoObj[ annoObj$hasGeneOverlap!=0
,"isGenic"]<-1
annoObj$isDistalNonGenic<-0
annoObj[ annoObj$hasUp5kbOverlap==0 &
annoObj$hasDn5kbOverlap==0 &
annoObj$hasPromOverlap==0 &
annoObj$hasGeneOverlap==0
,"isDistalNonGenic"]<-1
annoObj$isGeneBody<-0
annoObj[ annoObj$hasPromOverlap==0 &
annoObj$hasGeneOverlap==1
#& annoObj$hasUp5kbOverlap==0 #if this hashed then can be body even if up5k for other gene
,"isGeneBody"]<-1
str(annoObj)
# 'data.frame': 421368 obs. of 35 variables:
# $ illuminaID : chr "cg21870274" "cg08258224" "cg16619049" "cg18147296" ...
# $ chr : chr "chr1" "chr1" "chr1" "chr1" ...
# $ start : num 69591 864703 870161 877159 898803 ...
# $ end : num 69592 864704 870162 877160 898804 ...
# $ hasUCSCknownGeneOverlap : num 1 0 1 1 0 0 0 0 0 0 ...
# $ nameUCSCknownGeneOverlap : chr "OR4F5" "" "FAM41C" "FAM41C" ...
# $ numberUCSCknownGeneOverlap : num 1 0 1 1 0 0 0 0 0 0 ...
# $ hasUCSCknownGeneUp5kbOverlap : num 0 0 0 0 0 0 0 0 0 0 ...
# $ hasUCSCknownGeneDn5kbOverlap : num 1 0 0 0 0 0 0 0 0 0 ...
# $ hasUCSCknownTxPromOverlap : num 0 0 1 1 0 0 0 0 0 0 ...
# $ hasUCSCknownTxUp5kbOverlap : num 0 0 1 0 0 0 0 1 1 1 ...
# $ hasUCSCknownTxDn5kbOverlap : num 1 1 0 1 0 0 0 0 0 0 ...
# $ ucscKnownGeneIsDistal : num 0 1 0 0 1 1 1 1 1 1 ...
# $ ucscKnownGeneIsGenic : num 1 0 1 1 0 0 0 0 0 0 ...
# $ ucscKnownGeneIsDistalNonGenic: num 0 1 0 0 1 1 1 1 1 1 ...
# $ ucscKnownGeneIsGeneBody : num 1 0 0 0 0 0 0 0 0 0 ...
# $ hasGeneOverlap : num 1 0 1 0 0 0 0 0 0 0 ...
# $ nameGeneOverlap : chr "ENSG00000186092|OR4F5" "" "ENSG00000230368|FAM41C" "" ...
# $ numberGeneOverlap : num 1 0 1 0 0 0 0 0 0 0 ...
# $ hasUp5kbOverlap : num 0 0 1 0 0 0 0 1 1 1 ...
# $ nameUp5kbOverlap : chr "" "" "ENSG00000234711|TUBB8P11" "" ...
# $ numberUp5kbOverlap : num 0 0 1 0 0 0 0 1 1 1 ...
# $ hasDn5kbOverlap : num 1 0 0 1 0 0 0 0 0 0 ...
# $ nameDn5kbOverlap : chr "ENSG00000186092|OR4F5" "" "" "ENSG00000234711|TUBB8P11" ...
# $ numberDn5kbOverlap : num 1 0 0 1 0 0 0 0 0 0 ...
# $ hasPromOverlap : num 0 0 0 1 0 0 0 0 0 0 ...
# $ namePromOverlap : chr "" "" "" "ENSG00000230368|FAM41C" ...
# $ numberPromOverlap : num 0 0 0 1 0 0 0 0 0 0 ...
# $ hasAtacOverlap : num 0 0 1 0 0 0 0 0 0 0 ...
# $ nameAtacOverlap : chr "" "" "chr1:869670-870169" "" ...
# $ numberAtacOverlap : num 0 0 1 0 0 0 0 0 0 0 ...
# $ isDistal : num 0 1 0 0 1 1 1 0 0 0 ...
# $ isGenic : num 1 0 1 0 0 0 0 0 0 0 ...
# $ isDistalNonGenic : num 0 1 0 0 1 1 1 0 0 0 ...
# $ isGeneBody : num 1 0 1 0 0 0 0 0 0 0 ...
table(annoObj$chr,annoObj$hasAtacOverlap)
# 0 1
# chr1 26067 15031
# chr10 14843 6363
# chr11 16482 9151
# chr12 13967 7657
# chr13 7819 2721
# chr14 8868 4336
# chr15 8619 4500
# chr16 12708 5919
# chr17 14455 9901
# chr18 3378 1902
# chr19 12321 9663
# chr2 20496 10054
# chr20 5465 3889
# chr21 2158 1220
# chr22 4383 2839
# chr3 14211 8104
# chr4 12459 5187
# chr5 14651 6628
# chr6 19948 11222