bash/Training_render at master · nishuai/bash · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#################################################################
# File name: Training_rendering.R
# Render training sets to have the same format for easier bg error rate estimation
# Author: Ni Shuai nishuai@yahoo.com
# YuceBio R&D department
# Input: directory of training pileup files, reference sequnce used in the panel,
# bed file defining coordinates of the panel, output directory (gwd by default)
#################################################################

####get input parameters
print('Start computing')
args = commandArgs(trailingOnly=TRUE)
training_dir =NULL; output_dir=NULL
if (length(args)<6) {
  stop("Usage: Rscript Training_rendering.R training_dir ref_seq bed_file output_dir", call.=FALSE)
} else if (length(args)>4) {
  stop("Usage: Rscript Training_rendering.R training_dir ref_seq bed_file output_dir", call.=FALSE)
}

training_dir = args[1]
output_dir= args[4]
if (is.null(output_dir)) output_dir='.'
training_files=list.files(path=training_dir,pattern="pileup.txt$") # list all healthy subject pileup data files; require at least two or more normal samples
print(training_files)

###generate a master atgc file (each training may have missing positions)

ref_seq=scan(args[2], what='character')
ref_seq=ref_seq[seq(2, length(ref_seq), 2)]
ref_seq=paste0(ref_seq, collapse = '')

bed_file=read.table(args[3], stringsAsFactors = FALSE)
all_positions=unlist(apply(bed_file,1, function(x) {(as.numeric(x[2])+1):x[3]}))
chr=rep(bed_file$V1, c(bed_file$V3 - bed_file$V2))

master_acgt=data.frame(chr=chr, position=all_positions, ref_base=ref_seq)
master_acgt$id=paste(master_acgt$chr, master_acgt$position, sep='_')
saveRDS(master_acgt, 'master_acgt.rds')

####read into each training acgt data and render for missing bases
All_training=list()
for (filename in training_files){
  print(paste('processing ', filename))

  training=read.table(file.path(training_dir, filename), stringsAsFactors = FALSE, header = TRUE)
  training=training[,-dim(training)[2]]
  ###create a unified identifier
  training$id=paste(training$chr,training$n_base, sep='_')
  bb=merge(master_acgt, training, by='id', all=TRUE)
  row.names(bb)=bb$id
  bb=bb[,c(2,3,4,9:12)]
  colnames(bb)=c('CHR','POSITION','REF','A','C','G','T')
  All_training[[filename]]=bb
}

saveRDS(All_training, 'All_training.rds')

####output back ground error rate file
setwd('D:/wrk/TNER-master/')
All_training=readRDS('All_training.rds')
##For each training sample, not covered regions are represented as 0, the mean
##error rate estimation for missing regions would be estimated by other samples
nt_matrix=lapply(All_training, function(x) {a=x[,4:7]; a[is.na(a)]=0; return (a)})


All_nt_matrix=Reduce('+', nt_matrix)
###Check the coverage of all training samples
###averge coverage of each 10 bases
pdf('overall_training_coverage.pdf')
aa=rowSums(All_nt_matrix)
plot(colMeans(matrix(aa, nrow=300)), main='Overall coverage of all training \n samples, every 300 nt', pch='.', type='l')
dev.off()

##generate site specific bg error rate
All_nt_rate=All_nt_matrix/rowSums(All_nt_matrix)
hs_bg_ave=cbind(All_training[[1]][,1:3],All_nt_rate)
saveRDS(hs_bg_ave, 'All_bg_ave.rds')
##generate depth count for each base
depth_count=matrix(rep(rowSums(All_nt_matrix), 4), ncol=4)
rownames(depth_count)= rownames(All_nt_matrix)
colnames(depth_count)=colnames(All_nt_matrix)
saveRDS(depth_count, 'All_depth_count.rds')