Skip to content

Commit b0169f5

Browse files
author
fvr124
committed
Added option to remove alns with too few matching part. Did also some cleanup
1 parent 2292012 commit b0169f5

1 file changed

Lines changed: 52 additions & 18 deletions

File tree

misc/compressbam.cpp

Lines changed: 52 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,33 @@
1919

2020
#define VERSION "0.1"
2121

22+
23+
24+
2225
htsFormat *dingding2 =(htsFormat*) calloc(1,sizeof(htsFormat));
2326
htsThreadPool p = {NULL, 0};
2427
int nthreads = 8;
2528
char out_mode[5]="ws";
2629

30+
int matched_bases(const bam1_t *b){
31+
int matched = 0;
32+
uint32_t *cigar = bam_get_cigar(b);
33+
int n_cigar = b->core.n_cigar;
34+
35+
for (int i = 0; i < n_cigar; i++) {
36+
int op = bam_cigar_op(cigar[i]);
37+
int len = bam_cigar_oplen(cigar[i]);
38+
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF)
39+
matched += len;
40+
}
41+
return matched;
42+
}
43+
44+
45+
2746
//get ref used not get refused
28-
int VERB=5;
29-
size_t getrefused(samFile *htsfp,bam_hdr_t *hdr,int *keeplist,int &nkeep){
47+
int VERB[2]={5,5};
48+
size_t getrefused(samFile *htsfp,bam_hdr_t *hdr,int *keeplist,int &nkeep,int minmatch){
3049
//now mainloop
3150
bam1_t *aln = bam_init1();
3251

@@ -36,10 +55,17 @@ size_t getrefused(samFile *htsfp,bam_hdr_t *hdr,int *keeplist,int &nkeep){
3655
fprintf(stderr,"\r\t-> Now at read: %lu ",at);
3756
int chr = aln->core.tid ; //contig name (chromosome)
3857
if(chr<0){
39-
if(VERB>0)
40-
fprintf(stderr,"\t-> Problem with unmapped reads, these will be discarded. Msg shows more: %d\n",VERB--);
58+
if(VERB[0]>0)
59+
fprintf(stderr,"\t-> Problem with unmapped reads, these will be discarded. Msg shows more: %d\n",VERB[0]--);
4160
continue;
4261
}
62+
int nmatch = matched_bases(aln);
63+
if(nmatch<minmatch){
64+
if(VERB[1]>0)
65+
fprintf(stderr,"\t-> Problem with short <%d alns, these will be discarded. Msg shows more: %d\n",minmatch,VERB[1]--);
66+
continue;
67+
}
68+
4369
keeplist[chr] = keeplist[chr]+1;
4470
chr =aln->core.mtid;
4571
if(chr!=-1)
@@ -53,8 +79,8 @@ size_t getrefused(samFile *htsfp,bam_hdr_t *hdr,int *keeplist,int &nkeep){
5379
return at;
5480
}
5581

56-
int VERB2=5;
57-
void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,char *mycl){
82+
int VERB2[2]={5,5};
83+
void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,char *mycl,int minmatch){
5884
BGZF *fp = NULL;
5985
fp=bgzf_open(outfile,"w5");//write bgzf with compression level5. Maybe this should be changed.
6086
bgzf_mt(fp,nthreads,256);
@@ -77,7 +103,7 @@ void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,c
77103
kputs(kstmp.s,&newhdr_ks);
78104
kputc('\n',&newhdr_ks);
79105
if(newhdr_ks.l>10000000){
80-
assert(bgzf_write(fp,newhdr_ks.s,newhdr_ks.l)==newhdr_ks.l);
106+
assert(bgzf_write(fp,newhdr_ks.s,newhdr_ks.l)==(ssize_t)newhdr_ks.l);
81107
newhdr_ks.l =0;
82108
}
83109

@@ -89,7 +115,7 @@ void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,c
89115
kputs(kstmp.s,&newhdr_ks);
90116
kputc('\n',&newhdr_ks);
91117
if(newhdr_ks.l>10000000){
92-
assert(bgzf_write(fp,newhdr_ks.s,newhdr_ks.l)==newhdr_ks.l);
118+
assert(bgzf_write(fp,newhdr_ks.s,newhdr_ks.l)==(ssize_t)newhdr_ks.l);
93119
newhdr_ks.l =0;
94120
}
95121
}
@@ -107,7 +133,7 @@ void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,c
107133
}
108134

109135

110-
assert(bgzf_write(fp,newhdr_ks.s,newhdr_ks.l)==newhdr_ks.l);
136+
assert(bgzf_write(fp,newhdr_ks.s,newhdr_ks.l)==(ssize_t)newhdr_ks.l);
111137
free(newhdr_ks.s);
112138
free(kstmp.s);
113139
bgzf_close(fp);
@@ -149,6 +175,7 @@ void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,c
149175
sam_hdr_add_pg(newhdr,"compressbam","VN",VERSION,"CL",mycl,NULL);
150176

151177
int ret = sam_hdr_write(outhts,newhdr);
178+
assert(ret>=0);
152179
fprintf(stderr,"\t-> Done writing new header as binary\n");fflush(stderr);
153180
//now mainloop
154181
bam1_t *aln = bam_init1();
@@ -160,11 +187,18 @@ void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,c
160187
fflush(stderr);
161188
}
162189
if(aln->core.tid<0){
163-
if(VERB2>0)
164-
fprintf(stderr,"Unmapped read in bamwriting part, this msg will be snown: %d more\n",VERB2--);
165-
continue;
190+
if(VERB2[0]>0)
191+
fprintf(stderr,"Unmapped read in bamwriting part, this msg will be snown: %d more\n",VERB2[0]--);
192+
continue;
193+
}
194+
int nmatch = matched_bases(aln);
195+
if(nmatch<minmatch){
196+
if(VERB2[1]>0)
197+
fprintf(stderr,"\t-> Problem with short <%d alns, these will be discarded. Msg shows more: %d\n",minmatch,VERB2[1]--);
198+
continue;
166199
}
167-
aln->core.tid = keeplist[aln->core.tid];
200+
201+
aln->core.tid = keeplist[aln->core.tid];
168202
if(aln->core.mtid!=-1)
169203
aln->core.mtid = keeplist[aln->core.mtid];
170204
assert(sam_write1(outhts, newhdr,aln)>=0);
@@ -184,18 +218,17 @@ int main(int argc,char**argv){
184218
time_t t2=time(NULL);
185219

186220
if(argc==1){
187-
fprintf(stderr,"./compressbam --threads <INT> --input <FILE> --ref <FILE> --output <FILE>\n");
221+
fprintf(stderr,"./compressbam --threads <INT> --input <FILE> --ref <FILE> --output <FILE> [--min-match <int>]\n");
188222
return -1;
189223
}
190224
char *mycl = stringify_argv(argc,argv);
191225
fprintf(stderr,"\t-> compressbam: (%s;%s;%s): \'%s\'\n",__FILE__,__DATE__,__TIME__,mycl);
192226

193227
argv++;
194228
char *hts = NULL;
195-
char *names = NULL;
196229
char *ref = NULL;
197230
char *outfile = strdup("tmp.sam");
198-
231+
int minmatch = 30;
199232
// char out_mode[5]="ws";
200233
// int nthreads = 4; //this is now global
201234
while(*argv){
@@ -209,6 +242,7 @@ int main(int argc,char**argv){
209242
}
210243
else if(!strcasecmp("--ref",key)) ref=strdup(val);
211244
else if(!strcasecmp("-n",key) || !strcasecmp("--threads",key)) nthreads=atoi(val);
245+
else if(!strcasecmp("-m",key) || !strcasecmp("--min-match",key)) minmatch=atoi(val);
212246
else{
213247
fprintf(stderr,"\t Unknown parameter key:%s val:%s\n",key,val);
214248
return -1;
@@ -231,14 +265,14 @@ int main(int argc,char**argv){
231265
int *keeplist = new int[sam_hdr_nref(hdr)];
232266
for(int i=0;i<sam_hdr_nref(hdr);i++)
233267
keeplist[i] = -1;
234-
size_t nalign = getrefused(htsfp,hdr,keeplist,nkeep);
268+
size_t nalign = getrefused(htsfp,hdr,keeplist,nkeep,minmatch);
235269
fprintf(stderr,"\t-> Number of alignments parsed: %lu\n",nalign);
236270
sam_close(htsfp);
237271

238272
sam_hdr_destroy(hdr);
239273
htsfp=hts_open(hts,"r" );
240274
hdr = sam_hdr_read(htsfp);
241-
writemod(outfile,hdr,keeplist,htsfp,mycl);
275+
writemod(outfile,hdr,keeplist,htsfp,mycl,minmatch);
242276
fprintf(stderr,"\t-> Done writing file: \'%s\'\n",outfile);
243277
free(mycl);
244278
sam_close(htsfp);

0 commit comments

Comments
 (0)