1919
2020#define VERSION " 0.1"
2121
22+
23+
24+
2225htsFormat *dingding2 =(htsFormat*) calloc(1 ,sizeof (htsFormat));
2326htsThreadPool p = {NULL , 0 };
2427int nthreads = 8 ;
2528char out_mode[5 ]=" ws" ;
2629
30+ int matched_bases (const bam1_t *b){
31+ int matched = 0 ;
32+ uint32_t *cigar = bam_get_cigar (b);
33+ int n_cigar = b->core .n_cigar ;
34+
35+ for (int i = 0 ; i < n_cigar; i++) {
36+ int op = bam_cigar_op (cigar[i]);
37+ int len = bam_cigar_oplen (cigar[i]);
38+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF)
39+ matched += len;
40+ }
41+ return matched;
42+ }
43+
44+
45+
2746// get ref used not get refused
28- int VERB= 5 ;
29- size_t getrefused (samFile *htsfp,bam_hdr_t *hdr,int *keeplist,int &nkeep){
47+ int VERB[ 2 ]={ 5 , 5 } ;
48+ size_t getrefused (samFile *htsfp,bam_hdr_t *hdr,int *keeplist,int &nkeep, int minmatch ){
3049 // now mainloop
3150 bam1_t *aln = bam_init1 ();
3251
@@ -36,10 +55,17 @@ size_t getrefused(samFile *htsfp,bam_hdr_t *hdr,int *keeplist,int &nkeep){
3655 fprintf (stderr," \r\t -> Now at read: %lu " ,at);
3756 int chr = aln->core .tid ; // contig name (chromosome)
3857 if (chr<0 ){
39- if (VERB>0 )
40- fprintf (stderr," \t -> Problem with unmapped reads, these will be discarded. Msg shows more: %d\n " ,VERB--);
58+ if (VERB[ 0 ] >0 )
59+ fprintf (stderr," \t -> Problem with unmapped reads, these will be discarded. Msg shows more: %d\n " ,VERB[ 0 ] --);
4160 continue ;
4261 }
62+ int nmatch = matched_bases (aln);
63+ if (nmatch<minmatch){
64+ if (VERB[1 ]>0 )
65+ fprintf (stderr," \t -> Problem with short <%d alns, these will be discarded. Msg shows more: %d\n " ,minmatch,VERB[1 ]--);
66+ continue ;
67+ }
68+
4369 keeplist[chr] = keeplist[chr]+1 ;
4470 chr =aln->core .mtid ;
4571 if (chr!=-1 )
@@ -53,8 +79,8 @@ size_t getrefused(samFile *htsfp,bam_hdr_t *hdr,int *keeplist,int &nkeep){
5379 return at;
5480}
5581
56- int VERB2= 5 ;
57- void writemod (const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,char *mycl){
82+ int VERB2[ 2 ]={ 5 , 5 } ;
83+ void writemod (const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,char *mycl, int minmatch ){
5884 BGZF *fp = NULL ;
5985 fp=bgzf_open (outfile," w5" );// write bgzf with compression level5. Maybe this should be changed.
6086 bgzf_mt (fp,nthreads,256 );
@@ -77,7 +103,7 @@ void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,c
77103 kputs (kstmp.s ,&newhdr_ks);
78104 kputc (' \n ' ,&newhdr_ks);
79105 if (newhdr_ks.l >10000000 ){
80- assert (bgzf_write (fp,newhdr_ks.s ,newhdr_ks.l )==newhdr_ks.l );
106+ assert (bgzf_write (fp,newhdr_ks.s ,newhdr_ks.l )==( ssize_t ) newhdr_ks.l );
81107 newhdr_ks.l =0 ;
82108 }
83109
@@ -89,7 +115,7 @@ void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,c
89115 kputs (kstmp.s ,&newhdr_ks);
90116 kputc (' \n ' ,&newhdr_ks);
91117 if (newhdr_ks.l >10000000 ){
92- assert (bgzf_write (fp,newhdr_ks.s ,newhdr_ks.l )==newhdr_ks.l );
118+ assert (bgzf_write (fp,newhdr_ks.s ,newhdr_ks.l )==( ssize_t ) newhdr_ks.l );
93119 newhdr_ks.l =0 ;
94120 }
95121 }
@@ -107,7 +133,7 @@ void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,c
107133 }
108134
109135
110- assert (bgzf_write (fp,newhdr_ks.s ,newhdr_ks.l )==newhdr_ks.l );
136+ assert (bgzf_write (fp,newhdr_ks.s ,newhdr_ks.l )==( ssize_t ) newhdr_ks.l );
111137 free (newhdr_ks.s );
112138 free (kstmp.s );
113139 bgzf_close (fp);
@@ -149,6 +175,7 @@ void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,c
149175 sam_hdr_add_pg (newhdr," compressbam" ," VN" ,VERSION," CL" ,mycl,NULL );
150176
151177 int ret = sam_hdr_write (outhts,newhdr);
178+ assert (ret>=0 );
152179 fprintf (stderr," \t -> Done writing new header as binary\n " );fflush (stderr);
153180 // now mainloop
154181 bam1_t *aln = bam_init1 ();
@@ -160,11 +187,18 @@ void writemod(const char *outfile ,bam_hdr_t *hdr,int *keeplist,samFile *htsfp,c
160187 fflush (stderr);
161188 }
162189 if (aln->core .tid <0 ){
163- if (VERB2>0 )
164- fprintf (stderr," Unmapped read in bamwriting part, this msg will be snown: %d more\n " ,VERB2--);
165- continue ;
190+ if (VERB2[0 ]>0 )
191+ fprintf (stderr," Unmapped read in bamwriting part, this msg will be snown: %d more\n " ,VERB2[0 ]--);
192+ continue ;
193+ }
194+ int nmatch = matched_bases (aln);
195+ if (nmatch<minmatch){
196+ if (VERB2[1 ]>0 )
197+ fprintf (stderr," \t -> Problem with short <%d alns, these will be discarded. Msg shows more: %d\n " ,minmatch,VERB2[1 ]--);
198+ continue ;
166199 }
167- aln->core .tid = keeplist[aln->core .tid ];
200+
201+ aln->core .tid = keeplist[aln->core .tid ];
168202 if (aln->core .mtid !=-1 )
169203 aln->core .mtid = keeplist[aln->core .mtid ];
170204 assert (sam_write1 (outhts, newhdr,aln)>=0 );
@@ -184,18 +218,17 @@ int main(int argc,char**argv){
184218 time_t t2=time (NULL );
185219
186220 if (argc==1 ){
187- fprintf (stderr," ./compressbam --threads <INT> --input <FILE> --ref <FILE> --output <FILE>\n " );
221+ fprintf (stderr," ./compressbam --threads <INT> --input <FILE> --ref <FILE> --output <FILE> [--min-match <int>] \n " );
188222 return -1 ;
189223 }
190224 char *mycl = stringify_argv (argc,argv);
191225 fprintf (stderr," \t -> compressbam: (%s;%s;%s): \' %s\'\n " ,__FILE__,__DATE__,__TIME__,mycl);
192226
193227 argv++;
194228 char *hts = NULL ;
195- char *names = NULL ;
196229 char *ref = NULL ;
197230 char *outfile = strdup (" tmp.sam" );
198-
231+ int minmatch = 30 ;
199232 // char out_mode[5]="ws";
200233 // int nthreads = 4; //this is now global
201234 while (*argv){
@@ -209,6 +242,7 @@ int main(int argc,char**argv){
209242 }
210243 else if (!strcasecmp (" --ref" ,key)) ref=strdup (val);
211244 else if (!strcasecmp (" -n" ,key) || !strcasecmp (" --threads" ,key)) nthreads=atoi (val);
245+ else if (!strcasecmp (" -m" ,key) || !strcasecmp (" --min-match" ,key)) minmatch=atoi (val);
212246 else {
213247 fprintf (stderr," \t Unknown parameter key:%s val:%s\n " ,key,val);
214248 return -1 ;
@@ -231,14 +265,14 @@ int main(int argc,char**argv){
231265 int *keeplist = new int [sam_hdr_nref (hdr)];
232266 for (int i=0 ;i<sam_hdr_nref (hdr);i++)
233267 keeplist[i] = -1 ;
234- size_t nalign = getrefused (htsfp,hdr,keeplist,nkeep);
268+ size_t nalign = getrefused (htsfp,hdr,keeplist,nkeep,minmatch );
235269 fprintf (stderr," \t -> Number of alignments parsed: %lu\n " ,nalign);
236270 sam_close (htsfp);
237271
238272 sam_hdr_destroy (hdr);
239273 htsfp=hts_open (hts," r" );
240274 hdr = sam_hdr_read (htsfp);
241- writemod (outfile,hdr,keeplist,htsfp,mycl);
275+ writemod (outfile,hdr,keeplist,htsfp,mycl,minmatch );
242276 fprintf (stderr," \t -> Done writing file: \' %s\'\n " ,outfile);
243277 free (mycl);
244278 sam_close (htsfp);
0 commit comments