diff --git a/Makefile b/Makefile index 88af18adc..ef5d61147 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ CC = gcc AR = ar RANLIB = ranlib - +SOURCE_DIR = . # Default libraries to link if configure is not used htslib_default_libs = -lz -lm -lbz2 -llzma -lcurl @@ -35,10 +35,20 @@ CPPFLAGS = # TODO: probably update cram code to make it compile cleanly with -Wc++-compat # For testing strict C99 support add -std=c99 -D_XOPEN_SOURCE=600 #CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 -CFLAGS = -g -Wall -O2 -fvisibility=hidden +ifdef DEBUG + CFLAGS = -DDEBUG -g3 -gdwarf-3 + LDFLAGS = -g3 -gdwarf-3 +else + CFLAGS = -O3 + LDFLAGS = +endif +ifdef PROFILE + CFLAGS += -pg +endif +CFLAGS += -Wall -fPIC EXTRA_CFLAGS_PIC = -fpic TARGET_CFLAGS = -LDFLAGS = -fvisibility=hidden +LDFLAGS = VERSION_SCRIPT_LDFLAGS = -Wl,-version-script,$(srcprefix)htslib.map LIBS = $(htslib_default_libs) @@ -123,7 +133,7 @@ htscodecs.mk: $(srcdir)/hts_probe_cc.sh '$(CC)' '$(CFLAGS) $(CPPFLAGS)' '$(LDFLAGS)' >> $@ srcdir = . -srcprefix = +srcprefix = $(SOURCE_DIR)/ HTSPREFIX = # Flags for SIMD code @@ -138,7 +148,7 @@ HTS_BUILD_SSSE3 = HTS_BUILD_POPCNT = HTS_BUILD_SSE4_1 = -include htslib_vars.mk +include $(SOURCE_DIR)/htslib_vars.mk include htscodecs.mk # If not using GNU make, you need to copy the version number from version.sh @@ -189,10 +199,10 @@ config_vars.h: .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so .c.o: - $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< + $(CC) $(CFLAGS) -I$(SOURCE_DIR) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< .c.pico: - $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< + $(CC) $(CFLAGS) -I$(SOURCE_DIR) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< LIBHTS_OBJS = \ diff --git a/faidx.c b/faidx.c index 5dd4bf1c0..089001dd3 100644 --- a/faidx.c +++ b/faidx.c @@ -914,6 +914,64 @@ int fai_adjust_region(const faidx_t *fai, int tid, return ((orig_beg != *beg ? 1 : 0) | (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0)); } +static void fai_retrieve_into_buffer(const faidx_t *fai, const faidx1_t *val, + const uint64_t offset, const hts_pos_t beg, const hts_pos_t end, + char* s, hts_pos_t *len) { + size_t l; + int c = 0; + int ret; + + if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) { + hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end); + *len = -1; + return; + } + + if (val->line_blen <= 0) { + hts_log_error("Invalid line length in index: %d", val->line_blen); + *len = -1; + return; + } + + ret = bgzf_useek(fai->bgzf, + offset + + beg / val->line_blen * val->line_len + + beg % val->line_blen, SEEK_SET); + + if (ret < 0) { + *len = -1; + hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)"); + return; + } + + l = 0; + + while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 ) + if (isgraph(c)) s[l++] = c; + if (c < 0) { + hts_log_error("Failed to retrieve block: %s", + c == -1 ? "unexpected end of file" : "error reading file"); + *len = -1; + return; + } + + s[l] = '\0'; + *len = l; +} + +void faidx_fetch_seq_into_buffer(const faidx_t *fai, + const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, char* s, hts_pos_t *len) +{ + faidx1_t val; + + // Adjust position + if (faidx_adjust_position(fai, 1,&val, c_name, &p_beg_i, &p_end_i, len)) { + *len = 0; + return; + } + + fai_retrieve_into_buffer(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, s, len); +} char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { diff --git a/hfile.c b/hfile.c index 78533dd56..82451fe71 100644 --- a/hfile.c +++ b/hfile.c @@ -1131,7 +1131,7 @@ static hFILE *hopen_unknown_scheme(const char *fname, const char *mode) } /* Returns the appropriate handler, or NULL if the string isn't an URL. */ -static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) +const struct hFILE_scheme_handler *find_scheme_handler(const char *s) { static const struct hFILE_scheme_handler unknown_scheme = { hopen_unknown_scheme, hfile_always_local, "built-in", 0 }; diff --git a/htscodecs b/htscodecs index 11b5007ff..dcb331678 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 11b5007ffb68bea9f6c777874a215e4187ce659a +Subproject commit dcb33167839622903897fc985a8cccf89b3358e2 diff --git a/htslib/faidx.h b/htslib/faidx.h index 4351b3fbe..12f3f4b40 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -237,6 +237,10 @@ by end users by calling `free()` on it. HTSLIB_EXPORT char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); +void faidx_fetch_seq_into_buffer(const faidx_t *fai, + const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, + char* s, hts_pos_t *len); + /// Fetch the sequence in a region /** @param fai Pointer to the faidx_t struct @param c_name Region name diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 9a6b48438..58d3d9389 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -141,6 +141,7 @@ typedef struct bcf_sr_t { htsFile *file; tbx_t *tbx_idx; + unsigned char read_one_record_only; hts_idx_t *bcf_idx; bcf_hdr_t *header; hts_itr_t *itr; diff --git a/htslib/vcf.h b/htslib/vcf.h index 70cf95372..908f15b65 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -63,14 +63,22 @@ extern "C" { #define BCF_HT_INT 1 #define BCF_HT_REAL 2 #define BCF_HT_STR 3 -#define BCF_HT_LONG (BCF_HT_INT | 0x100) // BCF_HT_INT, but for int64_t values; VCF only! +#define BCF_HT_UINT 4 +#define BCF_HT_CHAR 5 +#define BCF_HT_INT64 6 +#define BCF_HT_LONG BCF_HT_INT64 // BCF_HT_INT, but for int64_t values; VCF only! +#define BCF_HT_UINT64 7 +#define BCF_HT_VOID 8 +#define BCF_HT_DOUBLE 9 +#define BCF_NUM_HT_TYPES 10 #define BCF_VL_FIXED 0 // variable length #define BCF_VL_VAR 1 #define BCF_VL_A 2 #define BCF_VL_G 3 #define BCF_VL_R 4 - +#define BCF_VL_P 5 //ploidy +#define BCF_VL_Phased_Ploidy 6 //ploidy with phase /* === Dictionary === The header keeps three dictionaries. The first keeps IDs in the @@ -87,6 +95,10 @@ extern "C" { #define BCF_DT_CTG 1 #define BCF_DT_SAMPLE 2 +#define BCF_V_2_1_HEADER_MAGIC_STRING "BCF\2\1" +#define BCF_V_2_2_HEADER_MAGIC_STRING "BCF\2\2" +#define BCF_HEADER_MAGIC_STRING_LENGTH 5 + // Complete textual representation of a header line typedef struct bcf_hrec_t { int type; // One of the BCF_HL_* type @@ -144,10 +156,12 @@ extern uint8_t bcf_type_shift[]; #define VCF_INDEL (1<<2) #define VCF_OTHER (1<<3) #define VCF_BND (1<<4) // breakend -#define VCF_OVERLAP (1<<5) // overlapping deletion, ALT=* +#define VCF_OVERLAP (1<<5) // overlapping deletion, ALT=* +#define VCF_SPANNING_DELETION VCF_OVERLAP #define VCF_INS (1<<6) // implies VCF_INDEL #define VCF_DEL (1<<7) // implies VCF_INDEL #define VCF_ANY (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL) // any variant type (but not VCF_REF) +#define VCF_NON_REF (1<<8) typedef struct bcf_variant_t { int type, n; // variant type and the number of bases affected, negative for deletions @@ -237,6 +251,7 @@ typedef struct bcf1_t { hts_pos_t rlen; // length of REF int32_t rid; // CHROM float qual; // QUAL + hts_pos_t m_end_point; //END - must be after QUAL due to a memcpy() in vcf.c uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; kstring_t shared, indiv; @@ -338,6 +353,7 @@ typedef struct bcf1_t { */ HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_read(htsFile *fp) HTS_RESULT_USED; + bcf_hdr_t *bcf_hdr_read_required_sample_line(htsFile *hfp, const uint8_t is_sample_line_required); /** * bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed @@ -375,6 +391,16 @@ typedef struct bcf1_t { HTSLIB_EXPORT int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) HTS_RESULT_USED; + + /* + * Serialize BCF header into buffer + * + * Returns new offset value in buffer if the new data fits within the buffer capacity, + * else returns the same offset value without modifying the buffer + */ + size_t bcf_hdr_serialize(bcf_hdr_t* h, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const uint8_t keep_idx_fields); + + size_t bcf_hdr_deserialize(bcf_hdr_t* h, const uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf); /** * Parse VCF line contained in kstring and populate the bcf1_t struct * The line must not end with \n or \r characters. @@ -396,6 +422,27 @@ typedef struct bcf1_t { HTSLIB_EXPORT int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s); + /* + * Same as vcf_format, but for bcfs + * + * Returns new offset value in buffer if the new data fits within the buffer capacity, + * else returns the same offset value without modifying the buffer + * + * If vcf, then the hdr and tmp pointers must be valid. For bcfs, they might be null + */ + size_t bcf_serialize(bcf1_t* v, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr, kstring_t* tmp); + /* + * Same as vcf_parse, but for bcfs + * + * Returns new offset value in buffer if a full vcf record is read, + * else returns the same offset value + * + * If vcf, then the hdr and tmp pointers must be valid. For bcfs, they might be null + * + * Note that vcf parsing modifies the buffer (tokenize function) + */ + size_t bcf_deserialize(bcf1_t* v, uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr); + /// Read next VCF or BCF record /** @param fp The file to read the record from @param h The header for the vcf/bcf file @@ -468,7 +515,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). */ HTSLIB_EXPORT bcf_hdr_t *vcf_hdr_read(htsFile *fp) HTS_RESULT_USED; - + bcf_hdr_t *vcf_hdr_read_required_sample_line(htsFile *fp, const uint8_t is_sample_line_required); /// Write a VCF format header /** @param fp Output file @param h The header to write @@ -651,6 +698,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). /** The following functions are for internal use and should rarely be called directly */ HTSLIB_EXPORT int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt); + int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_length, + const uint8_t is_sample_line_required); /// Synchronize internal header structures /** @param h Header @@ -988,6 +1037,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). return bcf_update_info(hdr, line, key, values, n, BCF_HT_LONG); } + void bcf_set_end_point_from_info(const bcf_hdr_t* hdr, bcf1_t* line); + /* * bcf_update_format_*() - functions for updating FORMAT fields * @values: pointer to the array of values, the same number of elements @@ -1248,6 +1299,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). #define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id)>=0 && (int_id)<(hdr)->n[BCF_DT_ID] && (hdr)->id[BCF_DT_ID][int_id].val && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf) #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]) + uint64_t bcf_hdr_id2contig_length(const bcf_hdr_t* hdr, const int id); /// Convert BCF FORMAT data to string form /** * @param s kstring to write into @@ -1520,7 +1572,7 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) return e == 0 ? 0 : -1; } -static inline int bcf_enc_size(kstring_t *s, int size, int type) +static inline int bcf_enc_size(kstring_t *s, size_t size, int type) { // Most common case is first if (size < 15) { @@ -1546,11 +1598,18 @@ static inline int bcf_enc_size(kstring_t *s, int size, int type) *p++ = 1<<4|BCF_BT_INT16; i16_to_le(size, p); s->l += 4; - } else { + } + else if(size <= INT32_MAX){ *p++ = 1<<4|BCF_BT_INT32; i32_to_le(size, p); s->l += 6; } + else{ + *p++ = 1<<4|BCF_BT_INT64; + i64_to_le(size,p); + s->l += 10; + return -1; + } } return 0; } @@ -1559,7 +1618,8 @@ static inline int bcf_enc_inttype(long x) { if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) return BCF_BT_INT8; if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) return BCF_BT_INT16; - return BCF_BT_INT32; + if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) return BCF_BT_INT32; + return BCF_BT_INT64; } static inline int bcf_enc_int1(kstring_t *s, int32_t x) diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index a43ab15ae..acb208488 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -420,8 +420,12 @@ static void bcf_sr_destroy1(bcf_sr_t *reader) free(reader->fname); if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); - bcf_hdr_destroy(reader->header); - hts_close(reader->file); + if (reader->header) { + bcf_hdr_destroy(reader->header); + } + if (reader->file) { + hts_close(reader->file); + } if ( reader->itr ) tbx_itr_destroy(reader->itr); int j; for (j=0; jmbuffer; j++) @@ -693,7 +697,7 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) reader->nbuffer++; if ( reader->buffer[reader->nbuffer]->rid != reader->buffer[1]->rid ) break; - if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full + if ( reader->read_one_record_only || reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full } if ( ret<0 ) { diff --git a/vcf.c b/vcf.c index c126f7354..e6275ddad 100644 --- a/vcf.c +++ b/vcf.c @@ -1195,6 +1195,11 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) // Parse the whole header do { while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { + if(len < 0) + { + done = -1; + break; + } if (bcf_hdr_add_hrec(hdr, hrec) < 0) { bcf_hrec_destroy(hrec); return -1; @@ -1212,7 +1217,11 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) // of the next one). p += len; continue; - } + } + if(done < 0) + break; + + // Next should be the sample line. If not, it was a malformed // header, in which case print a warning and skip (many VCF @@ -2140,6 +2149,7 @@ static int bcf1_sync(bcf1_t *line) return 0; } + bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) { bcf1_sync(src); @@ -2609,6 +2619,116 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) { } #endif +int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) +{ + int64_t max = INT64_MIN, min = INT64_MAX; + int i; + if (n <= 0) { + return bcf_enc_size(s, 0, BCF_BT_NULL); + } else if (n == 1) { + return bcf_enc_long1(s, a[0]); + } else { + if (wsize <= 0) wsize = n; + + // Equivalent to: + // for (i = 0; i < n; ++i) { + // if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) + // continue; + // if (max < a[i]) max = a[i]; + // if (min > a[i]) min = a[i]; + // } + int64_t max4[4] = {INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN}; + int64_t min4[4] = {INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX}; + for (i = 0; i < (n&~3); i+=4) { + // bcf_int32_missing == INT32_MIN and + // bcf_int32_vector_end == INT32_MIN+1. + // We skip these, but can mostly avoid explicit checking + if (max4[0] < a[i+0]) max4[0] = a[i+0]; + if (max4[1] < a[i+1]) max4[1] = a[i+1]; + if (max4[2] < a[i+2]) max4[2] = a[i+2]; + if (max4[3] < a[i+3]) max4[3] = a[i+3]; + if (min4[0] > a[i+0] && a[i+0] > INT64_MIN+1) min4[0] = a[i+0]; + if (min4[1] > a[i+1] && a[i+1] > INT64_MIN+1) min4[1] = a[i+1]; + if (min4[2] > a[i+2] && a[i+2] > INT64_MIN+1) min4[2] = a[i+2]; + if (min4[3] > a[i+3] && a[i+3] > INT64_MIN+1) min4[3] = a[i+3]; + } + min = min4[0]; + if (min > min4[1]) min = min4[1]; + if (min > min4[2]) min = min4[2]; + if (min > min4[3]) min = min4[3]; + max = max4[0]; + if (max < max4[1]) max = max4[1]; + if (max < max4[2]) max = max4[2]; + if (max < max4[3]) max = max4[3]; + for (; i < n; ++i) { + if (max < a[i]) max = a[i]; + if (min > a[i] && a[i] > INT64_MIN+1) min = a[i]; + } + + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { + if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 || + ks_resize(s, s->l + n) < 0) + return -1; + uint8_t *p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i, p++) { + if ( a[i]==bcf_int64_vector_end ) *p = bcf_int8_vector_end; + else if ( a[i]==bcf_int64_missing ) *p = bcf_int8_missing; + else *p = a[i]; + } + s->l += n; + } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { + uint8_t *p; + if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 || + ks_resize(s, s->l + n * sizeof(int16_t)) < 0) + return -1; + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) + { + int16_t x; + if ( a[i]==bcf_int64_vector_end ) x = bcf_int16_vector_end; + else if ( a[i]==bcf_int64_missing ) x = bcf_int16_missing; + else x = a[i]; + i16_to_le(x, p); + p += sizeof(int16_t); + } + s->l += n * sizeof(int16_t); + } else if(max <= BCF_MAX_BT_INT32 && min >= BCF_MIN_BT_INT32){ + uint8_t *p; + if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 || + ks_resize(s, s->l + n * sizeof(int32_t)) < 0) + return -1; + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) { + int32_t x; + if ( a[i]==bcf_int64_vector_end ) x = bcf_int32_vector_end; + else if ( a[i]==bcf_int64_missing ) x = bcf_int32_missing; + else x = a[i]; + i32_to_le(x, p); + p += sizeof(int32_t); + } + s->l += n * sizeof(int32_t); + } + #ifdef VCF_ALLOW_INT64 + else { + uint8_t *p; + if(bcf_enc_size(s, wsize, BCF_BT_INT64) < 0 || ks_resize(s, s->l + n * sizeof(int64_t)) < 0) + return -1; + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) { + int64_t x = a[i]; + i64_to_le(x, p); + p += sizeof(int64_t); + } + s->l += n * sizeof(int64_t); + } +#else + return -1; +#endif + } + + return 0; +} + static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { uint8_t *p; size_t i; @@ -2675,6 +2795,7 @@ int bcf_fmt_array(kstring_t *s, int n, int type, void *data) case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, v==bcf_int8_missing, v==bcf_int8_vector_end, kputw(v, s)); break; case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break; case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break; + case BCF_BT_INT64: BRANCH(int64_t, le_to_i64, v==bcf_int64_missing, v==bcf_int64_vector_end, kputll(v, s)); break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break; default: hts_log_error("Unexpected type %d", type); exit(1); break; } @@ -3348,8 +3469,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p char *r, *key; khint_t k; vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; - int32_t *a_val = NULL; - + int64_t *a_val = NULL; v->n_info = 0; if (*(q-1) == ';') *(q-1) = 0; for (r = key = p;; ++r) { @@ -3405,7 +3525,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (*t == ',') ++n_val; // Check both int and float size in one step for simplicity if (n_val > max_n_val) { - int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val)); + int64_t *a_tmp = (int64_t *)realloc(a_val, n_val * sizeof(*a_val)); if (!a_tmp) { hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; // No appropriate code? @@ -3414,67 +3534,35 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p a_val = a_tmp; max_n_val = n_val; } - if ((y>>4&0xf) == BCF_HT_INT) { + if (((y >> 4 & 0xf) == BCF_HT_INT) || + ((y >> 4 & 0xf) == BCF_HT_LONG)) { i = 0, t = val; int64_t val1; - int is_int64 = 0; -#ifdef VCF_ALLOW_INT64 - if ( n_val==1 ) - { + for (; i < n_val; ++i, ++t) { overflow = 0; - long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); - if ( te==val ) tmp_val = bcf_int32_missing; - else if ( overflow || tmp_valBCF_MAX_BT_INT64 ) - { - if ( !extreme_int_warned ) - { - hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1); + long long int tmp_val = hts_str2int( + t, &te, sizeof(tmp_val) * CHAR_BIT, &overflow); + if (te == t) + tmp_val = bcf_int64_missing; + else if (overflow || tmp_val < BCF_MIN_BT_INT64 || + tmp_val > BCF_MAX_BT_INT64) { + if (!extreme_int_warned) { + hts_log_warning( + "Extreme INFO/%s value encountered and set to " + "missing at %s:%" PRIhts_pos, + key, bcf_seqname_safe(h, v), v->pos + 1); extreme_int_warned = 1; } - tmp_val = bcf_int32_missing; - } - else - is_int64 = 1; - val1 = tmp_val; - t = te; - i = 1; // this is just to avoid adding another nested block... - } -#endif - for (; i < n_val; ++i, ++t) - { - overflow = 0; - long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); - if ( te==t ) tmp_val = bcf_int32_missing; - else if ( overflow || tmp_valBCF_MAX_BT_INT32 ) - { - if ( !extreme_int_warned ) - { - hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1); - extreme_int_warned = 1; - } - tmp_val = bcf_int32_missing; + tmp_val = bcf_int64_missing; } a_val[i] = tmp_val; - for (t = te; *t && *t != ','; t++); + for (t = te; *t && *t != ','; t++) + ; } - if (n_val == 1) { -#ifdef VCF_ALLOW_INT64 - if ( is_int64 ) - { - v->unpacked |= BCF_IS_64BIT; - bcf_enc_long1(str, val1); - } - else - bcf_enc_int1(str, (int32_t)val1); -#else - val1 = a_val[0]; - bcf_enc_int1(str, (int32_t)val1); -#endif - } else { - bcf_enc_vint(str, n_val, a_val, -1); - } - if (n_val==1 && (val1!=bcf_int32_missing || is_int64) - && memcmp(key, "END", 4) == 0) + v->unpacked |= BCF_IS_64BIT; + bcf_enc_vlong(str, n_val, a_val, -1); + val1 = a_val[0]; + if (n_val==1 && val1!=bcf_int64_missing && memcmp(key, "END", 4) == 0)//memcmp instead of strcmp { if ( val1 <= v->pos ) { @@ -3533,22 +3621,6 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) // Assumed in lots of places, but we may as well spot this early assert(sizeof(float) == sizeof(int32_t)); - // Ensure string we parse has space to permit some over-flow when during - // parsing. Eg to do memcmp(key, "END", 4) in vcf_parse_info over - // the more straight forward looking strcmp, giving a speed advantage. - if (ks_resize(s, s->l+4) < 0) - return -1; - - // Force our memory to be initialised so we avoid the technicality of - // undefined behaviour in using a 4-byte memcmp. (The reality is this - // almost certainly is never detected by the compiler so has no impact, - // but equally so this code has minimal (often beneficial) impact on - // performance too.) - s->s[s->l+0] = 0; - s->s[s->l+1] = 0; - s->s[s->l+2] = 0; - s->s[s->l+3] = 0; - bcf_clear1(v); str = &v->shared; memset(&aux, 0, sizeof(ks_tokaux_t)); @@ -4353,6 +4425,434 @@ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) } return dst; } +typedef union { + uint32_t i; + float f; +} if_pair; + +bcf_hdr_t *vcf_hdr_read_required_sample_line(htsFile *fp, const uint8_t is_sample_line_required) +{ + kstring_t txt, *s = &fp->line; + int ret; + bcf_hdr_t *h; + tbx_t *idx = NULL; + const char **names = NULL; + h = bcf_hdr_init("r"); + if (!h) { + hts_log_error("Failed to allocate bcf header"); + return NULL; + } + txt.l = txt.m = 0; txt.s = 0; + while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) { + int e = 0; + if (s->l == 0) continue; + if (s->s[0] != '#') { + hts_log_error("No sample line"); + goto error; + } + if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here + kstring_t tmp = { 0, 0, NULL }; + hFILE *f = hopen(fp->fn_aux, "r"); + if (f == NULL) { + hts_log_error("Couldn't open \"%s\"", fp->fn_aux); + goto error; + } + while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) { + char *tab = strchr(tmp.s, '\t'); + if (tab == NULL) continue; + e |= (kputs("##contig=\n", 2, &txt) < 0); + } + free(tmp.s); + if (hclose(f) != 0) { + hts_log_error("Error on closing %s", fp->fn_aux); + goto error; + } + if (e) goto error; + } + if (kputsn(s->s, s->l, &txt) < 0) goto error; + if (kputc('\n', &txt) < 0) goto error; + if (s->s[1] != '#') break; + } + if ( ret < -1 ) goto error; + if ( !txt.s ) + { + hts_log_error("Could not read the header"); + goto error; + } + size_t hdr_length = 0ull; + if ( bcf_hdr_parse_required_sample_line(h, txt.s, &hdr_length, is_sample_line_required) < 0 ) goto error; + + // check tabix index, are all contigs listed in the header? add the missing ones + idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL); + if ( idx ) + { + int i, n, need_sync = 0; + names = tbx_seqnames(idx, &n); + if (!names) goto error; + for (i=0; ierrcode ) + { + // vcf_parse1() encountered a new contig or tag, undeclared in the + // header. At this point, the header must have been printed, + // proceeding would lead to a broken BCF file. Errors must be checked + // and cleared by the caller before we can proceed. + hts_log_error("Unchecked error (%d)", v->errcode); + return -1; + } + bcf1_sync(v); // check if the BCF record was modified + if(is_bcf) + { + if((offset+8*sizeof(int)+v->shared.l+v->indiv.l) <= capacity) + { + //First 8 integers represent various lengths + if_pair* x = (if_pair*)(buffer+offset); + x[0].i = v->shared.l + 24; // to include six 32-bit integers + x[1].i = v->indiv.l; + x[2].i = v->rid; + x[3].i = v->pos; + x[4].i = v->rlen; + x[5].f = v->qual; + x[6].i = (uint32_t)v->n_allele<<16 | v->n_info; + x[7].i = (uint32_t)v->n_fmt<<24 | v->n_sample; + offset += 8*sizeof(int); + memcpy(buffer+offset, v->shared.s, v->shared.l); + offset += v->shared.l; + memcpy(buffer+offset, v->indiv.s, v->indiv.l); + offset += v->indiv.l; + } + } + else + { + tmp->l = 0; + int status = vcf_format(hdr, v, tmp); + assert(status == 0); + if((offset+tmp->l) <= capacity) + { + memcpy(buffer+offset, tmp->s, tmp->l); + offset += tmp->l; + } + } + return offset; +} + +bcf_hdr_t *bcf_hdr_read_required_sample_line(htsFile *hfp, const uint8_t is_sample_line_required) +{ + if (hfp->format.format == vcf) + return vcf_hdr_read_required_sample_line(hfp, is_sample_line_required); + if (hfp->format.format != bcf) { + hts_log_error("Input is not detected as bcf or vcf format"); + return NULL; + } + + assert(hfp->is_bgzf); + + BGZF *fp = hfp->fp.bgzf; + uint8_t magic[5]; + bcf_hdr_t *h; + h = bcf_hdr_init("r"); + if (!h) { + hts_log_error("Failed to allocate bcf header"); + return NULL; + } + if (bgzf_read(fp, magic, 5) != 5) + { + hts_log_error("Failed to read the header (reading BCF in text mode?)"); + bcf_hdr_destroy(h); + return NULL; + } + if (strncmp((char*)magic, "BCF\2\2", 5) != 0) + { + if (!strncmp((char*)magic, "BCF", 3)) + hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported"); + else + hts_log_error("Invalid BCF2 magic string"); + bcf_hdr_destroy(h); + return NULL; + } + uint8_t buf[4]; + size_t hlen; + char *htxt = NULL; + if (bgzf_read(fp, buf, 4) != 4) goto fail; + hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24); + if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (hlen > FUZZ_ALLOC_LIMIT) { errno = ENOMEM; goto fail; } +#endif + htxt = (char*)malloc(hlen + 1); + if (!htxt) goto fail; + if (bgzf_read(fp, htxt, hlen) != hlen) goto fail; + htxt[hlen] = '\0'; // Ensure htxt is terminated + size_t hdr_length = 0ull; + bcf_hdr_parse_required_sample_line(h, htxt, &hdr_length, is_sample_line_required); // FIXME: Does this return anything meaningful? + free(htxt); + return h; + fail: + hts_log_error("Failed to read BCF header"); + free(htxt); + bcf_hdr_destroy(h); + return NULL; +} + + +int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_length, + const uint8_t is_sample_line_required) +{ + int len, done = 0; + char *p = htxt; + int return_val = 0; + + // Check sanity: "fileformat" string must come as first + bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); + if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") ) + hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?"); + + if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + + // The filter PASS must appear first in the dictionary + hrec = bcf_hdr_parse_line(hdr,"##FILTER=",&len); + if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + + // Parse the whole header + do { + while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { + if(len < 0) + { + return_val = -1; + done = -1; + break; + } + if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + p += len; + } + assert(hrec == NULL); + if (len < 0) { + // len < 0 indicates out-of-memory, or similar error + hts_log_error("Could not parse header line: %s", strerror(errno)); + return -1; + } else if (len > 0) { + // Bad header line. bcf_hdr_parse_line() will have logged it. + // Skip and try again on the next line (p + len will be the start + // of the next one). + p += len; + continue; + } + + if(done < 0) + break; + + // Next should be the sample line. If not, it was a malformed + // header, in which case print a warning and skip (many VCF + // operations do not really care about a few malformed lines). + // In the future we may want to add a strict mode that errors in + // this case. + if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) { + char *eol = strchr(p, '\n'); + if (*p != '\0') { + char buffer[320]; + hts_log_warning("Could not parse header line: %s", + hts_strprint(buffer, sizeof(buffer), + '"', p, + eol ? (eol - p) : SIZE_MAX)); + } + if (eol) { + p = eol + 1; // Try from the next line. + } else { + done = -1; // No more lines left, give up. + } + } else { + done = 1; // Sample line found + } + } while (!done); + + size_t sample_line_length = 0; + if (done < 0) { + if(is_sample_line_required) + { + // No sample line is fatal. + hts_log_error("Could not parse the header, sample line not found"); + return -1; + } + } + else + { + if(return_val >= 0) + return_val = bcf_hdr_parse_sample_line(hdr,p); + } + (*hdr_length) = ((size_t)(p - htxt)) + sample_line_length; + if(return_val >= 0) + return_val = bcf_hdr_sync(hdr); + if(return_val >= 0) + bcf_hdr_check_sanity(hdr); + return return_val; +} + + + + + +size_t bcf_hdr_serialize(bcf_hdr_t* h, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const uint8_t keep_idx_fields) +{ + if (!h) { + errno = EINVAL; + return offset; + } + if ( h->dirty ) { + if (bcf_hdr_sync(h) < 0) return offset; + } + + kstring_t htxt = {0,0,0}; + bcf_hdr_format(h, (is_bcf & keep_idx_fields), &htxt); + uint32_t hlen = htxt.l; + if(is_bcf) + { + kputc('\0', &htxt); // include the \0 byte + ++hlen; + + if((offset+5+sizeof(int)+hlen) <= capacity) + { + if(!keep_idx_fields) //htsjdk cannot deal with 2.2 header + memcpy(buffer+offset, "BCF\2\1", 5); + else + memcpy(buffer+offset, "BCF\2\2", 5); + offset += 5; + memcpy(buffer+offset, &hlen, sizeof(int)); + offset += sizeof(int); + memcpy(buffer+offset, htxt.s, hlen); + offset += hlen; + } + } + else + { + if(offset+hlen <= capacity) + { + memcpy(buffer+offset, htxt.s, hlen); + offset += hlen; + } + } + free(htxt.s); + return offset; +} + +size_t bcf_hdr_deserialize(bcf_hdr_t* h, const uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf) +{ + size_t hdr_length = 0ull; + size_t curr_offset = offset; + if(is_bcf) + { + //magic string + hdr length + if(curr_offset+BCF_HEADER_MAGIC_STRING_LENGTH+sizeof(int) > capacity) + return offset; + const char* buffer_magic_string = (const char*)(buffer+curr_offset); + if(strncmp(buffer_magic_string, BCF_V_2_2_HEADER_MAGIC_STRING, BCF_HEADER_MAGIC_STRING_LENGTH) != 0 + && strncmp(buffer_magic_string, BCF_V_2_1_HEADER_MAGIC_STRING, BCF_HEADER_MAGIC_STRING_LENGTH) != 0) + { + fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 and BCFv2.1 are supported.\n", __FILE__,__LINE__,__FUNCTION__); + return offset; + } + curr_offset += BCF_HEADER_MAGIC_STRING_LENGTH; + //Header length + memcpy(&hdr_length, buffer+curr_offset, sizeof(int)); + curr_offset += sizeof(int); + if(curr_offset+hdr_length > capacity) + return offset; + } + return bcf_hdr_parse(h, (char*)(buffer+curr_offset)); +} + +size_t bcf_deserialize(bcf1_t* v, uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr) +{ + if(is_bcf) + { + bcf_clear(v); + size_t curr_offset = offset; + if(curr_offset+8*sizeof(uint32_t) >= capacity) + return offset; + const if_pair* x = (if_pair*)(buffer+curr_offset); + size_t shared_length = x[0].i-6*sizeof(int); + size_t indiv_length = x[1].i; + if(curr_offset+8*sizeof(uint32_t)+shared_length+indiv_length > capacity) + return offset; + ks_resize(&v->shared, shared_length); + ks_resize(&v->indiv, indiv_length); + v->rid = x[2].i; + v->pos = x[3].i; + v->rlen = x[4].i; + v->qual = x[5].f; + v->n_allele = (x[6].i)>>16; v->n_info = (x[6].i)&0xffff; + v->n_fmt = (x[7].i)>>24; v->n_sample = (x[7].i)&0xffffff; + v->shared.l = shared_length, v->indiv.l = indiv_length; + // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 + if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; + curr_offset += 8*sizeof(uint32_t); + + memcpy(v->shared.s, buffer+curr_offset, shared_length); + curr_offset += shared_length; + + memcpy(v->indiv.s, buffer+curr_offset, indiv_length); + curr_offset += indiv_length; + return curr_offset; + } + else + { + kstring_t tmp; + assert(offset < capacity); + tmp.s = (char*)(buffer+offset); + size_t max_length = capacity-offset; + size_t line_length = max_length; + //See if newline exists + char* line_end_ptr = (char*)(memchr(tmp.s, '\n', max_length)); + if(line_end_ptr) + { + line_length = ((size_t)(line_end_ptr - tmp.s)); + *line_end_ptr = 0; //replace '\n' with null byte, vcf_parse doesn't like '\n' + } + tmp.l = line_length; + tmp.m = max_length; + int status = vcf_parse(&tmp, hdr, v); + //vcf parsed succesfully + if(status == 0) + return offset + line_length + (line_end_ptr ? 1u : 0u); //for the \n character + else + return offset; + } +} int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) { @@ -4721,6 +5221,7 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t if ( alt[0]=='<' ) { if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant + if( strncmp(alt, "", 9) == 0) { var->n = 0; var->type = VCF_NON_REF; return; } if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; } var->type = VCF_OTHER; @@ -4806,7 +5307,7 @@ static int bcf_set_variant_types(bcf1_t *b) // to be compatible with callers that are not expecting newer values // like VCF_INS, VCF_DEL. The full set is available from the newer // vcf_has_variant_type* interfaces. -#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP) +#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_NON_REF) int bcf_get_variant_types(bcf1_t *rec) { if ( rec->d.var_type==-1 ) { @@ -4946,11 +5447,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v #ifdef VCF_ALLOW_INT64 else if ( type==BCF_HT_LONG ) { - if (n != 1) { - hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1); - abort(); - } - bcf_enc_long1(&str, *(int64_t *) values); + bcf_enc_vlong(&str, n, (const int64_t*)values, -1); } #endif else @@ -5404,7 +5901,12 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi { int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header - if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type + if((type & 0xff) == BCF_HT_LONG) { + const int ht_type_in_hdr = bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id); + if(ht_type_in_hdr != BCF_HT_INT && ht_type_in_hdr != BCF_HT_LONG) return -2; // expected different type + } + else + if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); @@ -5477,6 +5979,14 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi } else { BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; } + case BCF_BT_INT64: + if (type == BCF_HT_LONG) { + BRANCH(int64_t, le_to_i64, p==bcf_int64_missing, p==bcf_int64_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); + } else { + hts_log_error("Trying to get 32-bit int data from a field which contains 64 bit values"); + return -2; + } + break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break; default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2; } @@ -5586,6 +6096,7 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; + case BCF_BT_INT64: BRANCH(int64_t, le_to_i64, p==bcf_int64_missing, p==bcf_int64_vector_end, *tmp=bcf_int64_missing, *tmp=bcf_int64_vector_end, *tmp=p, int64_t); break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break; default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1); } @@ -5664,3 +6175,23 @@ const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { return buffer; } +uint64_t bcf_hdr_id2contig_length(const bcf_hdr_t* hdr, const int id) +{ + bcf_hrec_t* hrec = bcf_hdr_id2hrec(hdr, BCF_DT_CTG, 0, id); + int i = 0; + for(i=0;inkeys;++i) + if(strcmp(hrec->keys[i], "length") == 0) + return strtoull(hrec->vals[i], 0, 10); + return 0; +} + +void bcf_set_end_point_from_info(const bcf_hdr_t* hdr, bcf1_t* line) +{ + bcf_unpack(line, BCF_UN_INFO); + bcf_info_t* info = bcf_get_info(hdr, line, "END"); + if(info) + line->m_end_point = info->v1.i - 1; //END value is 1 based, line->pos is 0 based, change to 0 based + else //no END tag, end is same as pos if not deletion, else depends on rlen + line->m_end_point = line->pos + line->rlen - 1; +} +