From 8ab6dfc2d8b4ec34f1996a76d536fe6883514c66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 19 Apr 2020 19:17:38 -0400 Subject: [PATCH 01/11] Refactoring --- src/Makefile | 4 +- src/bwa_utils.c | 2 +- src/contig_node_translator.c | 52 --------- src/contig_translator.c | 52 +++++++++ ..._node_translator.h => contig_translator.h} | 10 +- src/prophex_query.c | 104 +++++++++--------- src/prophex_query.h | 8 +- 7 files changed, 116 insertions(+), 116 deletions(-) delete mode 100644 src/contig_node_translator.c create mode 100644 src/contig_translator.c rename src/{contig_node_translator.h => contig_translator.h} (60%) diff --git a/src/Makefile b/src/Makefile index 21c6b04..bb1326f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -45,8 +45,8 @@ clean: # if BWA Makefile is present test -f bwa/Makefile && $(MAKE) -C bwa clean -$(PROG): bwa/libbwa.a $(AOBJS2) main.o prophex_query.o prophex_build.o klcp.o bitarray.o bwa_utils.o prophex_utils.o contig_node_translator.o - $(CC) $(INCLUDES) $(CFLAGS) $(DFLAGS) $(AOBJS2) main.o prophex_query.o prophex_build.o klcp.o bitarray.o bwa_utils.o prophex_utils.o contig_node_translator.o -o $@ -Lbwa -lbwa $(LIBS) +$(PROG): bwa/libbwa.a $(AOBJS2) main.o prophex_query.o prophex_build.o klcp.o bitarray.o bwa_utils.o prophex_utils.o contig_translator.o + $(CC) $(INCLUDES) $(CFLAGS) $(DFLAGS) $(AOBJS2) main.o prophex_query.o prophex_build.o klcp.o bitarray.o bwa_utils.o prophex_utils.o contig_translator.o -o $@ -Lbwa -lbwa $(LIBS) #bwa/libbwa.a $(AOBJS2) bwtexk.o: bwa/libbwa.a: diff --git a/src/bwa_utils.c b/src/bwa_utils.c index f129f0c..2c85543 100644 --- a/src/bwa_utils.c +++ b/src/bwa_utils.c @@ -4,7 +4,7 @@ #include #include #include "bwa.h" -#include "contig_node_translator.h" +#include "contig_translator.h" #include "khash.h" #include "kstring.h" #include "prophex_utils.h" diff --git a/src/contig_node_translator.c b/src/contig_node_translator.c deleted file mode 100644 index 51fcbd0..0000000 --- a/src/contig_node_translator.c +++ /dev/null @@ -1,52 +0,0 @@ -#include "contig_node_translator.h" -#include -#include -#include -#include "utils.h" - -#define MAX_NODES_COUNT 100000 -#define MAX_CONTIGS_COUNT 200000000 - -static int contig_to_node[MAX_CONTIGS_COUNT]; -static char* node_names[MAX_NODES_COUNT]; -static int node_name_lengths[MAX_NODES_COUNT]; -static int nodes_count = 0; -static int contigs_count = 0; - -int get_node_from_contig(int contig) { - if (contig < 0 || contig >= contigs_count) { - fprintf(stderr, "[prophex:%s] contig %d is outside of range [%d, %d]\n", __func__, contig, 0, contigs_count - 1); - } - return contig_to_node[contig]; -} - -char* get_node_name(int node) { return node_names[node]; } - -int get_node_name_length(int node) { return node_name_lengths[node]; } - -void add_contig(char* contig, int contig_number) { - xassert(contigs_count < MAX_CONTIGS_COUNT, - "[prophex] there are more than MAX_CONTIGS_COUNT contigs, try to increase MAX_CONTIGS_COUNT in contig_node_translator.c\n"); - contigs_count++; - const char* ch = strchr(contig, '@'); - int index = 0; - if (ch == NULL) { - index = strlen(contig); - } else { - index = ch - contig; - } - contig[index] = '\0'; - if (nodes_count == 0 || strcmp(contig, node_names[nodes_count - 1])) { - char* node_name = malloc((index + 1) * sizeof(char)); - memcpy(node_name, contig, index); - node_name[index] = '\0'; - xassert(nodes_count < MAX_NODES_COUNT, - "[prophex] there are more than MAX_NODES_COUNT nodes, try to increase MAX_NODES_COUNT in contig_node_translator.c\n"); - node_names[nodes_count] = node_name; - node_name_lengths[nodes_count] = strlen(node_name); - contig_to_node[contig_number] = nodes_count; - nodes_count++; - } else { - contig_to_node[contig_number] = nodes_count - 1; - } -} diff --git a/src/contig_translator.c b/src/contig_translator.c new file mode 100644 index 0000000..de91e0f --- /dev/null +++ b/src/contig_translator.c @@ -0,0 +1,52 @@ +#include "contig_translator.h" +#include +#include +#include +#include "utils.h" + +#define MAX_kmersetS_COUNT 100000 +#define MAX_CONTIGS_COUNT 200000000 + +static int contig_to_kmerset[MAX_CONTIGS_COUNT]; +static char* kmerset_names[MAX_kmersetS_COUNT]; +static int kmerset_name_lengths[MAX_kmersetS_COUNT]; +static int kmersets_count = 0; +static int contigs_count = 0; + +int get_kmerset_from_contig(int contig) { + if (contig < 0 || contig >= contigs_count) { + fprintf(stderr, "[prophex:%s] contig %d is outside of range [%d, %d]\n", __func__, contig, 0, contigs_count - 1); + } + return contig_to_kmerset[contig]; +} + +char* get_kmerset_name(int kmerset) { return kmerset_names[kmerset]; } + +int get_kmerset_name_length(int kmerset) { return kmerset_name_lengths[kmerset]; } + +void add_contig(char* contig, int contig_number) { + xassert(contigs_count < MAX_CONTIGS_COUNT, + "[prophex] there are more than MAX_CONTIGS_COUNT contigs, try to increase MAX_CONTIGS_COUNT in contig_kmerset_translator.c\n"); + contigs_count++; + const char* ch = strchr(contig, '@'); + int index = 0; + if (ch == NULL) { + index = strlen(contig); + } else { + index = ch - contig; + } + contig[index] = '\0'; + if (kmersets_count == 0 || strcmp(contig, kmerset_names[kmersets_count - 1])) { + char* kmerset_name = malloc((index + 1) * sizeof(char)); + memcpy(kmerset_name, contig, index); + kmerset_name[index] = '\0'; + xassert(kmersets_count < MAX_kmersetS_COUNT, + "[prophex] there are more than MAX_kmersetS_COUNT kmersets, try to increase MAX_kmersetS_COUNT in contig_kmerset_translator.c\n"); + kmerset_names[kmersets_count] = kmerset_name; + kmerset_name_lengths[kmersets_count] = strlen(kmerset_name); + contig_to_kmerset[contig_number] = kmersets_count; + kmersets_count++; + } else { + contig_to_kmerset[contig_number] = kmersets_count - 1; + } +} diff --git a/src/contig_node_translator.h b/src/contig_translator.h similarity index 60% rename from src/contig_node_translator.h rename to src/contig_translator.h index abf0155..2b2fc6f 100644 --- a/src/contig_node_translator.h +++ b/src/contig_translator.h @@ -4,14 +4,14 @@ Licence: MIT */ -#ifndef CONTIG_NODE_TRANSLATOR_H -#define CONTIG_NODE_TRANSLATOR_H +#ifndef CONTIG_TRANSLATOR_H +#define CONTIG_TRANSLATOR_H #include -int get_node_from_contig(int contig); -char* get_node_name(int node); -int get_node_name_length(int node); +int get_kmerset_from_contig(int contig); +char* get_kmerset_name(int node); +int get_kmerset_name_length(int node); void add_contig(char* contig, int contig_number); #endif // CONTIG_NODE_TRANSLATOR_H diff --git a/src/prophex_query.c b/src/prophex_query.c index 56e0874..ad55e79 100644 --- a/src/prophex_query.c +++ b/src/prophex_query.c @@ -5,7 +5,7 @@ #include "bwa.h" #include "bwa_utils.h" #include "bwase.h" -#include "contig_node_translator.h" +#include "contig_translator.h" #include "klcp.h" #include "kseq.h" #include "kstring.h" @@ -94,9 +94,9 @@ void sort(int count, int** array) { } } -size_t get_nodes_from_positions(const bwaidx_t* idx, const int query_length, const int positions_cnt, bwt_position_t* positions, int32_t* seen_nodes, - int8_t** seen_nodes_marks, int skip_positions_on_border) { - size_t nodes_cnt = 0; +size_t get_kmersets_from_positions(const bwaidx_t* idx, const int query_length, const int positions_cnt, bwt_position_t* positions, int32_t* seen_kmersets, + int8_t** seen_kmersets_marks, int skip_positions_on_border) { + size_t kmersets_cnt = 0; int i; for (i = 0; i < positions_cnt; ++i) { uint64_t pos = positions[i].position; @@ -108,28 +108,28 @@ size_t get_nodes_from_positions(const bwaidx_t* idx, const int query_length, con rid = bns_pos2rid(idx->bns, pos); positions[i].rid = rid; } - int node = get_node_from_contig(rid); - positions[i].node = node; - int seen = (*seen_nodes_marks)[node]; - if (!seen && node != -1 && (!skip_positions_on_border || !is_position_on_border(idx, &(positions[i]), query_length))) { - seen_nodes[nodes_cnt] = node; - ++nodes_cnt; - (*seen_nodes_marks)[node] = 1; + int kmerset = get_kmerset_from_contig(rid); + positions[i].kmerset = kmerset; + int seen = (*seen_kmersets_marks)[kmerset]; + if (!seen && kmerset != -1 && (!skip_positions_on_border || !is_position_on_border(idx, &(positions[i]), query_length))) { + seen_kmersets[kmersets_cnt] = kmerset; + ++kmersets_cnt; + (*seen_kmersets_marks)[kmerset] = 1; } } int r; - for (r = 0; r < nodes_cnt; ++r) { - (*seen_nodes_marks)[seen_nodes[r]] = 0; + for (r = 0; r < kmersets_cnt; ++r) { + (*seen_kmersets_marks)[seen_kmersets[r]] = 0; } - sort(nodes_cnt, &seen_nodes); - return nodes_cnt; + sort(kmersets_cnt, &seen_kmersets); + return kmersets_cnt; } -void output_old(int* seen_nodes, const int nodes_cnt) { - fprintf(stdout, "%d ", nodes_cnt); +void output_old(int* seen_kmersets, const int kmersets_cnt) { + fprintf(stdout, "%d ", kmersets_cnt); int r; - for (r = 0; r < nodes_cnt; ++r) { - fprintf(stdout, "%s ", get_node_name(seen_nodes[r])); + for (r = 0; r < kmersets_cnt; ++r) { + fprintf(stdout, "%s ", get_kmerset_name(seen_kmersets[r])); } fprintf(stdout, "\n"); } @@ -146,7 +146,7 @@ void strncat_with_check(char* str, char* str_to_append, int* str_length, int str } } -void construct_streaks(char** all_streaks, char** current_streak, int* seen_nodes, int nodes_cnt, int streak_size, int is_ambiguous_streak, +void construct_streaks(char** all_streaks, char** current_streak, int* seen_kmersets, int kmersets_cnt, int streak_size, int is_ambiguous_streak, int* is_first_streak) { if (*is_first_streak) { *all_streaks[0] = '\0'; @@ -156,15 +156,15 @@ void construct_streaks(char** all_streaks, char** current_streak, int* seen_node if (is_ambiguous_streak) { strcat(*current_streak, "A:"); current_streak_approximate_length += 2; - } else if (nodes_cnt > 0) { + } else if (kmersets_cnt > 0) { int r; - for (r = 0; r < nodes_cnt - 1; ++r) { - strncat_with_check(*current_streak, get_node_name(seen_nodes[r]), ¤t_streak_approximate_length, get_node_name_length(seen_nodes[r]), + for (r = 0; r < kmersets_cnt - 1; ++r) { + strncat_with_check(*current_streak, get_kmerset_name(seen_kmersets[r]), ¤t_streak_approximate_length, get_kmerset_name_length(seen_kmersets[r]), MAX_SOFT_STREAK_LENGTH); strncat_with_check(*current_streak, ",", ¤t_streak_approximate_length, 1, MAX_SOFT_STREAK_LENGTH); } - strncat_with_check(*current_streak, get_node_name(seen_nodes[nodes_cnt - 1]), ¤t_streak_approximate_length, - get_node_name_length(seen_nodes[nodes_cnt - 1]), MAX_SOFT_STREAK_LENGTH); + strncat_with_check(*current_streak, get_kmerset_name(seen_kmersets[kmersets_cnt - 1]), ¤t_streak_approximate_length, + get_kmerset_name_length(seen_kmersets[kmersets_cnt - 1]), MAX_SOFT_STREAK_LENGTH); strncat_with_check(*current_streak, ":", ¤t_streak_approximate_length, 1, MAX_SOFT_STREAK_LENGTH); } else { strncat_with_check(*current_streak, "0:", ¤t_streak_approximate_length, 2, MAX_SOFT_STREAK_LENGTH); @@ -246,12 +246,12 @@ prophex_worker_t* prophex_worker_init(const bwaidx_t* idx, int32_t seqs_cnt, con prophex_worker->aux_data[tid].positions = malloc(MAX_POSSIBLE_SA_POSITIONS * sizeof(bwt_position_t)); prophex_worker->aux_data[tid].all_streaks = malloc(MAX_STREAK_LENGTH * sizeof(char)); prophex_worker->aux_data[tid].current_streak = malloc(MAX_STREAK_LENGTH * sizeof(char)); - prophex_worker->aux_data[tid].seen_nodes = malloc(MAX_POSSIBLE_SA_POSITIONS * sizeof(int32_t)); - prophex_worker->aux_data[tid].prev_seen_nodes = malloc(MAX_POSSIBLE_SA_POSITIONS * sizeof(int32_t)); - prophex_worker->aux_data[tid].seen_nodes_marks = malloc(idx->bns->n_seqs * sizeof(int8_t)); + prophex_worker->aux_data[tid].seen_kmersets = malloc(MAX_POSSIBLE_SA_POSITIONS * sizeof(int32_t)); + prophex_worker->aux_data[tid].prev_seen_kmersets = malloc(MAX_POSSIBLE_SA_POSITIONS * sizeof(int32_t)); + prophex_worker->aux_data[tid].seen_kmersets_marks = malloc(idx->bns->n_seqs * sizeof(int8_t)); int index; for (index = 0; index < idx->bns->n_seqs; ++index) { - prophex_worker->aux_data[tid].seen_nodes_marks[index] = 0; + prophex_worker->aux_data[tid].seen_kmersets_marks[index] = 0; } prophex_worker->aux_data[tid].rids_computations = 0; prophex_worker->aux_data[tid].using_prev_rids = 0; @@ -275,14 +275,14 @@ void prophex_aux_data_destroy(prophex_query_aux_t* prophex_query_aux_data) { if (prophex_query_aux_data->current_streak) { free(prophex_query_aux_data->current_streak); } - if (prophex_query_aux_data->seen_nodes) { - free(prophex_query_aux_data->seen_nodes); + if (prophex_query_aux_data->seen_kmersets) { + free(prophex_query_aux_data->seen_kmersets); } - if (prophex_query_aux_data->prev_seen_nodes) { - free(prophex_query_aux_data->prev_seen_nodes); + if (prophex_query_aux_data->prev_seen_kmersets) { + free(prophex_query_aux_data->prev_seen_kmersets); } - if (prophex_query_aux_data->seen_nodes_marks) { - free(prophex_query_aux_data->seen_nodes_marks); + if (prophex_query_aux_data->seen_kmersets_marks) { + free(prophex_query_aux_data->seen_kmersets_marks); } } @@ -319,9 +319,9 @@ void process_sequence(void* data, int seq_index, int tid) { prophex_query_aux_t aux_data = prophex_worker->aux_data[tid]; char* current_streak = aux_data.current_streak; char* all_streaks = aux_data.all_streaks; - int32_t* seen_nodes = aux_data.seen_nodes; - int32_t* prev_seen_nodes = aux_data.prev_seen_nodes; - int8_t* seen_nodes_marks = aux_data.seen_nodes_marks; + int32_t* seen_kmersets = aux_data.seen_kmersets; + int32_t* prev_seen_kmersets = aux_data.prev_seen_kmersets; + int8_t* seen_kmersets_marks = aux_data.seen_kmersets_marks; int i; for (i = 0; i < seq.l_seq; ++i) // convert to 2-bit encoding if we have not done so @@ -336,7 +336,7 @@ void process_sequence(void* data, int seq_index, int tid) { bwt_t* bwt = idx->bwt; uint64_t k = 0, l = 0, prev_k = 1, prev_l = 0; int current_streak_size = 0; - int prev_nodes_count = 0; + int prev_kmersets_count = 0; int start_pos = 0; size_t positions_cnt = 0; uint64_t decreased_k = 1; @@ -365,7 +365,7 @@ void process_sequence(void* data, int seq_index, int tid) { } if (end_pos - last_ambiguous_index < opt->kmer_length) { if (!is_ambiguous_streak) { - construct_streaks(&all_streaks, ¤t_streak, prev_seen_nodes, prev_nodes_count, current_streak_size, is_ambiguous_streak, + construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, is_ambiguous_streak, &is_first_streak); is_ambiguous_streak = 1; current_streak_size = 1; @@ -376,7 +376,7 @@ void process_sequence(void* data, int seq_index, int tid) { continue; } else { if (is_ambiguous_streak && current_streak_size > 0) { - construct_streaks(&all_streaks, ¤t_streak, prev_seen_nodes, prev_nodes_count, current_streak_size, is_ambiguous_streak, + construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, is_ambiguous_streak, &is_first_streak); is_ambiguous_streak = 0; current_streak_size = 0; @@ -387,7 +387,7 @@ void process_sequence(void* data, int seq_index, int tid) { l = 0; prev_k = 1; prev_l = 0; - prev_nodes_count = 0; + prev_kmersets_count = 0; ambiguous_streak_just_ended = 1; } else { ambiguous_streak_just_ended = 0; @@ -406,7 +406,7 @@ void process_sequence(void* data, int seq_index, int tid) { calculate_sa_interval_restart(bwt, opt->kmer_length, seq.seq, &k, &l, start_pos); } } - int nodes_cnt = 0; + int kmersets_cnt = 0; if (k <= l) { if (prev_l - prev_k == l - k && increased_l - decreased_k == l - k) { aux_data.using_prev_rids++; @@ -415,30 +415,30 @@ void process_sequence(void* data, int seq_index, int tid) { aux_data.rids_computations++; positions_cnt = get_positions(idx, aux_data.positions, opt->kmer_length, k, l); } - nodes_cnt = get_nodes_from_positions(idx, opt->kmer_length, positions_cnt, aux_data.positions, seen_nodes, &seen_nodes_marks, + kmersets_cnt = get_kmersets_from_positions(idx, opt->kmer_length, positions_cnt, aux_data.positions, seen_kmersets, &seen_kmersets_marks, opt->skip_positions_on_border); } if (opt->output_old) { - output_old(seen_nodes, nodes_cnt); + output_old(seen_kmersets, kmersets_cnt); } else if (opt->output) { - if (start_pos == 0 || ambiguous_streak_just_ended || (equal(nodes_cnt, seen_nodes, prev_nodes_count, prev_seen_nodes))) { + if (start_pos == 0 || ambiguous_streak_just_ended || (equal(kmersets_cnt, seen_kmersets, prev_kmersets_count, prev_seen_kmersets))) { current_streak_size++; } else { - construct_streaks(&all_streaks, ¤t_streak, prev_seen_nodes, prev_nodes_count, current_streak_size, is_ambiguous_streak, + construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, is_ambiguous_streak, &is_first_streak); current_streak_size = 1; } } - int* tmp = seen_nodes; - seen_nodes = prev_seen_nodes; - prev_seen_nodes = tmp; - prev_nodes_count = nodes_cnt; + int* tmp = seen_kmersets; + seen_kmersets = prev_seen_kmersets; + prev_seen_kmersets = tmp; + prev_kmersets_count = kmersets_cnt; prev_k = k; prev_l = l; start_pos++; } if (current_streak_size > 0) { - construct_streaks(&all_streaks, ¤t_streak, prev_seen_nodes, prev_nodes_count, current_streak_size, is_ambiguous_streak, + construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, is_ambiguous_streak, &is_first_streak); } if (opt->output) { diff --git a/src/prophex_query.h b/src/prophex_query.h index f1c3009..e9b99c5 100644 --- a/src/prophex_query.h +++ b/src/prophex_query.h @@ -18,16 +18,16 @@ typedef struct { uint64_t position; int strand; int rid; - int node; + int kmerset; } bwt_position_t; typedef struct { bwt_position_t* positions; char* all_streaks; char* current_streak; - int32_t* seen_nodes; - int32_t* prev_seen_nodes; - int8_t* seen_nodes_marks; + int32_t* seen_kmersets; + int32_t* prev_seen_kmersets; + int8_t* seen_kmersets_marks; int rids_computations; int using_prev_rids; } prophex_query_aux_t; From 1ed0ac3bbcaf393a1c6e4813c0f8db06c0409c2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 19 Apr 2020 19:21:35 -0400 Subject: [PATCH 02/11] Finish refactoring --- src/contig_translator.c | 12 ++++++------ src/contig_translator.h | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/contig_translator.c b/src/contig_translator.c index de91e0f..191f733 100644 --- a/src/contig_translator.c +++ b/src/contig_translator.c @@ -4,12 +4,12 @@ #include #include "utils.h" -#define MAX_kmersetS_COUNT 100000 +#define MAX_KMERSETS_COUNT 100000 #define MAX_CONTIGS_COUNT 200000000 static int contig_to_kmerset[MAX_CONTIGS_COUNT]; -static char* kmerset_names[MAX_kmersetS_COUNT]; -static int kmerset_name_lengths[MAX_kmersetS_COUNT]; +static char* kmerset_names[MAX_KMERSETS_COUNT]; +static int kmerset_name_lengths[MAX_KMERSETS_COUNT]; static int kmersets_count = 0; static int contigs_count = 0; @@ -26,7 +26,7 @@ int get_kmerset_name_length(int kmerset) { return kmerset_name_lengths[kmerset]; void add_contig(char* contig, int contig_number) { xassert(contigs_count < MAX_CONTIGS_COUNT, - "[prophex] there are more than MAX_CONTIGS_COUNT contigs, try to increase MAX_CONTIGS_COUNT in contig_kmerset_translator.c\n"); + "[prophex] there are more than MAX_CONTIGS_COUNT contigs, try to increase MAX_CONTIGS_COUNT in contig_translator.c\n"); contigs_count++; const char* ch = strchr(contig, '@'); int index = 0; @@ -40,8 +40,8 @@ void add_contig(char* contig, int contig_number) { char* kmerset_name = malloc((index + 1) * sizeof(char)); memcpy(kmerset_name, contig, index); kmerset_name[index] = '\0'; - xassert(kmersets_count < MAX_kmersetS_COUNT, - "[prophex] there are more than MAX_kmersetS_COUNT kmersets, try to increase MAX_kmersetS_COUNT in contig_kmerset_translator.c\n"); + xassert(kmersets_count < MAX_KMERSETS_COUNT, + "[prophex] there are more than MAX_KMERSETS_COUNT kmersets, try to increase MAX_KMERSETS_COUNT in contig_translator.c\n"); kmerset_names[kmersets_count] = kmerset_name; kmerset_name_lengths[kmersets_count] = strlen(kmerset_name); contig_to_kmerset[contig_number] = kmersets_count; diff --git a/src/contig_translator.h b/src/contig_translator.h index 2b2fc6f..c28f086 100644 --- a/src/contig_translator.h +++ b/src/contig_translator.h @@ -1,5 +1,5 @@ /* - Correspondance between contig_id in BWA and node_name in taxonomic tree. + Correspondance between contig_id in BWA and kmerset_name in taxonomic tree. Author: Kamil Salikhov Licence: MIT */ @@ -10,8 +10,8 @@ #include int get_kmerset_from_contig(int contig); -char* get_kmerset_name(int node); -int get_kmerset_name_length(int node); +char* get_kmerset_name(int kmerset); +int get_kmerset_name_length(int kmerset); void add_contig(char* contig, int contig_number); -#endif // CONTIG_NODE_TRANSLATOR_H +#endif // CONTIG_TRANSLATOR_H From fdb79293ba340e01dbcd5a60b64f5e38693aa4b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 19 Apr 2020 19:22:58 -0400 Subject: [PATCH 03/11] Update version and contact --- src/main.c | 2 +- src/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.c b/src/main.c index 95999bc..a5fbe46 100644 --- a/src/main.c +++ b/src/main.c @@ -27,7 +27,7 @@ static int usage() { fprintf(stderr, "Program: prophex (a lossless k-mer index)\n"); fprintf(stderr, "Version: %s\n", VERSION); fprintf(stderr, "Authors: Kamil Salikhov, Karel Brinda, Simone Pignotti, Gregory Kucherov\n"); - fprintf(stderr, "Contact: kamil.salikhov@univ-mlv.fr\n"); + fprintf(stderr, "Contact: kamil.salikhov@univ-mlv.fr, karel.brinda@gmail.com\n"); fprintf(stderr, "\n"); fprintf(stderr, "Usage: prophex [options]\n"); fprintf(stderr, "\n"); diff --git a/src/version.h b/src/version.h index 2072b7f..a7cc638 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define VERSION "0.1.1" +#define VERSION "0.2.0" From e8e93c80c8b1d3d955f51be44d3db2cf2585ce13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 19 Apr 2020 19:53:10 -0400 Subject: [PATCH 04/11] Update output format --- README.md | 27 ++++++++++++++------------- src/prophex_query.c | 6 +++--- src/prophex_query.h | 3 +++ 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 96a917a..fa99691 100644 --- a/README.md +++ b/README.md @@ -115,19 +115,20 @@ Usage: prophex bwt2fa ## Output format -Matches are reported in an extended -[Kraken format](http://ccb.jhu.edu/software/kraken/MANUAL.html#output-format). -ProPhex produces a tab-delimited file with the following columns: - -1. Category (unused, `U` as a legacy value) -2. Sequence name -3. Final decision (unused, `0` as a legacy value) -4. Sequence length -5. Assigned k-mers. Space-delimited list of k-mer blocks with the same assignments. The list is of - the following format: comma-delimited list of sets (or `A` for ambiguous, or -   `0` for no matches), colon, length. Example: `2157,393595:1 393595:1 0:16` (the first k-mer assigned to the nodes `2157` and `393595`, the second k-mer assigned to `393595`, the subsequent 16 k-mers unassigned) -6. Bases (optional) -7. Base qualities (optional) +Matches are reported in the form of a tab-delimited file with the following +columns: + +1. Sequence name +2. Sequence length +3. Assigned k-mers. Space-delimited list of k-mer blocks matching the same + k-mer sets. The list is of the following format: comma-delimited list of + k-mer sets (`~` for an ambiguous nucleotide name `*` for no k-mer matches), + colon, the number of k-mers in the block. Example: `2157,393595:1 393595:1 + *:16` (the first k-mer assigned to the k-mer sets `2157` and `393595`, the + second k-mer assigned to `393595`, and the subsequent 16 k-mers do not match + anything) +4. Bases (optional) +5. Base qualities (optional) ## FAQs diff --git a/src/prophex_query.c b/src/prophex_query.c index ad55e79..9ecb6a5 100644 --- a/src/prophex_query.c +++ b/src/prophex_query.c @@ -154,7 +154,7 @@ void construct_streaks(char** all_streaks, char** current_streak, int* seen_kmer *current_streak[0] = '\0'; int current_streak_approximate_length = 0; if (is_ambiguous_streak) { - strcat(*current_streak, "A:"); + strcat(*current_streak, CONTAINS_AMBIG_NUCL ":"); current_streak_approximate_length += 2; } else if (kmersets_cnt > 0) { int r; @@ -167,7 +167,7 @@ void construct_streaks(char** all_streaks, char** current_streak, int* seen_kmer get_kmerset_name_length(seen_kmersets[kmersets_cnt - 1]), MAX_SOFT_STREAK_LENGTH); strncat_with_check(*current_streak, ":", ¤t_streak_approximate_length, 1, MAX_SOFT_STREAK_LENGTH); } else { - strncat_with_check(*current_streak, "0:", ¤t_streak_approximate_length, 2, MAX_SOFT_STREAK_LENGTH); + strncat_with_check(*current_streak, NO_MATCH ":", ¤t_streak_approximate_length, 2, MAX_SOFT_STREAK_LENGTH); } sprintf(*current_streak + strlen(*current_streak), "%d", streak_size); current_streak_approximate_length += 3; @@ -458,7 +458,7 @@ void process_sequences(const bwaidx_t* idx, int n_seqs, bseq1_t* seqs, const pro for (i = 0; i < n_seqs; ++i) { bseq1_t* seq = seqs + i; if (opt->output) { - fprintf(stdout, "U\t%s\t0\t%d\t", seq->name, seq->l_seq); + fprintf(stdout, "%s\t%d\t", seq->name, seq->l_seq); print_streaks(prophex_worker->output[i]); if (opt->output_read_qual) { fprintf(stdout, "\t"); diff --git a/src/prophex_query.h b/src/prophex_query.h index e9b99c5..66193e3 100644 --- a/src/prophex_query.h +++ b/src/prophex_query.h @@ -14,6 +14,9 @@ #include "klcp.h" #include "prophex_utils.h" +#define CONTAINS_AMBIG_NUCL "~" +#define NO_MATCH "*" + typedef struct { uint64_t position; int strand; From 5dae5ecfe93febc0f7a8666ebb71c46bb63ae86a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 19 Apr 2020 19:54:00 -0400 Subject: [PATCH 05/11] Update years --- LICENSE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE.txt b/LICENSE.txt index 978416e..2fa1856 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ The MIT License -Copyright (c) 2016-2018 Kamil Salikhov, Karel Brinda, Simone Pignotti, Gregory Kucherov +Copyright (c) 2016-2020 Kamil Salikhov, Karel Brinda, Simone Pignotti, Gregory Kucherov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 239e60f3f1e53d291f487e35fba2ac4b88bbba75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 19 Apr 2020 19:58:13 -0400 Subject: [PATCH 06/11] Reformat --- src/bitarray.c | 1 + src/bwa_utils.c | 1 + src/contig_translator.c | 2 ++ src/klcp.c | 2 ++ src/main.c | 1 + src/prophex_build.c | 2 ++ src/prophex_query.c | 26 ++++++++++++++------------ src/prophex_query.h | 1 + src/prophex_utils.c | 1 + src/prophex_utils.h | 1 + 10 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/bitarray.c b/src/bitarray.c index cd9bed2..2a15025 100644 --- a/src/bitarray.c +++ b/src/bitarray.c @@ -1,4 +1,5 @@ #include "bitarray.h" + #include #include diff --git a/src/bwa_utils.c b/src/bwa_utils.c index 2c85543..070faf8 100644 --- a/src/bwa_utils.c +++ b/src/bwa_utils.c @@ -3,6 +3,7 @@ #include #include #include + #include "bwa.h" #include "contig_translator.h" #include "khash.h" diff --git a/src/contig_translator.c b/src/contig_translator.c index 191f733..15bb843 100644 --- a/src/contig_translator.c +++ b/src/contig_translator.c @@ -1,7 +1,9 @@ #include "contig_translator.h" + #include #include #include + #include "utils.h" #define MAX_KMERSETS_COUNT 100000 diff --git a/src/klcp.c b/src/klcp.c index fd7cb2d..31435a6 100644 --- a/src/klcp.c +++ b/src/klcp.c @@ -1,7 +1,9 @@ #include "klcp.h" + #include #include #include + #include "utils.h" int32_t position_of_smallest_zero_bit[MAX_BITARRAY_BLOCK_VALUE + 1]; diff --git a/src/main.c b/src/main.c index a5fbe46..975068e 100644 --- a/src/main.c +++ b/src/main.c @@ -16,6 +16,7 @@ #include #include #include + #include "bwa.h" #include "bwa_utils.h" #include "prophex_build.h" diff --git a/src/prophex_build.c b/src/prophex_build.c index fc4baa1..9d666f8 100644 --- a/src/prophex_build.c +++ b/src/prophex_build.c @@ -1,7 +1,9 @@ #include "prophex_build.h" + #include #include #include + #include "bwa_utils.h" #include "bwt.h" #include "klcp.h" diff --git a/src/prophex_query.c b/src/prophex_query.c index 9ecb6a5..342d453 100644 --- a/src/prophex_query.c +++ b/src/prophex_query.c @@ -1,7 +1,9 @@ #include "prophex_query.h" + #include #include #include + #include "bwa.h" #include "bwa_utils.h" #include "bwase.h" @@ -94,8 +96,8 @@ void sort(int count, int** array) { } } -size_t get_kmersets_from_positions(const bwaidx_t* idx, const int query_length, const int positions_cnt, bwt_position_t* positions, int32_t* seen_kmersets, - int8_t** seen_kmersets_marks, int skip_positions_on_border) { +size_t get_kmersets_from_positions(const bwaidx_t* idx, const int query_length, const int positions_cnt, bwt_position_t* positions, + int32_t* seen_kmersets, int8_t** seen_kmersets_marks, int skip_positions_on_border) { size_t kmersets_cnt = 0; int i; for (i = 0; i < positions_cnt; ++i) { @@ -159,8 +161,8 @@ void construct_streaks(char** all_streaks, char** current_streak, int* seen_kmer } else if (kmersets_cnt > 0) { int r; for (r = 0; r < kmersets_cnt - 1; ++r) { - strncat_with_check(*current_streak, get_kmerset_name(seen_kmersets[r]), ¤t_streak_approximate_length, get_kmerset_name_length(seen_kmersets[r]), - MAX_SOFT_STREAK_LENGTH); + strncat_with_check(*current_streak, get_kmerset_name(seen_kmersets[r]), ¤t_streak_approximate_length, + get_kmerset_name_length(seen_kmersets[r]), MAX_SOFT_STREAK_LENGTH); strncat_with_check(*current_streak, ",", ¤t_streak_approximate_length, 1, MAX_SOFT_STREAK_LENGTH); } strncat_with_check(*current_streak, get_kmerset_name(seen_kmersets[kmersets_cnt - 1]), ¤t_streak_approximate_length, @@ -365,8 +367,8 @@ void process_sequence(void* data, int seq_index, int tid) { } if (end_pos - last_ambiguous_index < opt->kmer_length) { if (!is_ambiguous_streak) { - construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, is_ambiguous_streak, - &is_first_streak); + construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, + is_ambiguous_streak, &is_first_streak); is_ambiguous_streak = 1; current_streak_size = 1; } else { @@ -376,8 +378,8 @@ void process_sequence(void* data, int seq_index, int tid) { continue; } else { if (is_ambiguous_streak && current_streak_size > 0) { - construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, is_ambiguous_streak, - &is_first_streak); + construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, + is_ambiguous_streak, &is_first_streak); is_ambiguous_streak = 0; current_streak_size = 0; } @@ -415,8 +417,8 @@ void process_sequence(void* data, int seq_index, int tid) { aux_data.rids_computations++; positions_cnt = get_positions(idx, aux_data.positions, opt->kmer_length, k, l); } - kmersets_cnt = get_kmersets_from_positions(idx, opt->kmer_length, positions_cnt, aux_data.positions, seen_kmersets, &seen_kmersets_marks, - opt->skip_positions_on_border); + kmersets_cnt = get_kmersets_from_positions(idx, opt->kmer_length, positions_cnt, aux_data.positions, seen_kmersets, + &seen_kmersets_marks, opt->skip_positions_on_border); } if (opt->output_old) { output_old(seen_kmersets, kmersets_cnt); @@ -424,8 +426,8 @@ void process_sequence(void* data, int seq_index, int tid) { if (start_pos == 0 || ambiguous_streak_just_ended || (equal(kmersets_cnt, seen_kmersets, prev_kmersets_count, prev_seen_kmersets))) { current_streak_size++; } else { - construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, is_ambiguous_streak, - &is_first_streak); + construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, + is_ambiguous_streak, &is_first_streak); current_streak_size = 1; } } diff --git a/src/prophex_query.h b/src/prophex_query.h index 66193e3..399be39 100644 --- a/src/prophex_query.h +++ b/src/prophex_query.h @@ -8,6 +8,7 @@ #define PROPHEX_QUERY_H #include + #include "bwa.h" #include "bwt.h" #include "bwtaln.h" diff --git a/src/prophex_utils.c b/src/prophex_utils.c index b9982e0..8a2dc09 100644 --- a/src/prophex_utils.c +++ b/src/prophex_utils.c @@ -1,4 +1,5 @@ #include "prophex_utils.h" + #include prophex_opt_t* prophex_init_opt() { diff --git a/src/prophex_utils.h b/src/prophex_utils.h index 49fde15..788c5c9 100644 --- a/src/prophex_utils.h +++ b/src/prophex_utils.h @@ -9,6 +9,7 @@ #include #include + #include "bwtaln.h" // maximum total number base pairs in reads in one chunk From 4954b2b8097232f67de7b1cb4c9d22d1941e4ddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 19 Apr 2020 20:11:04 -0400 Subject: [PATCH 07/11] Update help messages --- README.md | 26 ++++++++++++-------------- src/main.c | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index fa99691..0052218 100644 --- a/README.md +++ b/README.md @@ -49,19 +49,19 @@ conda install prophex USAGE-BEGIN --> ``` -Program: prophex (a lossless k-mer index) -Version: 0.1.1 +Program: prophex (an exact k-mer index) +Version: 0.2.0 Authors: Kamil Salikhov, Karel Brinda, Simone Pignotti, Gregory Kucherov -Contact: kamil.salikhov@univ-mlv.fr +Contact: kamil.salikhov@univ-mlv.fr, karel.brinda@gmail.com Usage: prophex [options] -Command: index construct a BWA index and k-LCP - query query reads against index +Command: index index sequences in the FASTA format + query query k-mers - klcp construct an additional k-LCP - bwtdowngrade downgrade .bwt to the old, more compact format without Occ - bwt2fa reconstruct FASTA from BWT + klcp construct an additional k-LCP array + bwtdowngrade remove OCC from .bwt + bwt2fa reconstruct .fa from .fa.bwt ``` @@ -77,11 +77,9 @@ Options: -k INT k-mer length for k-LCP ``` Usage: prophex query [options] -Options: -k INT length of k-mer +Options: -k INT k-mer length -u use k-LCP for querying - -v output set of chromosomes for every k-mer - -p do not check whether k-mer is on border of two contigs, and show such k-mers in output - -b print sequences and base qualities + -b append sequences and base qualities to the output -l STR log file name to output statistics -t INT number of threads [1] -h print help message @@ -91,8 +89,8 @@ Options: -k INT length of k-mer ``` Usage: prophex klcp [options] -Options: -k INT length of k-mer - -s construct k-LCP and SA in parallel +Options: -k INT k-mer length + -s construct also SA, in parallel to k-LCP -i sampling distance for SA -h print help message diff --git a/src/main.c b/src/main.c index 975068e..74f7dec 100644 --- a/src/main.c +++ b/src/main.c @@ -25,19 +25,19 @@ static int usage() { fprintf(stderr, "\n"); - fprintf(stderr, "Program: prophex (a lossless k-mer index)\n"); + fprintf(stderr, "Program: prophex (an exact k-mer index)\n"); fprintf(stderr, "Version: %s\n", VERSION); fprintf(stderr, "Authors: Kamil Salikhov, Karel Brinda, Simone Pignotti, Gregory Kucherov\n"); fprintf(stderr, "Contact: kamil.salikhov@univ-mlv.fr, karel.brinda@gmail.com\n"); fprintf(stderr, "\n"); fprintf(stderr, "Usage: prophex [options]\n"); fprintf(stderr, "\n"); - fprintf(stderr, "Command: index construct a BWA index and k-LCP\n"); - fprintf(stderr, " query query reads against index\n"); + fprintf(stderr, "Command: index index sequences in the FASTA format\n"); + fprintf(stderr, " query query k-mers\n"); fprintf(stderr, "\n"); - fprintf(stderr, " klcp construct an additional k-LCP\n"); - fprintf(stderr, " bwtdowngrade downgrade .bwt to the old, more compact format without Occ\n"); - fprintf(stderr, " bwt2fa reconstruct FASTA from BWT\n"); + fprintf(stderr, " klcp construct an additional k-LCP array\n"); + fprintf(stderr, " bwtdowngrade remove OCC from .bwt\n"); + fprintf(stderr, " bwt2fa reconstruct .fa from .fa.bwt\n"); fprintf(stderr, "\n"); return 1; } @@ -46,8 +46,8 @@ static int usage_klcp() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: prophex klcp [options] \n"); fprintf(stderr, "\n"); - fprintf(stderr, "Options: -k INT length of k-mer\n"); - fprintf(stderr, " -s construct k-LCP and SA in parallel\n"); + fprintf(stderr, "Options: -k INT k-mer length\n"); + fprintf(stderr, " -s construct also SA, in parallel to k-LCP\n"); fprintf(stderr, " -i sampling distance for SA\n"); fprintf(stderr, " -h print help message\n"); fprintf(stderr, "\n"); @@ -85,11 +85,11 @@ static int usage_query(int threads) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: prophex query [options] \n"); fprintf(stderr, "\n"); - fprintf(stderr, "Options: -k INT length of k-mer\n"); + fprintf(stderr, "Options: -k INT k-mer length\n"); fprintf(stderr, " -u use k-LCP for querying\n"); - fprintf(stderr, " -v output set of chromosomes for every k-mer\n"); - fprintf(stderr, " -p do not check whether k-mer is on border of two contigs, and show such k-mers in output\n"); - fprintf(stderr, " -b print sequences and base qualities\n"); + //fprintf(stderr, " -v output matching k-mer sets for every k-mer\n"); + //fprintf(stderr, " -p do not check whether k-mer is on border of two contigs, and show such k-mers in output\n"); + fprintf(stderr, " -b append sequences and base qualities to the output\n"); fprintf(stderr, " -l STR log file name to output statistics\n"); fprintf(stderr, " -t INT number of threads [%d]\n", threads); fprintf(stderr, " -h print help message\n"); From 1280d3c295dce3281cc521f6eda2e7c2c358d9c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 19 Apr 2020 20:12:48 -0400 Subject: [PATCH 08/11] Add toc --- README.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0052218..0883f05 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,24 @@ designed as a core computational component of classifier allowing fast and accurate read assignment. + + +* [Getting started](#getting-started) + * [Alternative ways of installation](#alternative-ways-of-installation) +* [Quick example](#quick-example) +* [ProPhex commands](#prophex-commands) +* [Output format](#output-format) +* [FAQs](#faqs) +* [Issues](#issues) +* [Changelog](#changelog) +* [Licence](#licence) +* [Authors](#authors) + + + + + + ## Getting started ``` @@ -44,7 +62,7 @@ conda install prophex -# ProPhex commands +## ProPhex commands From 99acb98432d8725361928fc0afa2e1a2181e37c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Mon, 20 Apr 2020 14:23:25 -0400 Subject: [PATCH 09/11] Fix output for sequences for shorter than kmers --- src/prophex_query.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prophex_query.c b/src/prophex_query.c index 342d453..19a505b 100644 --- a/src/prophex_query.c +++ b/src/prophex_query.c @@ -350,7 +350,7 @@ void process_sequence(void* data, int seq_index, int tid) { if (start_pos + opt->kmer_length > seq.l_seq) { if (opt->output) { prophex_worker->output[seq_index] = malloc(5 * sizeof(char)); - strncpy(prophex_worker->output[seq_index], "0:0", 5); + strncpy(prophex_worker->output[seq_index], NO_MATCH ":0", 5); } } else { int index = 0; From 4ed14cf9aeb54e5e94fbf2a8a5cd969c7a92fb63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Mon, 20 Apr 2020 15:19:21 -0400 Subject: [PATCH 10/11] Add comments to process_sequence --- src/prophex_query.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/prophex_query.c b/src/prophex_query.c index 19a505b..66533db 100644 --- a/src/prophex_query.c +++ b/src/prophex_query.c @@ -359,8 +359,15 @@ void process_sequence(void* data, int seq_index, int tid) { last_ambiguous_index = index; } } + /* + Iterate through the current query sequence + */ while (start_pos + opt->kmer_length <= seq.l_seq) { int end_pos = start_pos + opt->kmer_length - 1; + /* + 1. Treat ambiguous nucleotides in queries if the current k-mer window contains any + (-> ambiguous streaks) + */ if (opt->output) { if (start_pos > 0 && seq.seq[end_pos] > 3) { last_ambiguous_index = end_pos; @@ -395,6 +402,10 @@ void process_sequence(void* data, int seq_index, int tid) { ambiguous_streak_just_ended = 0; } } + + /* + 2. Update SA interval [k,l) + */ if (start_pos == 0 || ambiguous_streak_just_ended) { k = 0; l = 0; @@ -408,18 +419,29 @@ void process_sequence(void* data, int seq_index, int tid) { calculate_sa_interval_restart(bwt, opt->kmer_length, seq.seq, &k, &l, start_pos); } } + + /* + 3. Translate: SA interval -> genomic coordinates -> k-mer set names + */ int kmersets_cnt = 0; if (k <= l) { if (prev_l - prev_k == l - k && increased_l - decreased_k == l - k) { + /* SA interval length hasn't changed -> just increment coordinates */ aux_data.using_prev_rids++; shift_positions_by_one(idx, positions_cnt, aux_data.positions, opt->kmer_length, k, l); } else { + /* SA interval length hasn changed -> recalculate coordinates */ aux_data.rids_computations++; positions_cnt = get_positions(idx, aux_data.positions, opt->kmer_length, k, l); } + /* coordinates -> k-mer set names */ kmersets_cnt = get_kmersets_from_positions(idx, opt->kmer_length, positions_cnt, aux_data.positions, seen_kmersets, &seen_kmersets_marks, opt->skip_positions_on_border); } + + /* + 4. Increment streak length or create a new one + */ if (opt->output_old) { output_old(seen_kmersets, kmersets_cnt); } else if (opt->output) { @@ -438,11 +460,14 @@ void process_sequence(void* data, int seq_index, int tid) { prev_k = k; prev_l = l; start_pos++; - } + } // while (start_pos + opt->kmer_length <= seq.l_seq) + if (current_streak_size > 0) { construct_streaks(&all_streaks, ¤t_streak, prev_seen_kmersets, prev_kmersets_count, current_streak_size, is_ambiguous_streak, &is_first_streak); } + + /* Add k-mer matches of the current query sequence to the worker buffer. */ if (opt->output) { size_t all_streaks_length = strlen(all_streaks); prophex_worker->output[seq_index] = malloc((all_streaks_length + 1) * sizeof(char)); From 96bfeb88ebfd40b372bf213016e36719269ce374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Mon, 27 Apr 2020 18:41:59 -0400 Subject: [PATCH 11/11] Dont rebuild readme automatically --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0e5068e..ad8ca18 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ IND=./prophex DEPS= $(wildcard src/*.h) $(wildcard src/*.c) $(wildcard src/bwa/.*.h) $(wildcard src/bwa/*.c) -all: prophex readme +all: prophex #readme prophex: $(DEPS) $(MAKE) -C src