From fdb11a1eb5c079076ff3520e0329a634d09ae12c Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 27 Jan 2026 12:16:14 -0800 Subject: [PATCH] chore: update to tskit-c 1.3.0 --- subprojects/tskit/VERSION.txt | 2 +- subprojects/tskit/tskit/core.c | 125 +++--- subprojects/tskit/tskit/core.h | 56 +-- subprojects/tskit/tskit/genotypes.c | 239 +++++++++++- subprojects/tskit/tskit/tables.c | 51 ++- subprojects/tskit/tskit/tables.h | 16 +- subprojects/tskit/tskit/trees.c | 578 ++++++++++++++++------------ subprojects/tskit/tskit/trees.h | 59 ++- 8 files changed, 788 insertions(+), 338 deletions(-) diff --git a/subprojects/tskit/VERSION.txt b/subprojects/tskit/VERSION.txt index 8428158dc..589268e6f 100644 --- a/subprojects/tskit/VERSION.txt +++ b/subprojects/tskit/VERSION.txt @@ -1 +1 @@ -1.1.2 \ No newline at end of file +1.3.0 \ No newline at end of file diff --git a/subprojects/tskit/tskit/core.c b/subprojects/tskit/tskit/core.c index 53cc0ce67..0f31550a7 100644 --- a/subprojects/tskit/tskit/core.c +++ b/subprojects/tskit/tskit/core.c @@ -584,6 +584,14 @@ tsk_strerror_internal(int err) ret = "Must have at least one allele when specifying an allele map. " "(TSK_ERR_ZERO_ALLELES)"; break; + case TSK_ERR_BAD_ALLELE_LENGTH: + ret = "Alleles used when decoding alignments must have length one. " + "(TSK_ERR_BAD_ALLELE_LENGTH)"; + break; + case TSK_ERR_MISSING_CHAR_COLLISION: + ret = "Alleles used when decoding alignments must not match the missing " + "data character. (TSK_ERR_MISSING_CHAR_COLLISION)"; + break; /* Distance metric errors */ case TSK_ERR_SAMPLE_SIZE_MISMATCH: @@ -1260,16 +1268,16 @@ tsk_avl_tree_int_ordered_nodes(const tsk_avl_tree_int_t *self, tsk_avl_node_int_ } // Bit Array implementation. Allows us to store unsigned integers in a compact manner. -// Currently implemented as an array of 32-bit unsigned integers for ease of counting. +// Currently implemented as an array of 32-bit unsigned integers. 
int -tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length) +tsk_bitset_init(tsk_bitset_t *self, tsk_size_t num_bits, tsk_size_t length) { int ret = 0; - self->size = (num_bits >> TSK_BIT_ARRAY_CHUNK) - + (num_bits % TSK_BIT_ARRAY_NUM_BITS ? 1 : 0); - self->data = tsk_calloc(self->size * length, sizeof(*self->data)); + self->row_len = (num_bits / TSK_BITSET_BITS) + (num_bits % TSK_BITSET_BITS ? 1 : 0); + self->len = length; + self->data = tsk_calloc(self->row_len * length, sizeof(*self->data)); if (self->data == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; @@ -1278,96 +1286,111 @@ tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length return ret; } -void -tsk_bit_array_get_row(const tsk_bit_array_t *self, tsk_size_t row, tsk_bit_array_t *out) -{ - out->size = self->size; - out->data = self->data + (row * self->size); -} +#define BITSET_DATA_ROW(bs, row) ((bs)->data + (row) * (bs)->row_len) void -tsk_bit_array_intersect( - const tsk_bit_array_t *self, const tsk_bit_array_t *other, tsk_bit_array_t *out) +tsk_bitset_intersect(const tsk_bitset_t *self, tsk_size_t self_row, + const tsk_bitset_t *other, tsk_size_t other_row, tsk_bitset_t *out) { - for (tsk_size_t i = 0; i < self->size; i++) { - out->data[i] = self->data[i] & other->data[i]; + const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row); + const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row); + tsk_bitset_val_t *restrict out_d = out->data; + for (tsk_size_t i = 0; i < self->row_len; i++) { + out_d[i] = self_d[i] & other_d[i]; } } void -tsk_bit_array_subtract(tsk_bit_array_t *self, const tsk_bit_array_t *other) +tsk_bitset_subtract(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, + tsk_size_t other_row) { - for (tsk_size_t i = 0; i < self->size; i++) { - self->data[i] &= ~(other->data[i]); + tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row); + const tsk_bitset_val_t 
*restrict other_d = BITSET_DATA_ROW(other, other_row); + for (tsk_size_t i = 0; i < self->row_len; i++) { + self_d[i] &= ~(other_d[i]); } } void -tsk_bit_array_add(tsk_bit_array_t *self, const tsk_bit_array_t *other) +tsk_bitset_union(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, + tsk_size_t other_row) { - for (tsk_size_t i = 0; i < self->size; i++) { - self->data[i] |= other->data[i]; + tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row); + const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row); + for (tsk_size_t i = 0; i < self->row_len; i++) { + self_d[i] |= other_d[i]; } } void -tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bit) +tsk_bitset_set_bit(tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit) { - tsk_bit_array_value_t i = bit >> TSK_BIT_ARRAY_CHUNK; - self->data[i] |= (tsk_bit_array_value_t) 1 << (bit - (TSK_BIT_ARRAY_NUM_BITS * i)); + tsk_bitset_val_t i = (bit / TSK_BITSET_BITS); + *(BITSET_DATA_ROW(self, row) + i) |= (tsk_bitset_val_t) 1 + << (bit - (TSK_BITSET_BITS * i)); } bool -tsk_bit_array_contains(const tsk_bit_array_t *self, const tsk_bit_array_value_t bit) +tsk_bitset_contains(const tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit) { - tsk_bit_array_value_t i = bit >> TSK_BIT_ARRAY_CHUNK; - return self->data[i] - & ((tsk_bit_array_value_t) 1 << (bit - (TSK_BIT_ARRAY_NUM_BITS * i))); + tsk_bitset_val_t i = (bit / TSK_BITSET_BITS); + return *(BITSET_DATA_ROW(self, row) + i) + & ((tsk_bitset_val_t) 1 << (bit - (TSK_BITSET_BITS * i))); } -tsk_size_t -tsk_bit_array_count(const tsk_bit_array_t *self) +static inline uint32_t +popcount(tsk_bitset_val_t v) { - // Utilizes 12 operations per bit array. NB this only works on 32 bit integers. + // Utilizes 12 operations per chunk. NB this only works on 32 bit integers. 
// Taken from: // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel // There's a nice breakdown of this algorithm here: // https://stackoverflow.com/a/109025 - // Could probably do better with explicit SIMD (instead of SWAR), but not as - // portable: https://arxiv.org/pdf/1611.07612.pdf // - // There is one solution to explore further, which uses __builtin_popcountll. - // This option is relatively simple, but requires a 64 bit bit array and also - // involves some compiler flag plumbing (-mpopcnt) + // The gcc/clang compiler flag -mpopcnt will convert this code to a + popcnt instruction (most if not all modern CPUs will support this). The + popcnt instruction will yield some speed improvements, which depend on + the tree sequence. + // + // NB: 32bit counting is typically faster than 64bit counting for this task. + // (at least on x86-64) + + v = v - ((v >> 1) & 0x55555555); + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; +} - tsk_bit_array_value_t tmp; - tsk_size_t i, count = 0; +tsk_size_t +tsk_bitset_count(const tsk_bitset_t *self, tsk_size_t row) +{ + tsk_size_t i = 0; + tsk_size_t count = 0; + const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, row); - for (i = 0; i < self->size; i++) { - tmp = self->data[i] - ((self->data[i] >> 1) & 0x55555555); - tmp = (tmp & 0x33333333) + ((tmp >> 2) & 0x33333333); - count += (((tmp + (tmp >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; + for (i = 0; i < self->row_len; i++) { + count += popcount(self_d[i]); } return count; } void -tsk_bit_array_get_items( - const tsk_bit_array_t *self, tsk_id_t *items, tsk_size_t *n_items) +tsk_bitset_get_items( + const tsk_bitset_t *self, tsk_size_t row, tsk_id_t *items, tsk_size_t *n_items) { // Get the items stored in the row of a bitset. - // Uses a de Bruijn sequence lookup table to determine the lowest bit set. 
See the - // wikipedia article for more info: https://w.wiki/BYiF + // Uses a de Bruijn sequence lookup table to determine the lowest bit set. + // See the wikipedia article for more info: https://w.wiki/BYiF tsk_size_t i, n, off; - tsk_bit_array_value_t v, lsb; // least significant bit + tsk_bitset_val_t v, lsb; // least significant bit static const tsk_id_t lookup[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; + const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, row); n = 0; - for (i = 0; i < self->size; i++) { - v = self->data[i]; - off = i * ((tsk_size_t) TSK_BIT_ARRAY_NUM_BITS); + for (i = 0; i < self->row_len; i++) { + v = self_d[i]; + off = i * TSK_BITSET_BITS; if (v == 0) { continue; } @@ -1381,7 +1404,7 @@ tsk_bit_array_get_items( } void -tsk_bit_array_free(tsk_bit_array_t *self) +tsk_bitset_free(tsk_bitset_t *self) { tsk_safe_free(self->data); } diff --git a/subprojects/tskit/tskit/core.h b/subprojects/tskit/tskit/core.h index 7dd24eba5..481905b7a 100644 --- a/subprojects/tskit/tskit/core.h +++ b/subprojects/tskit/tskit/core.h @@ -147,7 +147,7 @@ sizes and types of externally visible structs. The library minor version. Incremented when non-breaking backward-compatible changes to the API or ABI are introduced, i.e., the addition of a new function. */ -#define TSK_VERSION_MINOR 2 +#define TSK_VERSION_MINOR 3 /** The library patch version. Incremented when any changes not relevant to the to the API or ABI are introduced, i.e., internal refactors of bugfixes. @@ -803,6 +803,14 @@ More than 2147483647 alleles were specified. A user-specified allele map was used, but it contained zero alleles. */ #define TSK_ERR_ZERO_ALLELES -1103 +/** +An allele used when decoding alignments had length other than one. +*/ +#define TSK_ERR_BAD_ALLELE_LENGTH -1104 +/** +An allele used when decoding alignments matched the missing data character. 
+*/ +#define TSK_ERR_MISSING_CHAR_COLLISION -1105 /** @} */ /** @@ -1104,29 +1112,31 @@ FILE *tsk_get_debug_stream(void); /* Bit Array functionality */ -typedef uint32_t tsk_bit_array_value_t; +// define a 32-bit chunk size for our bitsets. +// this means we'll be able to hold 32 distinct items in each 32 bit uint +#define TSK_BITSET_BITS ((tsk_size_t) 32) +typedef uint32_t tsk_bitset_val_t; + typedef struct { - tsk_size_t size; // Number of chunks per row - tsk_bit_array_value_t *data; // Array data -} tsk_bit_array_t; - -#define TSK_BIT_ARRAY_CHUNK 5U -#define TSK_BIT_ARRAY_NUM_BITS (1U << TSK_BIT_ARRAY_CHUNK) - -int tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length); -void tsk_bit_array_free(tsk_bit_array_t *self); -void tsk_bit_array_get_row( - const tsk_bit_array_t *self, tsk_size_t row, tsk_bit_array_t *out); -void tsk_bit_array_intersect( - const tsk_bit_array_t *self, const tsk_bit_array_t *other, tsk_bit_array_t *out); -void tsk_bit_array_subtract(tsk_bit_array_t *self, const tsk_bit_array_t *other); -void tsk_bit_array_add(tsk_bit_array_t *self, const tsk_bit_array_t *other); -void tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bit); -bool tsk_bit_array_contains( - const tsk_bit_array_t *self, const tsk_bit_array_value_t bit); -tsk_size_t tsk_bit_array_count(const tsk_bit_array_t *self); -void tsk_bit_array_get_items( - const tsk_bit_array_t *self, tsk_id_t *items, tsk_size_t *n_items); + tsk_size_t row_len; // Number of size TSK_BITSET_BITS chunks per row + tsk_size_t len; // Number of rows + tsk_bitset_val_t *data; +} tsk_bitset_t; + +int tsk_bitset_init(tsk_bitset_t *self, tsk_size_t num_bits, tsk_size_t length); +void tsk_bitset_free(tsk_bitset_t *self); +void tsk_bitset_intersect(const tsk_bitset_t *self, tsk_size_t self_row, + const tsk_bitset_t *other, tsk_size_t other_row, tsk_bitset_t *out); +void tsk_bitset_subtract(tsk_bitset_t *self, tsk_size_t self_row, + const tsk_bitset_t *other, 
tsk_size_t other_row); +void tsk_bitset_union(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, + tsk_size_t other_row); +void tsk_bitset_set_bit(tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit); +bool tsk_bitset_contains( + const tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit); +tsk_size_t tsk_bitset_count(const tsk_bitset_t *self, tsk_size_t row); +void tsk_bitset_get_items( + const tsk_bitset_t *self, tsk_size_t row, tsk_id_t *items, tsk_size_t *n_items); #ifdef __cplusplus } diff --git a/subprojects/tskit/tskit/genotypes.c b/subprojects/tskit/tskit/genotypes.c index c2385281b..304fb3f36 100644 --- a/subprojects/tskit/tskit/genotypes.c +++ b/subprojects/tskit/tskit/genotypes.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -90,12 +91,10 @@ tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles) static int variant_init_samples_and_index_map(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples, - size_t num_samples_alloc, tsk_flags_t options) + size_t num_samples_alloc, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; - const tsk_flags_t *flags = tree_sequence->tables->nodes.flags; tsk_size_t j, num_nodes; - bool impute_missing = !!(options & TSK_ISOLATED_NOT_MISSING); tsk_id_t u; num_nodes = tsk_treeseq_get_num_nodes(tree_sequence); @@ -120,11 +119,6 @@ variant_init_samples_and_index_map(tsk_variant_t *self, ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } - /* We can only detect missing data for samples */ - if (!impute_missing && !(flags[u] & TSK_NODE_IS_SAMPLE)) { - ret = tsk_trace_error(TSK_ERR_MUST_IMPUTE_NON_SAMPLES); - goto out; - } self->alt_sample_index_map[samples[j]] = (tsk_id_t) j; } out: @@ -439,6 +433,27 @@ tsk_variant_mark_missing(tsk_variant_t *self) return num_missing; } +/* Mark missing for any requested node (sample or non-sample) that is isolated + * in the current tree, i.e., has no parent and no 
children at this position. */ +static tsk_size_t +tsk_variant_mark_missing_any(tsk_variant_t *self) +{ + tsk_size_t num_missing = 0; + int32_t *restrict genotypes = self->genotypes; + const tsk_id_t *restrict parent = self->tree.parent; + const tsk_id_t *restrict left_child = self->tree.left_child; + tsk_size_t j; + + for (j = 0; j < self->num_samples; j++) { + tsk_id_t u = self->samples[j]; + if (parent[u] == TSK_NULL && left_child[u] == TSK_NULL) { + genotypes[j] = TSK_MISSING_DATA; + num_missing++; + } + } + return num_missing; +} + static tsk_id_t tsk_variant_get_allele_index(tsk_variant_t *self, const char *allele, tsk_size_t length) { @@ -502,6 +517,10 @@ tsk_variant_decode( update_genotypes = tsk_variant_update_genotypes_sample_list; if (by_traversal) { update_genotypes = tsk_variant_update_genotypes_traversal; + /* When decoding a user-provided list of nodes (which may include + * non-samples), mark isolated nodes as missing directly by checking + * isolation status for each requested node. 
*/ + mark_missing = tsk_variant_mark_missing_any; } if (self->user_alleles) { @@ -644,3 +663,207 @@ tsk_vargen_next(tsk_vargen_t *self, tsk_variant_t **variant) out: return ret; } + +static int +tsk_treeseq_decode_alignments_overlay_missing(const tsk_treeseq_t *self, + const tsk_id_t *nodes, tsk_size_t num_nodes, double left, double right, + char missing_data_character, tsk_size_t L, char *alignments_out) +{ + int ret = 0; + tsk_tree_t tree; + tsk_size_t i, seg_left, seg_right; + char *row = NULL; + tsk_id_t u; + + tsk_memset(&tree, 0, sizeof(tree)); + + ret = tsk_tree_init(&tree, self, 0); + if (ret != 0) { + goto out; + } + ret = tsk_tree_seek(&tree, left, 0); + if (ret != 0) { + goto out; + } + while (tree.index != -1 && tree.interval.left < right) { + seg_left = TSK_MAX((tsk_size_t) tree.interval.left, (tsk_size_t) left); + seg_right = TSK_MIN((tsk_size_t) tree.interval.right, (tsk_size_t) right); + if (seg_right > seg_left) { + for (i = 0; i < num_nodes; i++) { + u = nodes[i]; + if (tree.parent[u] == TSK_NULL && tree.left_child[u] == TSK_NULL) { + row = alignments_out + i * L; + /* memset takes an `int`, `missing_data_character` is a `char` which + * can be signed or unsigned depending on the platform, so we need to + * cast. Some tools/compilers will warn if we just cast + * to `unsigned char` and leave the cast to `int` as implicit, hence + * the double cast. 
*/ + tsk_memset(row + (seg_left - (tsk_size_t) left), + (int) (unsigned char) missing_data_character, + seg_right - seg_left); + } + } + } + ret = tsk_tree_next(&tree); + if (ret < 0) { + goto out; + } + } + + /* On success we should return 0, not TSK_TREE_OK from the last tsk_tree_next */ + ret = 0; +out: + tsk_tree_free(&tree); + return ret; +} + +static int +tsk_treeseq_decode_alignments_overlay_sites(const tsk_treeseq_t *self, + const tsk_id_t *nodes, tsk_size_t num_nodes, double left, double right, + char missing_data_character, tsk_size_t L, char *alignments_out, tsk_flags_t options) +{ + int ret = 0; + tsk_variant_t var; + tsk_id_t site_id; + tsk_site_t site; + char *allele_byte = NULL; + tsk_size_t allele_cap = 0; + tsk_size_t i, j; + char *row = NULL; + int32_t g; + char c; + char *tmp = NULL; + + tsk_memset(&var, 0, sizeof(var)); + + ret = tsk_variant_init(&var, self, nodes, num_nodes, NULL, options); + if (ret != 0) { + goto out; + } + for (site_id = 0; site_id < (tsk_id_t) tsk_treeseq_get_num_sites(self); site_id++) { + ret = tsk_treeseq_get_site(self, site_id, &site); + if (ret != 0) { + goto out; + } + if (site.position < left) { + continue; + } + if (site.position >= right) { + break; + } + ret = tsk_variant_decode(&var, site_id, 0); + if (ret != 0) { + goto out; + } + if (var.num_alleles > 0) { + if (var.num_alleles > allele_cap) { + tmp = tsk_realloc(allele_byte, var.num_alleles * sizeof(*allele_byte)); + if (tmp == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; + } + allele_byte = tmp; + allele_cap = var.num_alleles; + } + for (j = 0; j < var.num_alleles; j++) { + if (var.allele_lengths[j] != 1) { + ret = tsk_trace_error(TSK_ERR_BAD_ALLELE_LENGTH); + goto out; + } + allele_byte[j] = var.alleles[j][0]; + if (allele_byte[j] == missing_data_character) { + ret = tsk_trace_error(TSK_ERR_MISSING_CHAR_COLLISION); + goto out; + } + } + for (i = 0; i < num_nodes; i++) { + row = alignments_out + i * L; + g = var.genotypes[i]; + c = 
missing_data_character; + if (g != TSK_MISSING_DATA) { + tsk_bug_assert(g >= 0); + tsk_bug_assert((tsk_size_t) g < var.num_alleles); + c = allele_byte[g]; + } + row[((tsk_size_t) site.position) - (tsk_size_t) left] = (char) c; + } + } + } + +out: + tsk_safe_free(allele_byte); + tsk_variant_free(&var); + return ret; +} + +/* NOTE: We usually keep functions with a tsk_treeseq_t signature in trees.c. + * tsk_treeseq_decode_alignments is implemented here instead because it + * depends directly on tsk_variant_t and the genotype/allele machinery in + * this file (and thus on genotypes.h). This slightly breaks that layering + * convention but keeps the implementation close to the variant code. */ +int +tsk_treeseq_decode_alignments(const tsk_treeseq_t *self, const char *ref_seq, + tsk_size_t ref_seq_length, const tsk_id_t *nodes, tsk_size_t num_nodes, double left, + double right, char missing_data_character, char *alignments_out, tsk_flags_t options) +{ + int ret = 0; + tsk_size_t i, L; + char *row = NULL; + + if (!tsk_treeseq_get_discrete_genome(self)) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + if (ref_seq == NULL) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + if (ref_seq_length != (tsk_size_t) tsk_treeseq_get_sequence_length(self)) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + if (trunc(left) != left || trunc(right) != right) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + if (left < 0 || right > tsk_treeseq_get_sequence_length(self) + || (tsk_size_t) left >= (tsk_size_t) right) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + L = (tsk_size_t) right - (tsk_size_t) left; + if (num_nodes == 0) { + return 0; + } + if (nodes == NULL || alignments_out == NULL) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + for (i = 0; i < num_nodes; i++) { + if (nodes[i] < 0 || nodes[i] >= (tsk_id_t) tsk_treeseq_get_num_nodes(self)) { + ret = 
tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); + goto out; + } + } + + /* Fill rows with the reference slice */ + for (i = 0; i < num_nodes; i++) { + row = alignments_out + i * L; + tsk_memcpy(row, ref_seq + (tsk_size_t) left, L); + } + if (!(options & TSK_ISOLATED_NOT_MISSING)) { + ret = tsk_treeseq_decode_alignments_overlay_missing(self, nodes, num_nodes, left, + right, missing_data_character, L, alignments_out); + if (ret != 0) { + goto out; + } + } + ret = tsk_treeseq_decode_alignments_overlay_sites(self, nodes, num_nodes, left, + right, missing_data_character, L, alignments_out, options); + if (ret != 0) { + goto out; + } + +out: + return ret; +} diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index 9805d669a..8b92be497 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -11078,7 +11078,7 @@ tsk_table_collection_check_integrity( | TSK_CHECK_MIGRATION_ORDERING | TSK_CHECK_INDEXES; } - if (self->sequence_length <= 0) { + if (!tsk_isfinite(self->sequence_length) || self->sequence_length <= 0) { ret = tsk_trace_error(TSK_ERR_BAD_SEQUENCE_LENGTH); goto out; } @@ -12436,8 +12436,27 @@ tsk_table_collection_compute_mutation_parents( tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; + tsk_mutation_table_t *mutations = &self->mutations; + tsk_id_t *parent_backup = NULL; + bool restore_parents = false; if (!(options & TSK_NO_CHECK_INTEGRITY)) { + if (mutations->num_rows > 0) { + /* We need to wipe the parent column before computing, as otherwise invalid + * parents can cause integrity checks to fail. 
We take a copy to restore on + * error */ + parent_backup = tsk_malloc(mutations->num_rows * sizeof(*parent_backup)); + if (parent_backup == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; + } + tsk_memcpy(parent_backup, mutations->parent, + mutations->num_rows * sizeof(*parent_backup)); + /* Set the parent pointers to TSK_NULL */ + tsk_memset(mutations->parent, 0xff, + mutations->num_rows * sizeof(*mutations->parent)); + restore_parents = true; + } /* Safe to cast here as we're not counting trees */ ret = (int) tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); if (ret < 0) { @@ -12452,6 +12471,11 @@ tsk_table_collection_compute_mutation_parents( } out: + if (ret != 0 && restore_parents) { + tsk_memcpy(mutations->parent, parent_backup, + mutations->num_rows * sizeof(*parent_backup)); + } + tsk_safe_free(parent_backup); return ret; } @@ -13202,6 +13226,8 @@ tsk_table_collection_union(tsk_table_collection_t *self, tsk_id_t *site_map = NULL; bool add_populations = !(options & TSK_UNION_NO_ADD_POP); bool check_shared_portion = !(options & TSK_UNION_NO_CHECK_SHARED); + bool all_edges = !!(options & TSK_UNION_ALL_EDGES); + bool all_mutations = !!(options & TSK_UNION_ALL_MUTATIONS); /* Not calling TSK_CHECK_TREES so casting to int is safe */ ret = (int) tsk_table_collection_check_integrity(self, 0); @@ -13285,7 +13311,7 @@ tsk_table_collection_union(tsk_table_collection_t *self, // edges for (k = 0; k < (tsk_id_t) other->edges.num_rows; k++) { tsk_edge_table_get_row_unsafe(&other->edges, k, &edge); - if ((other_node_mapping[edge.parent] == TSK_NULL) + if (all_edges || (other_node_mapping[edge.parent] == TSK_NULL) || (other_node_mapping[edge.child] == TSK_NULL)) { new_parent = node_map[edge.parent]; new_child = node_map[edge.child]; @@ -13298,14 +13324,31 @@ tsk_table_collection_union(tsk_table_collection_t *self, } } - // mutations and sites + // sites + // first do the "disjoint" (all_mutations) case, where we just add all sites; + // otherwise 
we want to just add sites for new mutations + if (all_mutations) { + for (k = 0; k < (tsk_id_t) other->sites.num_rows; k++) { + tsk_site_table_get_row_unsafe(&other->sites, k, &site); + ret_id = tsk_site_table_add_row(&self->sites, site.position, + site.ancestral_state, site.ancestral_state_length, site.metadata, + site.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + site_map[site.id] = ret_id; + } + } + + // mutations (and maybe sites) i = 0; for (k = 0; k < (tsk_id_t) other->sites.num_rows; k++) { tsk_site_table_get_row_unsafe(&other->sites, k, &site); while ((i < (tsk_id_t) other->mutations.num_rows) && (other->mutations.site[i] == site.id)) { tsk_mutation_table_get_row_unsafe(&other->mutations, i, &mut); - if (other_node_mapping[mut.node] == TSK_NULL) { + if (all_mutations || (other_node_mapping[mut.node] == TSK_NULL)) { if (site_map[site.id] == TSK_NULL) { ret_id = tsk_site_table_add_row(&self->sites, site.position, site.ancestral_state, site.ancestral_state_length, site.metadata, diff --git a/subprojects/tskit/tskit/tables.h b/subprojects/tskit/tskit/tables.h index 85ed29d58..9523ee127 100644 --- a/subprojects/tskit/tskit/tables.h +++ b/subprojects/tskit/tskit/tables.h @@ -858,11 +858,21 @@ equality of the subsets. */ #define TSK_UNION_NO_CHECK_SHARED (1 << 0) /** - By default, all nodes new to ``self`` are assigned new populations. If this +By default, all nodes new to ``self`` are assigned new populations. If this option is specified, nodes that are added to ``self`` will retain the population IDs they have in ``other``. */ #define TSK_UNION_NO_ADD_POP (1 << 1) +/** +By default, union only adds edges adjacent to a newly added node; +this option adds all edges. + */ +#define TSK_UNION_ALL_EDGES (1 << 2) +/** +By default, union only adds mutations on newly added edges, and +sites for those mutations; this option adds all mutations and all sites. 
+ */ +#define TSK_UNION_ALL_MUTATIONS (1 << 3) /** @} */ /** @@ -4414,6 +4424,10 @@ that are exclusive ``other`` are added to ``self``, along with: By default, populations of newly added nodes are assumed to be new populations, and added to the population table as well. +The behavior can be changed by the flags ``TSK_UNION_ALL_EDGES`` and +``TSK_UNION_ALL_MUTATIONS``, which will (respectively) add *all* edges +or *all* sites and mutations instead. + This operation will also sort the resulting tables, so the tables may change even if nothing new is added, if the original tables were not sorted. diff --git a/subprojects/tskit/tskit/trees.c b/subprojects/tskit/tskit/trees.c index 7a159a7fe..fdc043ecd 100644 --- a/subprojects/tskit/tskit/trees.c +++ b/subprojects/tskit/tskit/trees.c @@ -31,6 +31,7 @@ #include #include +#include static inline bool is_discrete(double x) @@ -2223,20 +2224,18 @@ tsk_treeseq_sample_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_s ***********************************/ static int -get_allele_samples(const tsk_site_t *site, const tsk_bit_array_t *state, - tsk_bit_array_t *out_allele_samples, tsk_size_t *out_num_alleles) +get_allele_samples(const tsk_site_t *site, tsk_size_t site_offset, + const tsk_bitset_t *state, tsk_bitset_t *out_allele_samples, + tsk_size_t *out_num_alleles) { int ret = 0; tsk_mutation_t mutation, parent_mut; - tsk_size_t mutation_index, allele, alt_allele_length; + tsk_size_t mutation_index, allele, alt_allele, alt_allele_length; /* The allele table */ tsk_size_t max_alleles = site->mutations_length + 1; const char **alleles = tsk_malloc(max_alleles * sizeof(*alleles)); tsk_size_t *allele_lengths = tsk_calloc(max_alleles, sizeof(*allele_lengths)); - const char *alt_allele; - tsk_bit_array_t state_row; - tsk_bit_array_t allele_samples_row; - tsk_bit_array_t alt_allele_samples_row; + const char *alt_allele_state; tsk_size_t num_alleles = 1; if (alleles == NULL || allele_lengths == NULL) { @@ -2267,29 +2266,29 @@ 
get_allele_samples(const tsk_site_t *site, const tsk_bit_array_t *state, } /* Add the mutation's samples to this allele */ - tsk_bit_array_get_row(out_allele_samples, allele, &allele_samples_row); - tsk_bit_array_get_row(state, mutation_index, &state_row); - tsk_bit_array_add(&allele_samples_row, &state_row); + tsk_bitset_union( + out_allele_samples, allele + site_offset, state, mutation_index); /* Get the index for the alternate allele that we must subtract from */ - alt_allele = site->ancestral_state; + alt_allele_state = site->ancestral_state; alt_allele_length = site->ancestral_state_length; if (mutation.parent != TSK_NULL) { parent_mut = site->mutations[mutation.parent - site->mutations[0].id]; - alt_allele = parent_mut.derived_state; + alt_allele_state = parent_mut.derived_state; alt_allele_length = parent_mut.derived_state_length; } - for (allele = 0; allele < num_alleles; allele++) { - if (alt_allele_length == allele_lengths[allele] - && tsk_memcmp(alt_allele, alleles[allele], allele_lengths[allele]) + for (alt_allele = 0; alt_allele < num_alleles; alt_allele++) { + if (alt_allele_length == allele_lengths[alt_allele] + && tsk_memcmp( + alt_allele_state, alleles[alt_allele], allele_lengths[alt_allele]) == 0) { break; } } tsk_bug_assert(allele < num_alleles); - tsk_bit_array_get_row(out_allele_samples, allele, &alt_allele_samples_row); - tsk_bit_array_subtract(&alt_allele_samples_row, &allele_samples_row); + tsk_bitset_subtract(out_allele_samples, alt_allele + site_offset, + out_allele_samples, allele + site_offset); } *out_num_alleles = num_alleles; out: @@ -2310,7 +2309,6 @@ norm_hap_weighted(tsk_size_t result_dim, const double *hap_weights, for (k = 0; k < result_dim; k++) { weight_row = GET_2D_ROW(hap_weights, 3, k); n = (double) args.sample_set_sizes[k]; - // TODO: what to do when n = 0 result[k] = weight_row[0] / n; } return 0; @@ -2347,111 +2345,108 @@ norm_total_weighted(tsk_size_t result_dim, const double *TSK_UNUSED(hap_weights) tsk_size_t n_a, 
tsk_size_t n_b, double *result, void *TSK_UNUSED(params)) { tsk_size_t k; + double norm = 1 / (double) (n_a * n_b); for (k = 0; k < result_dim; k++) { - result[k] = 1 / (double) (n_a * n_b); + result[k] = norm; } return 0; } static void -get_all_samples_bits(tsk_bit_array_t *all_samples, tsk_size_t n) +get_all_samples_bits(tsk_bitset_t *all_samples, tsk_size_t n) { tsk_size_t i; - const tsk_bit_array_value_t all = ~((tsk_bit_array_value_t) 0); - const tsk_bit_array_value_t remainder_samples = n % TSK_BIT_ARRAY_NUM_BITS; + const tsk_bitset_val_t all = ~((tsk_bitset_val_t) 0); + const tsk_bitset_val_t remainder_samples = n % TSK_BITSET_BITS; - all_samples->data[all_samples->size - 1] + all_samples->data[all_samples->row_len - 1] = remainder_samples ? ~(all << remainder_samples) : all; - for (i = 0; i < all_samples->size - 1; i++) { + for (i = 0; i < all_samples->row_len - 1; i++) { all_samples->data[i] = all; } } +// Stores the intermediate values for computing two-locus statistics. +typedef struct { + double *weights; + double *norm; + double *result_tmp; + tsk_bitset_t AB_samples; +} two_locus_work_t; + static int -compute_general_two_site_stat_result(const tsk_bit_array_t *site_a_state, - const tsk_bit_array_t *site_b_state, tsk_size_t num_a_alleles, - tsk_size_t num_b_alleles, tsk_size_t num_samples, tsk_size_t state_dim, - const tsk_bit_array_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, - sample_count_stat_params_t *f_params, norm_func_t *norm_f, bool polarised, - double *result) +two_locus_work_init(tsk_size_t max_alleles, tsk_size_t num_samples, + tsk_size_t result_dim, tsk_size_t state_dim, two_locus_work_t *out) { int ret = 0; - tsk_bit_array_t A_samples, B_samples; - // ss_ prefix refers to a sample set - tsk_bit_array_t ss_row; - tsk_bit_array_t ss_A_samples, ss_B_samples, ss_AB_samples, AB_samples; - // Sample sets and b sites are rows, a sites are columns - // b1 b2 b3 - // a1 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] - // a2 [s1, s2, s3] 
[s1, s2, s3] [s1, s2, s3] - // a3 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] - tsk_size_t k, mut_a, mut_b; - tsk_size_t result_row_len = num_b_alleles * result_dim; - tsk_size_t w_A = 0, w_B = 0, w_AB = 0; - uint8_t polarised_val = polarised ? 1 : 0; - double *hap_weight_row; - double *result_tmp_row; - double *weights = tsk_malloc(3 * state_dim * sizeof(*weights)); - double *norm = tsk_malloc(result_dim * sizeof(*norm)); - double *result_tmp - = tsk_malloc(result_row_len * num_a_alleles * sizeof(*result_tmp)); - - tsk_memset(&ss_A_samples, 0, sizeof(ss_A_samples)); - tsk_memset(&ss_B_samples, 0, sizeof(ss_B_samples)); - tsk_memset(&ss_AB_samples, 0, sizeof(ss_AB_samples)); - tsk_memset(&AB_samples, 0, sizeof(AB_samples)); - - if (weights == NULL || norm == NULL || result_tmp == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } - ret = tsk_bit_array_init(&ss_A_samples, num_samples, 1); - if (ret != 0) { - goto out; - } - ret = tsk_bit_array_init(&ss_B_samples, num_samples, 1); - if (ret != 0) { - goto out; - } - ret = tsk_bit_array_init(&ss_AB_samples, num_samples, 1); - if (ret != 0) { + out->weights = tsk_malloc(3 * state_dim * sizeof(*out->weights)); + out->norm = tsk_malloc(result_dim * sizeof(*out->norm)); + out->result_tmp + = tsk_malloc(result_dim * max_alleles * max_alleles * sizeof(*out->result_tmp)); + tsk_memset(&out->AB_samples, 0, sizeof(out->AB_samples)); + if (out->weights == NULL || out->norm == NULL || out->result_tmp == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } - ret = tsk_bit_array_init(&AB_samples, num_samples, 1); + ret = tsk_bitset_init(&out->AB_samples, num_samples, 1); if (ret != 0) { goto out; } +out: + return ret; +} + +static void +two_locus_work_free(two_locus_work_t *work) +{ + tsk_safe_free(work->weights); + tsk_safe_free(work->norm); + tsk_safe_free(work->result_tmp); + tsk_bitset_free(&work->AB_samples); +} - for (mut_a = polarised_val; mut_a < num_a_alleles; mut_a++) { +static int 
+compute_general_normed_two_site_stat_result(const tsk_bitset_t *state, + const tsk_size_t *allele_counts, tsk_size_t a_off, tsk_size_t b_off, + tsk_size_t num_a_alleles, tsk_size_t num_b_alleles, tsk_size_t state_dim, + tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params, + norm_func_t *norm_f, bool polarised, two_locus_work_t *restrict work, double *result) +{ + int ret = 0; + // Sample sets and b sites are rows, a sites are columns + // b1 b2 b3 + // a1 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + // a2 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + // a3 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + tsk_size_t k, mut_a, mut_b, result_row_len = num_b_alleles * result_dim; + uint8_t is_polarised = polarised ? 1 : 0; + double *restrict hap_row, *restrict result_tmp_row; + double *restrict norm = work->norm; + double *restrict weights = work->weights; + double *restrict result_tmp = work->result_tmp; + tsk_bitset_t AB_samples = work->AB_samples; + + for (mut_a = is_polarised; mut_a < num_a_alleles; mut_a++) { result_tmp_row = GET_2D_ROW(result_tmp, result_row_len, mut_a); - for (mut_b = polarised_val; mut_b < num_b_alleles; mut_b++) { - tsk_bit_array_get_row(site_a_state, mut_a, &A_samples); - tsk_bit_array_get_row(site_b_state, mut_b, &B_samples); - tsk_bit_array_intersect(&A_samples, &B_samples, &AB_samples); + for (mut_b = is_polarised; mut_b < num_b_alleles; mut_b++) { for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row(sample_sets, k, &ss_row); - hap_weight_row = GET_2D_ROW(weights, 3, k); - - tsk_bit_array_intersect(&A_samples, &ss_row, &ss_A_samples); - tsk_bit_array_intersect(&B_samples, &ss_row, &ss_B_samples); - tsk_bit_array_intersect(&AB_samples, &ss_row, &ss_AB_samples); - - w_AB = tsk_bit_array_count(&ss_AB_samples); - w_A = tsk_bit_array_count(&ss_A_samples); - w_B = tsk_bit_array_count(&ss_B_samples); - - hap_weight_row[0] = (double) w_AB; - hap_weight_row[1] = (double) (w_A - w_AB); // w_Ab - hap_weight_row[2] = (double) (w_B 
- w_AB); // w_aB + tsk_bitset_intersect(state, a_off + (mut_a * state_dim) + k, state, + b_off + (mut_b * state_dim) + k, &AB_samples); + hap_row = GET_2D_ROW(weights, 3, k); + hap_row[0] = (double) tsk_bitset_count(&AB_samples, 0); + hap_row[1] = (double) allele_counts[a_off + (mut_a * state_dim) + k] + - hap_row[0]; + hap_row[2] = (double) allele_counts[b_off + (mut_b * state_dim) + k] + - hap_row[0]; } ret = f(state_dim, weights, result_dim, result_tmp_row, f_params); if (ret != 0) { goto out; } - ret = norm_f(result_dim, weights, num_a_alleles - polarised_val, - num_b_alleles - polarised_val, norm, f_params); + ret = norm_f(result_dim, weights, num_a_alleles - is_polarised, + num_b_alleles - is_polarised, norm, f_params); if (ret != 0) { goto out; } @@ -2461,15 +2456,38 @@ compute_general_two_site_stat_result(const tsk_bit_array_t *site_a_state, result_tmp_row += result_dim; // Advance to the next column } } +out: + return ret; +} + +static int +compute_general_two_site_stat_result(const tsk_bitset_t *state, + const tsk_size_t *allele_counts, tsk_size_t a_off, tsk_size_t b_off, + tsk_size_t state_dim, tsk_size_t result_dim, general_stat_func_t *f, + sample_count_stat_params_t *f_params, two_locus_work_t *restrict work, + double *result) +{ + int ret = 0; + tsk_size_t k; + tsk_bitset_t AB_samples = work->AB_samples; + tsk_size_t mut_a = 1, mut_b = 1; + double *restrict hap_row, *restrict weights = work->weights; + for (k = 0; k < state_dim; k++) { + tsk_bitset_intersect(state, a_off + (mut_a * state_dim) + k, state, + b_off + (mut_b * state_dim) + k, &AB_samples); + hap_row = GET_2D_ROW(weights, 3, k); + hap_row[0] = (double) tsk_bitset_count(&AB_samples, 0); + hap_row[1] + = (double) allele_counts[a_off + (mut_a * state_dim) + k] - hap_row[0]; + hap_row[2] + = (double) allele_counts[b_off + (mut_b * state_dim) + k] - hap_row[0]; + } + ret = f(state_dim, weights, result_dim, result, f_params); + if (ret != 0) { + goto out; + } out: - tsk_safe_free(weights); - 
tsk_safe_free(norm); - tsk_safe_free(result_tmp); - tsk_bit_array_free(&ss_A_samples); - tsk_bit_array_free(&ss_B_samples); - tsk_bit_array_free(&ss_AB_samples); - tsk_bit_array_free(&AB_samples); return ret; } @@ -2520,7 +2538,7 @@ get_site_row_col_indices(tsk_size_t n_rows, const tsk_id_t *row_sites, tsk_size_ static int get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t n_sites, - tsk_size_t *num_alleles, tsk_bit_array_t *allele_samples) + tsk_size_t *num_alleles, tsk_bitset_t *allele_samples) { int ret = 0; const tsk_flags_t *restrict flags = ts->tables->nodes.flags; @@ -2528,7 +2546,7 @@ get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t const tsk_size_t *restrict site_muts_len = ts->site_mutations_length; tsk_site_t site; tsk_tree_t tree; - tsk_bit_array_t all_samples_bits, mut_samples, mut_samples_row, out_row; + tsk_bitset_t all_samples_bits, mut_samples; tsk_size_t max_muts_len, site_offset, num_nodes, site_idx, s, m, n; tsk_id_t node, *nodes = NULL; void *tmp_nodes; @@ -2538,16 +2556,14 @@ get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t max_muts_len = 0; for (s = 0; s < n_sites; s++) { - if (site_muts_len[sites[s]] > max_muts_len) { - max_muts_len = site_muts_len[sites[s]]; - } + max_muts_len = TSK_MAX(site_muts_len[sites[s]], max_muts_len); } // Allocate a bit array of size max alleles for all sites - ret = tsk_bit_array_init(&mut_samples, num_samples, max_muts_len); + ret = tsk_bitset_init(&mut_samples, num_samples, max_muts_len); if (ret != 0) { goto out; } - ret = tsk_bit_array_init(&all_samples_bits, num_samples, 1); + ret = tsk_bitset_init(&all_samples_bits, num_samples, 1); if (ret != 0) { goto out; } @@ -2572,15 +2588,11 @@ get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t goto out; } nodes = tmp_nodes; - - tsk_bit_array_get_row(allele_samples, site_offset, &out_row); - tsk_bit_array_add(&out_row, &all_samples_bits); - + 
tsk_bitset_union(allele_samples, site_offset, &all_samples_bits, 0); // Zero out results before the start of each iteration tsk_memset(mut_samples.data, 0, - mut_samples.size * max_muts_len * sizeof(tsk_bit_array_value_t)); + mut_samples.row_len * max_muts_len * sizeof(tsk_bitset_val_t)); for (m = 0; m < site.mutations_length; m++) { - tsk_bit_array_get_row(&mut_samples, m, &mut_samples_row); node = site.mutations[m].node; ret = tsk_tree_preorder_from(&tree, node, nodes, &num_nodes); if (ret != 0) { @@ -2589,43 +2601,77 @@ get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t for (n = 0; n < num_nodes; n++) { node = nodes[n]; if (flags[node] & TSK_NODE_IS_SAMPLE) { - tsk_bit_array_add_bit(&mut_samples_row, - (tsk_bit_array_value_t) ts->sample_index_map[node]); + tsk_bitset_set_bit( + &mut_samples, m, (tsk_bitset_val_t) ts->sample_index_map[node]); } } } + get_allele_samples( + &site, site_offset, &mut_samples, allele_samples, &(num_alleles[site_idx])); site_offset += site.mutations_length + 1; - get_allele_samples(&site, &mut_samples, &out_row, &(num_alleles[site_idx])); } // if adding code below, check ret before continuing out: tsk_safe_free(nodes); tsk_tree_free(&tree); - tsk_bit_array_free(&mut_samples); - tsk_bit_array_free(&all_samples_bits); + tsk_bitset_free(&mut_samples); + tsk_bitset_free(&all_samples_bits); return ret == TSK_TREE_OK ? 0 : ret; } +// Given the samples under each allele's node and the sample sets, get the samples under +// each allele's node for each sample set. We pack this data into a bitset +// (`allele_sample_sets`) that is size m x n, where m is (n_alleles * num_sample_sets) +// and n is the size of the largest sample set. In addition, we compute the number of +// samples contained in the intersection of each allele's samples and each sample set in +// an array (`allele_sample_set_counts`) of length (n_alleles * num_sample_sets).
+static void +get_mutation_sample_sets(const tsk_bitset_t *allele_samples, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + const tsk_id_t *sample_index_map, tsk_bitset_t *allele_sample_sets, + tsk_size_t *allele_sample_set_counts) +{ + tsk_bitset_val_t k, sample; + tsk_size_t i, j, ss_off; + + for (i = 0; i < allele_samples->len; i++) { + ss_off = 0; + for (j = 0; j < num_sample_sets; j++) { + for (k = 0; k < sample_set_sizes[j]; k++) { + sample = (tsk_bitset_val_t) sample_index_map[sample_sets[k + ss_off]]; + if (tsk_bitset_contains(allele_samples, i, sample)) { + tsk_bitset_set_bit(allele_sample_sets, j + i * num_sample_sets, k); + allele_sample_set_counts[j + i * num_sample_sets]++; + } + } + ss_off += sample_set_sizes[j]; + } + } +} + static int tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, - const tsk_bit_array_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, + tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params, norm_func_t *norm_f, tsk_size_t n_rows, const tsk_id_t *row_sites, tsk_size_t n_cols, const tsk_id_t *col_sites, tsk_flags_t options, double *result) { - int ret = 0; - tsk_bit_array_t allele_samples, c_state, r_state; - bool polarised = false; + tsk_bitset_t allele_samples, allele_sample_sets; + bool polarised = options & TSK_STAT_POLARISED; tsk_id_t *sites; - tsk_size_t r, c, s, n_alleles, n_sites, *row_idx, *col_idx; + tsk_size_t i, j, n_sites, *row_idx, *col_idx; double *result_row; const tsk_size_t num_samples = self->num_samples; - tsk_size_t *num_alleles = NULL, *site_offsets = NULL; + tsk_size_t *num_alleles = NULL, *site_offsets = NULL, *allele_counts = NULL; tsk_size_t result_row_len = n_cols * result_dim; + tsk_size_t max_ss_size = 0, max_alleles = 0, n_alleles = 0; + two_locus_work_t work; + tsk_memset(&work, 0, 
sizeof(work)); tsk_memset(&allele_samples, 0, sizeof(allele_samples)); - + tsk_memset(&allele_sample_sets, 0, sizeof(allele_sample_sets)); sites = tsk_malloc(self->tables->sites.num_rows * sizeof(*sites)); row_idx = tsk_malloc(self->tables->sites.num_rows * sizeof(*row_idx)); col_idx = tsk_malloc(self->tables->sites.num_rows * sizeof(*col_idx)); @@ -2635,44 +2681,67 @@ tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, } get_site_row_col_indices( n_rows, row_sites, n_cols, col_sites, sites, &n_sites, row_idx, col_idx); - - // We rely on n_sites to allocate these arrays, which are initialized - // to NULL for safe deallocation if the previous allocation fails + // depends on n_sites num_alleles = tsk_malloc(n_sites * sizeof(*num_alleles)); site_offsets = tsk_malloc(n_sites * sizeof(*site_offsets)); if (num_alleles == NULL || site_offsets == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } - - n_alleles = 0; - for (s = 0; s < n_sites; s++) { - site_offsets[s] = n_alleles; - n_alleles += self->site_mutations_length[sites[s]] + 1; + for (i = 0; i < n_sites; i++) { + site_offsets[i] = n_alleles * num_sample_sets; + n_alleles += self->site_mutations_length[sites[i]] + 1; + max_alleles = TSK_MAX(self->site_mutations_length[sites[i]], max_alleles); } - ret = tsk_bit_array_init(&allele_samples, num_samples, n_alleles); + max_alleles++; // add 1 for the ancestral allele + // depends on n_alleles + ret = tsk_bitset_init(&allele_samples, num_samples, n_alleles); if (ret != 0) { goto out; } - ret = get_mutation_samples(self, sites, n_sites, num_alleles, &allele_samples); + for (i = 0; i < num_sample_sets; i++) { + max_ss_size = TSK_MAX(sample_set_sizes[i], max_ss_size); + } + // depend on n_alleles and max_ss_size + ret = tsk_bitset_init(&allele_sample_sets, max_ss_size, n_alleles * num_sample_sets); if (ret != 0) { goto out; } - - if (options & TSK_STAT_POLARISED) { - polarised = true; + allele_counts = tsk_calloc(n_alleles * 
num_sample_sets, sizeof(*allele_counts)); + if (allele_counts == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; } - + // depends on max_ss_size and max_alleles + ret = two_locus_work_init(max_alleles, max_ss_size, result_dim, state_dim, &work); + if (ret != 0) { + goto out; + } + // we track the number of alleles to account for backmutations + ret = get_mutation_samples(self, sites, n_sites, num_alleles, &allele_samples); + if (ret != 0) { + goto out; + } + get_mutation_sample_sets(&allele_samples, num_sample_sets, sample_set_sizes, + sample_sets, self->sample_index_map, &allele_sample_sets, allele_counts); // For each row/column pair, fill in the sample set in the result matrix. - for (r = 0; r < n_rows; r++) { - result_row = GET_2D_ROW(result, result_row_len, r); - for (c = 0; c < n_cols; c++) { - tsk_bit_array_get_row(&allele_samples, site_offsets[row_idx[r]], &r_state); - tsk_bit_array_get_row(&allele_samples, site_offsets[col_idx[c]], &c_state); - ret = compute_general_two_site_stat_result(&r_state, &c_state, - num_alleles[row_idx[r]], num_alleles[col_idx[c]], num_samples, state_dim, - sample_sets, result_dim, f, f_params, norm_f, polarised, - &(result_row[c * result_dim])); + for (i = 0; i < n_rows; i++) { + result_row = GET_2D_ROW(result, result_row_len, i); + for (j = 0; j < n_cols; j++) { + if (num_alleles[row_idx[i]] == 2 && num_alleles[col_idx[j]] == 2) { + // both sites are biallelic + ret = compute_general_two_site_stat_result(&allele_sample_sets, + allele_counts, site_offsets[row_idx[i]], site_offsets[col_idx[j]], + state_dim, result_dim, f, f_params, &work, + &(result_row[j * result_dim])); + } else { + // at least one site is multiallelic + ret = compute_general_normed_two_site_stat_result(&allele_sample_sets, + allele_counts, site_offsets[row_idx[i]], site_offsets[col_idx[j]], + num_alleles[row_idx[i]], num_alleles[col_idx[j]], state_dim, + result_dim, f, f_params, norm_f, polarised, &work, + &(result_row[j * result_dim])); + } if 
(ret != 0) { goto out; } @@ -2685,37 +2754,37 @@ tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, tsk_safe_free(col_idx); tsk_safe_free(num_alleles); tsk_safe_free(site_offsets); - tsk_bit_array_free(&allele_samples); + tsk_safe_free(allele_counts); + two_locus_work_free(&work); + tsk_bitset_free(&allele_samples); + tsk_bitset_free(&allele_sample_sets); return ret; } static int -sample_sets_to_bit_array(const tsk_treeseq_t *self, const tsk_size_t *sample_set_sizes, +sample_sets_to_bitset(const tsk_treeseq_t *self, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_sample_sets, - tsk_bit_array_t *sample_sets_bits) + tsk_bitset_t *sample_sets_bits) { int ret; - tsk_bit_array_t bits_row; tsk_size_t j, k, l; tsk_id_t u, sample_index; - ret = tsk_bit_array_init(sample_sets_bits, self->num_samples, num_sample_sets); + ret = tsk_bitset_init(sample_sets_bits, self->num_samples, num_sample_sets); if (ret != 0) { return ret; } - j = 0; for (k = 0; k < num_sample_sets; k++) { - tsk_bit_array_get_row(sample_sets_bits, k, &bits_row); for (l = 0; l < sample_set_sizes[k]; l++) { u = sample_sets[j]; sample_index = self->sample_index_map[u]; - if (tsk_bit_array_contains( - &bits_row, (tsk_bit_array_value_t) sample_index)) { + if (tsk_bitset_contains( + sample_sets_bits, k, (tsk_bitset_val_t) sample_index)) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } - tsk_bit_array_add_bit(&bits_row, (tsk_bit_array_value_t) sample_index); + tsk_bitset_set_bit(sample_sets_bits, k, (tsk_bitset_val_t) sample_index); j++; } } @@ -2852,7 +2921,7 @@ get_index_counts( typedef struct { tsk_tree_t tree; - tsk_bit_array_t *node_samples; + tsk_bitset_t *node_samples; tsk_id_t *parent; tsk_id_t *edges_out; tsk_id_t *edges_in; @@ -2876,7 +2945,7 @@ iter_state_init(iter_state *self, const tsk_treeseq_t *ts, tsk_size_t state_dim) ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } - ret = tsk_bit_array_init(self->node_samples, 
ts->num_samples, state_dim * num_nodes); + ret = tsk_bitset_init(self->node_samples, ts->num_samples, state_dim * num_nodes); if (ret != 0) { goto out; } @@ -2895,29 +2964,25 @@ iter_state_init(iter_state *self, const tsk_treeseq_t *ts, tsk_size_t state_dim) static int get_node_samples(const tsk_treeseq_t *ts, tsk_size_t state_dim, - const tsk_bit_array_t *sample_sets, tsk_bit_array_t *node_samples) + const tsk_bitset_t *sample_sets, tsk_bitset_t *node_samples) { int ret = 0; tsk_size_t n, k; - tsk_bit_array_t sample_set_row, node_samples_row; tsk_size_t num_nodes = ts->tables->nodes.num_rows; - tsk_bit_array_value_t sample; + tsk_bitset_val_t sample; const tsk_id_t *restrict sample_index_map = ts->sample_index_map; const tsk_flags_t *restrict flags = ts->tables->nodes.flags; - ret = tsk_bit_array_init(node_samples, ts->num_samples, num_nodes * state_dim); + ret = tsk_bitset_init(node_samples, ts->num_samples, num_nodes * state_dim); if (ret != 0) { goto out; } for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row(sample_sets, k, &sample_set_row); for (n = 0; n < num_nodes; n++) { if (flags[n] & TSK_NODE_IS_SAMPLE) { - sample = (tsk_bit_array_value_t) sample_index_map[n]; - if (tsk_bit_array_contains(&sample_set_row, sample)) { - tsk_bit_array_get_row( - node_samples, (state_dim * n) + k, &node_samples_row); - tsk_bit_array_add_bit(&node_samples_row, sample); + sample = (tsk_bitset_val_t) sample_index_map[n]; + if (tsk_bitset_contains(sample_sets, k, sample)) { + tsk_bitset_set_bit(node_samples, (state_dim * n) + k, sample); } } } @@ -2928,7 +2993,7 @@ get_node_samples(const tsk_treeseq_t *ts, tsk_size_t state_dim, static void iter_state_clear(iter_state *self, tsk_size_t state_dim, tsk_size_t num_nodes, - const tsk_bit_array_t *node_samples) + const tsk_bitset_t *node_samples) { self->n_edges_out = 0; self->n_edges_in = 0; @@ -2938,14 +3003,14 @@ iter_state_clear(iter_state *self, tsk_size_t state_dim, tsk_size_t num_nodes, tsk_memset(self->edges_in, TSK_NULL, 
num_nodes * sizeof(*self->edges_in)); tsk_memset(self->branch_len, 0, num_nodes * sizeof(*self->branch_len)); tsk_memcpy(self->node_samples->data, node_samples->data, - node_samples->size * state_dim * num_nodes * sizeof(*node_samples->data)); + node_samples->row_len * state_dim * num_nodes * sizeof(*node_samples->data)); } static void iter_state_free(iter_state *self) { tsk_tree_free(&self->tree); - tsk_bit_array_free(self->node_samples); + tsk_bitset_free(self->node_samples); tsk_safe_free(self->node_samples); tsk_safe_free(self->parent); tsk_safe_free(self->edges_out); @@ -3025,41 +3090,26 @@ static int compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, const iter_state *A_state, const iter_state *B_state, tsk_size_t state_dim, tsk_size_t result_dim, int sign, general_stat_func_t *f, - sample_count_stat_params_t *f_params, double *result) + sample_count_stat_params_t *f_params, two_locus_work_t *restrict work, + double *result) { int ret = 0; double a_len, b_len; double *restrict B_branch_len = B_state->branch_len; - double *weights = NULL, *weights_row, *result_tmp = NULL; + double *weights_row; tsk_size_t n, k, a_row, b_row; - tsk_bit_array_t A_samples, B_samples, AB_samples, B_samples_tmp; const double *restrict A_branch_len = A_state->branch_len; - const tsk_bit_array_t *restrict A_state_samples = A_state->node_samples; - const tsk_bit_array_t *restrict B_state_samples = B_state->node_samples; - tsk_size_t num_samples = ts->num_samples; + const tsk_bitset_t *restrict A_state_samples = A_state->node_samples; + const tsk_bitset_t *restrict B_state_samples = B_state->node_samples; tsk_size_t num_nodes = ts->tables->nodes.num_rows; + double *weights = work->weights; + double *result_tmp = work->result_tmp; + tsk_bitset_t AB_samples = work->AB_samples; + b_len = B_branch_len[c] * sign; if (b_len == 0) { return ret; } - - tsk_memset(&AB_samples, 0, sizeof(AB_samples)); - tsk_memset(&B_samples_tmp, 0, sizeof(B_samples_tmp)); - - weights = 
tsk_calloc(3 * state_dim, sizeof(*weights)); - result_tmp = tsk_calloc(result_dim, sizeof(*result_tmp)); - if (weights == NULL || result_tmp == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } - ret = tsk_bit_array_init(&AB_samples, num_samples, 1); - if (ret != 0) { - goto out; - } - ret = tsk_bit_array_init(&B_samples_tmp, num_samples, 1); - if (ret != 0) { - goto out; - } for (n = 0; n < num_nodes; n++) { a_len = A_branch_len[n]; if (a_len == 0) { @@ -3068,15 +3118,14 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, for (k = 0; k < state_dim; k++) { a_row = (state_dim * n) + k; b_row = (state_dim * (tsk_size_t) c) + k; - tsk_bit_array_get_row(A_state_samples, a_row, &A_samples); - tsk_bit_array_get_row(B_state_samples, b_row, &B_samples); - tsk_bit_array_intersect(&A_samples, &B_samples, &AB_samples); weights_row = GET_2D_ROW(weights, 3, k); - weights_row[0] = (double) tsk_bit_array_count(&AB_samples); // w_AB + tsk_bitset_intersect( + A_state_samples, a_row, B_state_samples, b_row, &AB_samples); + weights_row[0] = (double) tsk_bitset_count(&AB_samples, 0); weights_row[1] - = (double) tsk_bit_array_count(&A_samples) - weights_row[0]; // w_Ab + = (double) tsk_bitset_count(A_state_samples, a_row) - weights_row[0]; weights_row[2] - = (double) tsk_bit_array_count(&B_samples) - weights_row[0]; // w_aB + = (double) tsk_bitset_count(B_state_samples, b_row) - weights_row[0]; } ret = f(state_dim, weights, result_dim, result_tmp, f_params); if (ret != 0) { @@ -3087,10 +3136,6 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, } } out: - tsk_safe_free(weights); - tsk_safe_free(result_tmp); - tsk_bit_array_free(&AB_samples); - tsk_bit_array_free(&B_samples_tmp); return ret; } @@ -3106,10 +3151,17 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, const tsk_id_t *restrict edges_child = ts->tables->edges.child; const tsk_id_t *restrict edges_parent = 
ts->tables->edges.parent; const tsk_size_t num_nodes = ts->tables->nodes.num_rows; - tsk_bit_array_t updates, row, ec_row, *r_samples = r_state->node_samples; + tsk_bitset_t updates, *r_samples = r_state->node_samples; + two_locus_work_t work; + tsk_memset(&work, 0, sizeof(work)); tsk_memset(&updates, 0, sizeof(updates)); - ret = tsk_bit_array_init(&updates, num_nodes, 1); + // only two alleles are possible for branch stats + ret = two_locus_work_init(2, ts->num_samples, result_dim, state_dim, &work); + if (ret != 0) { + goto out; + } + ret = tsk_bitset_init(&updates, num_nodes, 1); if (ret != 0) { goto out; } @@ -3126,18 +3178,18 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, c = edges_child[e]; // Identify affected nodes above child while (p != TSK_NULL) { - tsk_bit_array_add_bit(&updates, (tsk_bit_array_value_t) c); + tsk_bitset_set_bit(&updates, 0, (tsk_bitset_val_t) c); c = p; p = r_state->parent[p]; } } // Subtract the whole contribution from the child node - tsk_bit_array_get_items(&updates, updated_nodes, &n_updates); + tsk_bitset_get_items(&updates, 0, updated_nodes, &n_updates); while (n_updates != 0) { n_updates--; c = updated_nodes[n_updates]; - compute_two_tree_branch_state_update( - ts, c, l_state, r_state, state_dim, result_dim, -1, f, f_params, result); + compute_two_tree_branch_state_update(ts, c, l_state, r_state, state_dim, + result_dim, -1, f, f_params, &work, result); } // Remove samples under nodes from removed edges to parent nodes for (j = 0; j < r_state->n_edges_out; j++) { @@ -3146,10 +3198,8 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, ec = edges_child[e]; // edge child while (p != TSK_NULL) { for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row( - r_samples, (state_dim * (tsk_size_t) ec) + k, &ec_row); - tsk_bit_array_get_row(r_samples, (state_dim * (tsk_size_t) p) + k, &row); - tsk_bit_array_subtract(&row, &ec_row); + tsk_bitset_subtract(r_samples, 
(state_dim * (tsk_size_t) p) + k, + r_samples, (state_dim * (tsk_size_t) ec) + k); } p = r_state->parent[p]; } @@ -3164,12 +3214,10 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, r_state->branch_len[c] = time[p] - time[c]; r_state->parent[c] = p; while (p != TSK_NULL) { - tsk_bit_array_add_bit(&updates, (tsk_bit_array_value_t) c); + tsk_bitset_set_bit(&updates, 0, (tsk_bitset_val_t) c); for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row( - r_samples, (state_dim * (tsk_size_t) ec) + k, &ec_row); - tsk_bit_array_get_row(r_samples, (state_dim * (tsk_size_t) p) + k, &row); - tsk_bit_array_add(&row, &ec_row); + tsk_bitset_union(r_samples, (state_dim * (tsk_size_t) p) + k, r_samples, + (state_dim * (tsk_size_t) ec) + k); } c = p; p = r_state->parent[p]; @@ -3177,22 +3225,24 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, } // Update all affected child nodes (fully subtracted, deferred from addition) n_updates = 0; - tsk_bit_array_get_items(&updates, updated_nodes, &n_updates); + tsk_bitset_get_items(&updates, 0, updated_nodes, &n_updates); while (n_updates != 0) { n_updates--; c = updated_nodes[n_updates]; - compute_two_tree_branch_state_update( - ts, c, l_state, r_state, state_dim, result_dim, +1, f, f_params, result); + compute_two_tree_branch_state_update(ts, c, l_state, r_state, state_dim, + result_dim, +1, f, f_params, &work, result); } out: tsk_safe_free(updated_nodes); - tsk_bit_array_free(&updates); + two_locus_work_free(&work); + tsk_bitset_free(&updates); return ret; } static int tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, - const tsk_bit_array_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, + tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params, norm_func_t *TSK_UNUSED(norm_f), tsk_size_t n_rows, const 
double *row_positions, tsk_size_t n_cols, const double *col_positions, tsk_flags_t TSK_UNUSED(options), double *result) @@ -3201,11 +3251,12 @@ tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_di int r, c; tsk_id_t *row_indexes = NULL, *col_indexes = NULL; tsk_size_t i, j, k, row, col, *row_repeats = NULL, *col_repeats = NULL; - tsk_bit_array_t node_samples; + tsk_bitset_t node_samples, sample_sets_bits; iter_state l_state, r_state; double *result_tmp = NULL, *result_row; const tsk_size_t num_nodes = self->tables->nodes.num_rows; + tsk_memset(&sample_sets_bits, 0, sizeof(sample_sets_bits)); tsk_memset(&node_samples, 0, sizeof(node_samples)); tsk_memset(&l_state, 0, sizeof(l_state)); tsk_memset(&r_state, 0, sizeof(r_state)); @@ -3222,6 +3273,11 @@ tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_di if (ret != 0) { goto out; } + ret = sample_sets_to_bitset( + self, sample_set_sizes, sample_sets, num_sample_sets, &sample_sets_bits); + if (ret != 0) { + goto out; + } ret = positions_to_tree_indexes(self, row_positions, n_rows, &row_indexes); if (ret != 0) { goto out; @@ -3238,7 +3294,7 @@ tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_di if (ret != 0) { goto out; } - ret = get_node_samples(self, state_dim, sample_sets, &node_samples); + ret = get_node_samples(self, state_dim, &sample_sets_bits, &node_samples); if (ret != 0) { goto out; } @@ -3289,7 +3345,42 @@ tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_di tsk_safe_free(col_repeats); iter_state_free(&l_state); iter_state_free(&r_state); - tsk_bit_array_free(&node_samples); + tsk_bitset_free(&node_samples); + tsk_bitset_free(&sample_sets_bits); + return ret; +} + +static int +check_sample_set_dups(tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, const tsk_id_t *restrict sample_index_map, + tsk_size_t num_samples) +{ + int ret; + tsk_size_t j, k, l; + 
tsk_id_t u, sample_index; + tsk_bitset_t tmp; + + tsk_memset(&tmp, 0, sizeof(tmp)); + ret = tsk_bitset_init(&tmp, num_samples, 1); + if (ret != 0) { + goto out; + } + j = 0; + for (k = 0; k < num_sample_sets; k++) { + tsk_memset(tmp.data, 0, sizeof(*tmp.data) * tmp.row_len); + for (l = 0; l < sample_set_sizes[k]; l++) { + u = sample_sets[j]; + sample_index = sample_index_map[u]; + if (tsk_bitset_contains(&tmp, 0, (tsk_bitset_val_t) sample_index)) { + ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); + goto out; + } + tsk_bitset_set_bit(&tmp, 0, (tsk_bitset_val_t) sample_index); + j++; + } + } +out: + tsk_bitset_free(&tmp); return ret; } @@ -3304,7 +3395,6 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl // TODO: generalize this function if we ever decide to do weighted two_locus stats. // We only implement count stats and therefore we don't handle weights. int ret = 0; - tsk_bit_array_t sample_sets_bits; bool stat_site = !!(options & TSK_STAT_SITE); bool stat_branch = !!(options & TSK_STAT_BRANCH); tsk_size_t state_dim = num_sample_sets; @@ -3313,8 +3403,6 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl .sample_set_sizes = sample_set_sizes, .set_indexes = set_indexes }; - tsk_memset(&sample_sets_bits, 0, sizeof(sample_sets_bits)); - // We do not support two-locus node stats if (!!(options & TSK_STAT_NODE)) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_STAT_MODE); @@ -3338,12 +3426,6 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl ret = tsk_trace_error(TSK_ERR_BAD_RESULT_DIMS); goto out; } - ret = sample_sets_to_bit_array( - self, sample_set_sizes, sample_sets, num_sample_sets, &sample_sets_bits); - if (ret != 0) { - goto out; - } - if (stat_site) { ret = check_sites(row_sites, out_rows, self->tables->sites.num_rows); if (ret != 0) { @@ -3353,9 +3435,14 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl if (ret != 0) { goto out; } 
- ret = tsk_treeseq_two_site_count_stat(self, state_dim, &sample_sets_bits, - result_dim, f, &f_params, norm_f, out_rows, row_sites, out_cols, col_sites, - options, result); + ret = check_sample_set_dups(num_sample_sets, sample_set_sizes, sample_sets, + self->sample_index_map, self->num_samples); + if (ret != 0) { + goto out; + } + ret = tsk_treeseq_two_site_count_stat(self, state_dim, num_sample_sets, + sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows, + row_sites, out_cols, col_sites, options, result); } else if (stat_branch) { ret = check_positions( row_positions, out_rows, tsk_treeseq_get_sequence_length(self)); @@ -3367,13 +3454,11 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl if (ret != 0) { goto out; } - ret = tsk_treeseq_two_branch_count_stat(self, state_dim, &sample_sets_bits, - result_dim, f, &f_params, norm_f, out_rows, row_positions, out_cols, - col_positions, options, result); + ret = tsk_treeseq_two_branch_count_stat(self, state_dim, num_sample_sets, + sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows, + row_positions, out_cols, col_positions, options, result); } - out: - tsk_bit_array_free(&sample_sets_bits); return ret; } @@ -10687,6 +10772,9 @@ tsk_matvec_calculator_run(tsk_matvec_calculator_t *self) tsk_matvec_calculator_print_state(self, tsk_get_debug_stream()); } } + if (!!(self->options & TSK_STAT_SPAN_NORMALISE)) { + span_normalise(self->num_windows, windows, out_size, self->result); + } /* out: */ return ret; diff --git a/subprojects/tskit/tskit/trees.h b/subprojects/tskit/tskit/trees.h index 21495edbf..2e495a406 100644 --- a/subprojects/tskit/tskit/trees.h +++ b/subprojects/tskit/tskit/trees.h @@ -933,17 +933,20 @@ path from `p` to `c`. For instance, if `p` is the parent of `n` and `n` is the parent of `c`, then the span of the edges from `p` to `n` and `n` to `c` are extended, and the span of the edge from `p` to `c` is reduced. 
However, any edges whose child node is a sample are not -modified. The `node` of certain mutations may also be remapped; to do this +modified. See Fritze et al. (2025): +https://doi.org/10.1093/genetics/iyaf198 for more details. + +The method works by iterating over the genome to look for edges that can +be extended in this way; the maximum number of such iterations is +controlled by ``max_iter``. + +The `node` of certain mutations may also be remapped; to do this unambiguously we need to know mutation times. If mutations times are unknown, use `tsk_table_collection_compute_mutation_times` first. The method will not affect any tables except the edge table, or the node column in the mutation table. -The method works by iterating over the genome to look for edges that can -be extended in this way; the maximum number of such iterations is -controlled by ``max_iter``. - @rst **Options**: None currently defined. @@ -966,6 +969,52 @@ int tsk_treeseq_split_edges(const tsk_treeseq_t *self, double time, tsk_flags_t bool tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self); +/** +@brief Decode full-length alignments for specified nodes over an interval. + +@rst +Fills a caller-provided buffer with per-node sequence alignments for the interval +``[left, right)``. Each row is exactly ``L = right - left`` bytes with no trailing +terminator, and rows are tightly packed in row-major order in the output buffer. + +The output at non-site positions comes from the provided ``ref_seq`` slice +(``ref_seq[left:right]``); per-site alleles are overlaid onto this for each node. + +If the :c:macro:`TSK_ISOLATED_NOT_MISSING` option is +not set, nodes that are isolated (no parent and no children) within a tree +interval in ``[left, right)`` are rendered as the ``missing_data_character`` for +that interval. 
At site positions, decoded genotypes override any previous value; +if a genotype is missing (``TSK_MISSING_DATA``), the ``missing_data_character`` is +overlaid onto the reference base. + +Requirements and validation: + +- The tree sequence must have a discrete genome. +- ``left`` and ``right`` must be integers with ``0 <= left < right <= sequence_length``. +- ``ref_seq`` must be non-NULL and ``ref_seq_length == sequence_length``. +- Each allele at a site must be exactly one byte; alleles equal to + ``missing_data_character`` are not permitted. + +@endrst + +@param self A pointer to a :c:type:`tsk_treeseq_t` object. +@param ref_seq Pointer to a reference sequence buffer of length ``ref_seq_length``. +@param ref_seq_length The total length of ``ref_seq``; must equal the tree sequence +length. +@param nodes Array of node IDs to decode (may include non-samples). +@param num_nodes The number of nodes in ``nodes`` and rows in the output. +@param left The inclusive-left genomic coordinate of the output interval. +@param right The exclusive-right genomic coordinate of the output interval. +@param missing_data_character The byte to use for missing data. +@param alignments_out Output buffer of size at least ``num_nodes * (right - left)``. +@param options Bitwise option flags; supports :c:macro:`TSK_ISOLATED_NOT_MISSING`. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_treeseq_decode_alignments(const tsk_treeseq_t *self, const char *ref_seq, + tsk_size_t ref_seq_length, const tsk_id_t *nodes, tsk_size_t num_nodes, double left, + double right, char missing_data_character, char *alignments_out, + tsk_flags_t options); + int tsk_treeseq_get_individuals_population(const tsk_treeseq_t *self, tsk_id_t *output); int tsk_treeseq_get_individuals_time(const tsk_treeseq_t *self, double *output);