diff --git a/harmony_model_checker/charm/charm.c b/harmony_model_checker/charm/charm.c index 633f225f..68ada3f2 100644 --- a/harmony_model_checker/charm/charm.c +++ b/harmony_model_checker/charm/charm.c @@ -2654,61 +2654,102 @@ static void do_work_b(struct worker *w){ } } +static inline void do_work2_single_state(struct worker *w, struct shard *shard, struct state_header *sh){ + + struct state *sc = (struct state *) &sh[1]; + unsigned int size = state_size(sc); + + // See if this state has been computed before by looking up the node, + // or allocate if not. + bool new; + struct dict_assoc *hn = sdict_find_new(shard->states, &w->allocator, + sc, size, sh->noutgoing * sizeof(struct edge), &new, sh->hash); + // sc, size, sh->noutgoing * sizeof(struct edge), &new, meiyan3(sc)); + struct node *next = (struct node *) &hn[1]; + struct edge *edge = &node_edges(sh->node)[sh->edge_index]; + edge->dst = next; + + if (new) { + next->failed = edge->failed; + next->initial = false; + next->parent = sh->node; + next->len = sh->node->len + 1; + if (next->len > w->diameter) { + w->diameter = next->len; + } + next->nedges = sh->noutgoing; + + assert(shard->tb_tail->nresults == shard->tb_size % NRESULTS); + assert(shard->tb_tail->next == NULL); + shard->tb_size++; + shard->tb_tail->results[shard->tb_tail->nresults++] = next; + if (shard->tb_tail->nresults == NRESULTS) { + struct results_block *rb = walloc_fast(w, sizeof(*shard->tb_tail)); + rb->nresults = 0; + rb->next = NULL; + shard->tb_tail->next = rb; + shard->tb_tail = rb; + } + assert(shard->tb_tail->nresults == shard->tb_size % NRESULTS); + } + + // See if the node points sideways or backwards, in which + // case cycles in the graph are possible + else { + w->miss++; + if (next != sh->node && next->len <= sh->node->len) { + w->cycles_possible = true; + } + } +} + static void do_work2(struct worker *w){ struct shard *shard = &w->shard; // printf("WORK 2: %u: %u %lu\n", w->index, shard->sb_index, sizeof(shard->state_buffer)); + int pre_fetch_buf_size = 20; + struct state_header **pre_fetch_buf = malloc(sizeof(struct state_header *) * pre_fetch_buf_size); + int pre_fetch_buf_cnt = 0; + for (unsigned int i = 0; i < w->nworkers; i++) { struct shard *s2 = &global.workers[i].shard; for (struct state_header *sh = s2->peers[w->index].first; sh != NULL; sh = sh->next) { - struct state *sc = (struct state *) &sh[1]; - unsigned int size = state_size(sc); + + if (pre_fetch_buf_cnt >= pre_fetch_buf_size) { - // See if this state has been computed before by looking up the node, - // or allocate if not. - bool new; - struct dict_assoc *hn = sdict_find_new(shard->states, &w->allocator, - sc, size, sh->noutgoing * sizeof(struct edge), &new, sh->hash); - // sc, size, sh->noutgoing * sizeof(struct edge), &new, meiyan3(sc)); - struct node *next = (struct node *) &hn[1]; - struct edge *edge = &node_edges(sh->node)[sh->edge_index]; - edge->dst = next; + struct state_header *tmp_sh = sh; - if (new) { - next->failed = edge->failed; - next->initial = false; - next->parent = sh->node; - next->len = sh->node->len + 1; - if (next->len > w->diameter) { - w->diameter = next->len; + for (int i = 0; i < pre_fetch_buf_cnt; i++) { + sh = pre_fetch_buf[i]; + sdict_prefetch_beginning_assoc(shard->states, sh->hash); } - next->nedges = sh->noutgoing; - - assert(shard->tb_tail->nresults == shard->tb_size % NRESULTS); - assert(shard->tb_tail->next == NULL); - shard->tb_size++; - shard->tb_tail->results[shard->tb_tail->nresults++] = next; - if (shard->tb_tail->nresults == NRESULTS) { - struct results_block *rb = walloc_fast(w, sizeof(*shard->tb_tail)); - rb->nresults = 0; - rb->next = NULL; - shard->tb_tail->next = rb; - shard->tb_tail = rb; + for (int i = 0; i < pre_fetch_buf_cnt; i++) { + sh = pre_fetch_buf[i]; + do_work2_single_state(w, shard, sh); } - assert(shard->tb_tail->nresults == shard->tb_size % NRESULTS); - } + sh = tmp_sh; - // See if the node points sideways or backwards, in which - // case cycles in the graph are possible + pre_fetch_buf_cnt = 0; + pre_fetch_buf[pre_fetch_buf_cnt++] = sh; + sdict_prefetch_base_ptr(shard->states, sh->hash); + } else { - w->miss++; - if (next != sh->node && next->len <= sh->node->len) { - w->cycles_possible = true; - } + pre_fetch_buf[pre_fetch_buf_cnt++] = sh; + sdict_prefetch_base_ptr(shard->states, sh->hash); } } + + for (int i = 0; i < pre_fetch_buf_cnt; i++) { + struct state_header *sh = pre_fetch_buf[i]; + sdict_prefetch_beginning_assoc(shard->states, sh->hash); + } + + for (int i = 0; i < pre_fetch_buf_cnt; i++) { + struct state_header *sh = pre_fetch_buf[i]; + do_work2_single_state(w, shard, sh); + } } assert(shard->tb_index <= shard->tb_size); diff --git a/harmony_model_checker/charm/fastrange.h b/harmony_model_checker/charm/fastrange.h new file mode 100644 index 00000000..5d6281e8 --- /dev/null +++ b/harmony_model_checker/charm/fastrange.h @@ -0,0 +1,88 @@ +/** +* Fair maps to intervals without division. +* Reference : http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ +* +* (c) Daniel Lemire +* Apache License 2.0 +*/ +#ifndef INCLUDE_FASTRANGE_H +#define INCLUDE_FASTRANGE_H +#include // mostly for Microsoft compilers +#include // part of Visual Studio 2010 and better +#include // for size_t in C +#include // for size_t in C + +/** +* Given a value "word", produces an integer in [0,p) without division. +* The function is as fair as possible in the sense that if you iterate +* through all possible values of "word", then you will generate all +* possible outputs as uniformly as possible. +*/ +static inline uint32_t fastrange32(uint32_t word, uint32_t p) { + return (uint32_t)(((uint64_t)word * (uint64_t)p) >> 32); +} + +#if defined(_MSC_VER) && defined (_WIN64) +#include // should be part of all recent Visual Studio +#pragma intrinsic(_umul128) +#endif // defined(_MSC_VER) && defined (_WIN64) + + +/** +* Given a value "word", produces an integer in [0,p) without division. +* The function is as fair as possible in the sense that if you iterate +* through all possible values of "word", then you will generate all +* possible outputs as uniformly as possible. +*/ +static inline uint64_t fastrange64(uint64_t word, uint64_t p) { +#ifdef __SIZEOF_INT128__ // then we know we have a 128-bit int + return (uint64_t)(((__uint128_t)word * (__uint128_t)p) >> 64); +#elif defined(_MSC_VER) && defined(_WIN64) + // supported in Visual Studio 2005 and better + uint64_t highProduct; + _umul128(word, p, &highProduct); // ignore output + return highProduct; + unsigned __int64 _umul128( + unsigned __int64 Multiplier, + unsigned __int64 Multiplicand, + unsigned __int64 *HighProduct + ); +#else + return word % p; // fallback +#endif // __SIZEOF_INT128__ +} + + +#ifndef UINT32_MAX +#define UINT32_MAX (0xffffffff) +#endif // UINT32_MAX + +/** +* Given a value "word", produces an integer in [0,p) without division. +* The function is as fair as possible in the sense that if you iterate +* through all possible values of "word", then you will generate all +* possible outputs as uniformly as possible. +*/ +static inline size_t fastrangesize(size_t word, size_t p) { +#if (SIZE_MAX == UINT32_MAX) + return (size_t)fastrange32(word, p); +#else // assume 64-bit + return (size_t)fastrange64(word, p); +#endif // SIZE_MAX == UINT32_MAX +} + +/** +* Given a value "word", produces an integer in [0,p) without division. +* The function is as fair as possible in the sense that if you iterate +* through all possible values of "word", then you will generate all +* possible outputs as uniformly as possible. +*/ +static inline int fastrangeint(int word, int p) { +#if (SIZE_MAX == UINT32_MAX) + return (int)fastrange32(word, p); +#else // assume 64-bit + return (int)fastrange64(word, p); +#endif // (SIZE_MAX == UINT32_MAX) +} + +#endif// INCLUDE_FASTRANGE_H \ No newline at end of file diff --git a/harmony_model_checker/charm/prefetcher.h b/harmony_model_checker/charm/prefetcher.h new file mode 100644 index 00000000..9d844795 --- /dev/null +++ b/harmony_model_checker/charm/prefetcher.h @@ -0,0 +1,15 @@ +#ifndef PREFETCHER_H +#define PREFETCHER_H + +#include + +#if defined(__GNUC__) || defined(__clang__) + #define PREFETCH(addr, rw, locality) __builtin_prefetch((addr), (rw), (locality)) +#elif defined(_WIN32) + #include + #define PREFETCH(addr, rw, locality) _mm_prefetch((const char*)(addr), _MM_HINT_T0) +#else + #define PREFETCH(addr, rw, locality) // No prefetch available +#endif + +#endif // PREFETCHER_H \ No newline at end of file diff --git a/harmony_model_checker/charm/sdict.c b/harmony_model_checker/charm/sdict.c index 8b6520fc..3d94240c 100644 --- a/harmony_model_checker/charm/sdict.c +++ b/harmony_model_checker/charm/sdict.c @@ -1,3 +1,4 @@ +#include "fastrange.h" #include "head.h" #include diff --git a/harmony_model_checker/charm/sdict.h b/harmony_model_checker/charm/sdict.h index 4f13aac5..58cd973a 100644 --- a/harmony_model_checker/charm/sdict.h +++ b/harmony_model_checker/charm/sdict.h @@ -7,9 +7,18 @@ #include /* memcpy/memcmp */ #include +#include "fastrange.h" +#include "prefetcher.h" #include "hashdict.h" #include "thread.h" + +#define sdict_prefetch_base_ptr(dict, hash) \ + PREFETCH((dict)->stable + (hash) % (dict)->length, 1, 1) + +#define sdict_prefetch_beginning_assoc(dict, hash) \ + PREFETCH(&(dict)->stable[(hash) % (dict)->length], 1, 1) + struct sdict { char *whoami; unsigned int value_len;