diff --git a/harmony_model_checker/charm/charm.c b/harmony_model_checker/charm/charm.c
index 633f225f..68ada3f2 100644
--- a/harmony_model_checker/charm/charm.c
+++ b/harmony_model_checker/charm/charm.c
@@ -2654,61 +2654,102 @@ static void do_work_b(struct worker *w){
     }
 }
 
+static inline void do_work2_single_state(struct worker *w, struct shard *shard, struct state_header *sh){
+
+    struct state *sc = (struct state *) &sh[1];
+    unsigned int size = state_size(sc);
+
+    // See if this state has been computed before by looking up the node,
+    // or allocate if not.
+    bool new;
+    struct dict_assoc *hn = sdict_find_new(shard->states, &w->allocator,
+        sc, size, sh->noutgoing * sizeof(struct edge), &new, sh->hash);
+        // sc, size, sh->noutgoing * sizeof(struct edge), &new, meiyan3(sc));
+    struct node *next = (struct node *) &hn[1];
+    struct edge *edge = &node_edges(sh->node)[sh->edge_index];
+    edge->dst = next;
+
+    if (new) {
+        next->failed = edge->failed;
+        next->initial = false;
+        next->parent = sh->node;
+        next->len = sh->node->len + 1;
+        if (next->len > w->diameter) {
+            w->diameter = next->len;
+        }
+        next->nedges = sh->noutgoing;
+
+        assert(shard->tb_tail->nresults == shard->tb_size % NRESULTS);
+        assert(shard->tb_tail->next == NULL);
+        shard->tb_size++;
+        shard->tb_tail->results[shard->tb_tail->nresults++] = next;
+        if (shard->tb_tail->nresults == NRESULTS) {
+            struct results_block *rb = walloc_fast(w, sizeof(*shard->tb_tail));
+            rb->nresults = 0;
+            rb->next = NULL;
+            shard->tb_tail->next = rb;
+            shard->tb_tail = rb;
+        }
+        assert(shard->tb_tail->nresults == shard->tb_size % NRESULTS);
+    }
+
+    // See if the node points sideways or backwards, in which
+    // case cycles in the graph are possible
+    else {
+        w->miss++;
+        if (next != sh->node && next->len <= sh->node->len) {
+            w->cycles_possible = true;
+        }
+    }
+}
+
 static void do_work2(struct worker *w){
     struct shard *shard = &w->shard; 
 
     // printf("WORK 2: %u: %u %lu\n", w->index, shard->sb_index, sizeof(shard->state_buffer));
 
+    int pre_fetch_buf_size = 20;
+    struct state_header **pre_fetch_buf = malloc(sizeof(struct state_header *) * pre_fetch_buf_size);
+    int pre_fetch_buf_cnt = 0;
+
     for (unsigned int i = 0; i < w->nworkers; i++) {
         struct shard *s2 = &global.workers[i].shard;
         for (struct state_header *sh = s2->peers[w->index].first;
                                     sh != NULL; sh = sh->next) {
-            struct state *sc = (struct state *) &sh[1];
-            unsigned int size = state_size(sc);
+            
+            if (pre_fetch_buf_cnt >= pre_fetch_buf_size) {
 
-            // See if this state has been computed before by looking up the node,
-            // or allocate if not.
-            bool new;
-            struct dict_assoc *hn = sdict_find_new(shard->states, &w->allocator,
-                sc, size, sh->noutgoing * sizeof(struct edge), &new, sh->hash);
-                // sc, size, sh->noutgoing * sizeof(struct edge), &new, meiyan3(sc));
-            struct node *next = (struct node *) &hn[1];
-            struct edge *edge = &node_edges(sh->node)[sh->edge_index];
-            edge->dst = next;
+                struct state_header *tmp_sh = sh;
 
-            if (new) {
-                next->failed = edge->failed;
-                next->initial = false;
-                next->parent = sh->node;
-                next->len = sh->node->len + 1;
-                if (next->len > w->diameter) {
-                    w->diameter = next->len;
+                for (int i = 0; i < pre_fetch_buf_cnt; i++) {
+                    sh = pre_fetch_buf[i];
+                    sdict_prefetch_beginning_assoc(shard->states, sh->hash);
                 }
-                next->nedges = sh->noutgoing;
-
-                assert(shard->tb_tail->nresults == shard->tb_size % NRESULTS);
-                assert(shard->tb_tail->next == NULL);
-                shard->tb_size++;
-                shard->tb_tail->results[shard->tb_tail->nresults++] = next;
-                if (shard->tb_tail->nresults == NRESULTS) {
-                    struct results_block *rb = walloc_fast(w, sizeof(*shard->tb_tail));
-                    rb->nresults = 0;
-                    rb->next = NULL;
-                    shard->tb_tail->next = rb;
-                    shard->tb_tail = rb;
+                for (int i = 0; i < pre_fetch_buf_cnt; i++) {
+                    sh = pre_fetch_buf[i];
+                    do_work2_single_state(w, shard, sh);
                 }
-                assert(shard->tb_tail->nresults == shard->tb_size % NRESULTS);
-            }
+                sh = tmp_sh;
 
-            // See if the node points sideways or backwards, in which
-            // case cycles in the graph are possible
+                pre_fetch_buf_cnt = 0;
+                pre_fetch_buf[pre_fetch_buf_cnt++] = sh;
+                sdict_prefetch_base_ptr(shard->states, sh->hash);
+            }
             else {
-                w->miss++;
-                if (next != sh->node && next->len <= sh->node->len) {
-                    w->cycles_possible = true;
-                }
+                pre_fetch_buf[pre_fetch_buf_cnt++] = sh;
+                sdict_prefetch_base_ptr(shard->states, sh->hash);
             }
         }
+
+        for (int i = 0; i < pre_fetch_buf_cnt; i++) {
+            struct state_header *sh = pre_fetch_buf[i];
+            sdict_prefetch_beginning_assoc(shard->states, sh->hash);
+        }
+
+        for (int i = 0; i < pre_fetch_buf_cnt; i++) {
+            struct state_header *sh = pre_fetch_buf[i];
+            do_work2_single_state(w, shard, sh);
+        }
     }
 
     assert(shard->tb_index <= shard->tb_size);
diff --git a/harmony_model_checker/charm/fastrange.h b/harmony_model_checker/charm/fastrange.h
new file mode 100644
index 00000000..5d6281e8
--- /dev/null
+++ b/harmony_model_checker/charm/fastrange.h
@@ -0,0 +1,88 @@
+/**
+* Fair maps to intervals without division.
+* Reference : http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+*
+* (c) Daniel Lemire
+* Apache License 2.0
+*/
+#ifndef INCLUDE_FASTRANGE_H
+#define INCLUDE_FASTRANGE_H
+#include <iso646.h> // mostly for Microsoft compilers
+#include <stdint.h> // part of Visual Studio 2010 and better
+#include <stddef.h> // for size_t in C
+#include <limits.h> // for size_t in C
+
+/**
+* Given a value "word", produces an integer in [0,p) without division.
+* The function is as fair as possible in the sense that if you iterate
+* through all possible values of "word", then you will generate all
+* possible outputs as uniformly as possible.
+*/
+static inline uint32_t fastrange32(uint32_t word, uint32_t p) {
+	return (uint32_t)(((uint64_t)word * (uint64_t)p) >> 32);
+}
+
+#if defined(_MSC_VER) && defined (_WIN64)
+#include <intrin.h>// should be part of all recent Visual Studio
+#pragma intrinsic(_umul128)
+#endif // defined(_MSC_VER) && defined (_WIN64)
+
+
+/**
+* Given a value "word", produces an integer in [0,p) without division.
+* The function is as fair as possible in the sense that if you iterate
+* through all possible values of "word", then you will generate all
+* possible outputs as uniformly as possible.
+*/
+static inline uint64_t fastrange64(uint64_t word, uint64_t p) {
+#ifdef __SIZEOF_INT128__ // then we know we have a 128-bit int
+	return (uint64_t)(((__uint128_t)word * (__uint128_t)p) >> 64);
+#elif defined(_MSC_VER) && defined(_WIN64)
+	// supported in Visual Studio 2005 and better
+	uint64_t highProduct;
+	_umul128(word, p, &highProduct); // ignore output
+	return highProduct;
+	unsigned __int64 _umul128(
+		unsigned __int64 Multiplier,
+		unsigned __int64 Multiplicand,
+		unsigned __int64 *HighProduct
+	);
+#else
+	return word % p; // fallback
+#endif // __SIZEOF_INT128__
+}
+
+
+#ifndef UINT32_MAX
+#define UINT32_MAX  (0xffffffff)
+#endif // UINT32_MAX
+
+/**
+* Given a value "word", produces an integer in [0,p) without division.
+* The function is as fair as possible in the sense that if you iterate
+* through all possible values of "word", then you will generate all
+* possible outputs as uniformly as possible.
+*/
+static inline size_t fastrangesize(size_t word, size_t p) {
+#if (SIZE_MAX == UINT32_MAX)
+	return (size_t)fastrange32(word, p);
+#else // assume 64-bit
+	return (size_t)fastrange64(word, p);
+#endif // SIZE_MAX == UINT32_MAX
+}
+
+/**
+* Given a value "word", produces an integer in [0,p) without division.
+* The function is as fair as possible in the sense that if you iterate
+* through all possible values of "word", then you will generate all
+* possible outputs as uniformly as possible.
+*/
+static inline int fastrangeint(int word, int p) {
+#if (SIZE_MAX == UINT32_MAX)
+	return (int)fastrange32(word, p);
+#else // assume 64-bit
+	return (int)fastrange64(word, p);
+#endif // (SIZE_MAX == UINT32_MAX)
+}
+
+#endif// INCLUDE_FASTRANGE_H
\ No newline at end of file
diff --git a/harmony_model_checker/charm/prefetcher.h b/harmony_model_checker/charm/prefetcher.h
new file mode 100644
index 00000000..9d844795
--- /dev/null
+++ b/harmony_model_checker/charm/prefetcher.h
@@ -0,0 +1,15 @@
+#ifndef PREFETCHER_H
+#define PREFETCHER_H
+
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+    #define PREFETCH(addr, rw, locality) __builtin_prefetch((addr), (rw), (locality))
+#elif defined(_WIN32)
+    #include <xmmintrin.h>
+    #define PREFETCH(addr, rw, locality) _mm_prefetch((const char*)(addr), _MM_HINT_T0)
+#else
+    #define PREFETCH(addr, rw, locality) // No prefetch available
+#endif
+
+#endif // PREFETCHER_H
\ No newline at end of file
diff --git a/harmony_model_checker/charm/sdict.c b/harmony_model_checker/charm/sdict.c
index 8b6520fc..3d94240c 100644
--- a/harmony_model_checker/charm/sdict.c
+++ b/harmony_model_checker/charm/sdict.c
@@ -1,3 +1,4 @@
+#include "fastrange.h"
 #include "head.h"
 
 #include <assert.h>
diff --git a/harmony_model_checker/charm/sdict.h b/harmony_model_checker/charm/sdict.h
index 4f13aac5..58cd973a 100644
--- a/harmony_model_checker/charm/sdict.h
+++ b/harmony_model_checker/charm/sdict.h
@@ -7,9 +7,18 @@
 #include <string.h> /* memcpy/memcmp */
 #include <stdbool.h>
 
+#include "fastrange.h"
+#include "prefetcher.h"
 #include "hashdict.h"
 #include "thread.h"
 
+
+#define sdict_prefetch_base_ptr(dict, hash) \
+    PREFETCH((dict)->stable + (hash) % (dict)->length, 1, 1)
+
+#define sdict_prefetch_beginning_assoc(dict, hash) \
+    PREFETCH(&(dict)->stable[(hash) % (dict)->length], 1, 1)
+
 struct sdict {
     char *whoami;
     unsigned int value_len;