From 8d2f7d4e729655adb5fcfe799377912cfb43a032 Mon Sep 17 00:00:00 2001
From: Shamshura Egor <shamshuraegor@gmail.com>
Date: Thu, 23 Oct 2025 12:30:27 +0000
Subject: [PATCH 1/6] Add benchmarks

---
 CMakeLists.txt                          |  37 ++++-
 benchmarks/CMakeLists.txt               |  22 +++
 benchmarks/common/CMakeLists.txt        |   2 +
 benchmarks/common/include/bench/utils.h |  21 +++
 benchmarks/common/start.s               |  39 +++++
 benchmarks/fibrec/CMakeLists.txt        |   2 +
 benchmarks/fibrec/fibrec.c              |  31 ++++
 benchmarks/median/CMakeLists.txt        |   1 +
 benchmarks/median/median.c              |  42 +++++
 benchmarks/median/median.h              |  11 ++
 benchmarks/median/median_gendata.pl     | 140 ++++++++++++++++
 benchmarks/median/median_main.c         |  36 +++++
 benchmarks/multiply/CMakeLists.txt      |   1 +
 benchmarks/multiply/multiply.c          |  19 +++
 benchmarks/multiply/multiply.h          |  11 ++
 benchmarks/multiply/multiply_gendata.pl | 142 ++++++++++++++++
 benchmarks/multiply/multiply_main.c     |  39 +++++
 benchmarks/nopbench/CMakeLists.txt      |   4 +
 benchmarks/nopbench/nopbench.s.in       |   9 ++
 benchmarks/qsort/CMakeLists.txt         |   1 +
 benchmarks/qsort/qsort.c                | 154 ++++++++++++++++++
 benchmarks/qsort/qsort_gendata.pl       | 132 +++++++++++++++
 benchmarks/rsort/CMakeLists.txt         |   1 +
 benchmarks/rsort/qsort_gendata.pl       | 132 +++++++++++++++
 benchmarks/rsort/rsort.c                | 117 ++++++++++++++
 benchmarks/towers/CMakeLists.txt        |   1 +
 benchmarks/towers/towers_main.c         | 207 ++++++++++++++++++++++++
 benchmarks/vvadd/CMakeLists.txt         |   1 +
 benchmarks/vvadd/vvadd_gendata.pl       | 139 ++++++++++++++++
 benchmarks/vvadd/vvadd_main.c           |  44 +++++
 cmake/dependencies.cmake                |   6 +-
 cmake/toolchain/riscv.cmake             |  65 ++++++++
 src/hart/include/prot/hart.hh           |   2 +
 src/jit/lightning/CMakeLists.txt        |   6 +-
 src/jit/llvm/llvmbasedjit.cc            |  16 +-
 src/jit/mir/CMakeLists.txt              |   1 +
 tools/bench/Benchmark.py                |  98 +++++++++++
 tools/bench/DoBenchmark.py              | 135 ++++++++++++++++
 tools/bench/requirements.txt            |   4 +
 tools/sim/sim_app.cpp                   |  10 +-
 40 files changed, 1861 insertions(+), 20 deletions(-)
 create mode 100644 benchmarks/CMakeLists.txt
 create mode 100644 benchmarks/common/CMakeLists.txt
 create mode 100644 benchmarks/common/include/bench/utils.h
 create mode 100644 benchmarks/common/start.s
 create mode 100644 benchmarks/fibrec/CMakeLists.txt
 create mode 100644 benchmarks/fibrec/fibrec.c
 create mode 100644 benchmarks/median/CMakeLists.txt
 create mode 100644 benchmarks/median/median.c
 create mode 100644 benchmarks/median/median.h
 create mode 100755 benchmarks/median/median_gendata.pl
 create mode 100644 benchmarks/median/median_main.c
 create mode 100644 benchmarks/multiply/CMakeLists.txt
 create mode 100644 benchmarks/multiply/multiply.c
 create mode 100644 benchmarks/multiply/multiply.h
 create mode 100755 benchmarks/multiply/multiply_gendata.pl
 create mode 100644 benchmarks/multiply/multiply_main.c
 create mode 100644 benchmarks/nopbench/CMakeLists.txt
 create mode 100644 benchmarks/nopbench/nopbench.s.in
 create mode 100644 benchmarks/qsort/CMakeLists.txt
 create mode 100644 benchmarks/qsort/qsort.c
 create mode 100755 benchmarks/qsort/qsort_gendata.pl
 create mode 100644 benchmarks/rsort/CMakeLists.txt
 create mode 100755 benchmarks/rsort/qsort_gendata.pl
 create mode 100644 benchmarks/rsort/rsort.c
 create mode 100644 benchmarks/towers/CMakeLists.txt
 create mode 100644 benchmarks/towers/towers_main.c
 create mode 100644 benchmarks/vvadd/CMakeLists.txt
 create mode 100755 benchmarks/vvadd/vvadd_gendata.pl
 create mode 100644 benchmarks/vvadd/vvadd_main.c
 create mode 100644 cmake/toolchain/riscv.cmake
 create mode 100644 tools/bench/Benchmark.py
 create mode 100644 tools/bench/DoBenchmark.py
 create mode 100644 tools/bench/requirements.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7cd83a8..3bc98a4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,21 +1,42 @@
 cmake_minimum_required(VERSION 3.22 FATAL_ERROR)
 
-project(prot_jit)
+project(prot_jit LANGUAGES C CXX)
 
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
-  set_property(CACHE CMAKE_INSTALL_PREFIX PROPERTY VALUE "${CMAKE_BINARY_DIR}/install")
+  set_property(CACHE CMAKE_INSTALL_PREFIX
+               PROPERTY VALUE "${CMAKE_BINARY_DIR}/install")
 endif()
 
 option(PROT_ENABLE_WERROR "Enable -Werror option (CI)" OFF)
-
+option(PROT_BUILD_BENCHMARKS
+       "Enable benchmarks build (requires riscv gnu toolchain)" OFF)
 enable_testing()
 
-include(cmake/CPM.cmake)
-# Provide default CXX settings
-include(cmake/defaults.cmake)
-include(cmake/dependencies.cmake)
-include(cmake/utils.cmake)
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
+include(CPM)
+# Provide default CXX settings
+include(defaults)
+include(dependencies)
+include(utils)
 
 add_subdirectory(src)
 add_subdirectory(tools)
+
+# Cross-build benchmarks/ as its own CMake project using the RISC-V toolchain file.
+if(PROT_BUILD_BENCHMARKS)
+  include(ExternalProject) # provides ExternalProject_Add; safe if already included
+  set(RISCV_TOOLCHAIN "${CMAKE_CURRENT_SOURCE_DIR}/cmake/toolchain/riscv.cmake")
+  set(DEFAULT_ARGS
+      "-DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX}"
+      -DCMAKE_BUILD_TYPE:STRING=Release
+      "-DCMAKE_TOOLCHAIN_FILE:PATH=${RISCV_TOOLCHAIN}")
+  if(DEFINED RISCV_TOOLCHAIN_DIR) # forward the toolchain location only when the user gave one
+    list(APPEND DEFAULT_ARGS "-DRISCV_TOOLCHAIN_DIR:PATH=${RISCV_TOOLCHAIN_DIR}")
+  endif()
+  ExternalProject_Add(benchmarks
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/benchmarks"
+    BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/benchmarks"
+    CONFIGURE_HANDLED_BY_BUILD True BUILD_ALWAYS True
+    CMAKE_ARGS ${DEFAULT_ARGS})
+endif()
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..4b8200d
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,22 @@
+cmake_minimum_required(VERSION 3.22 FATAL_ERROR)
+project(psim-benchmarks LANGUAGES C ASM)
+
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+  set_property(CACHE CMAKE_INSTALL_PREFIX PROPERTY VALUE "${CMAKE_BINARY_DIR}/install")
+endif()
+
+# prot_add_bench(<target> <source>...): executable linked with bench_startup, installed to bench/.
+function(prot_add_bench tar)
+  add_executable(${tar} ${ARGN})
+  target_link_libraries(${tar} PRIVATE bench_startup)
+  install(TARGETS ${tar} DESTINATION bench)
+endfunction()
+add_subdirectory(common)
+add_subdirectory(qsort)
+add_subdirectory(towers)
+add_subdirectory(multiply)
+add_subdirectory(rsort)
+add_subdirectory(vvadd)
+add_subdirectory(median)
+add_subdirectory(nopbench)
+add_subdirectory(fibrec)
diff --git a/benchmarks/common/CMakeLists.txt b/benchmarks/common/CMakeLists.txt
new file mode 100644
index 0000000..811c2c5
--- /dev/null
+++ b/benchmarks/common/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_library(bench_startup OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/start.s")
+target_include_directories(bench_startup PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
diff --git a/benchmarks/common/include/bench/utils.h b/benchmarks/common/include/bench/utils.h
new file mode 100644
index 0000000..f4a320d
--- /dev/null
+++ b/benchmarks/common/include/bench/utils.h
@@ -0,0 +1,21 @@
+#ifndef UTILS_H_INCLUDED
+#define UTILS_H_INCLUDED
+
+
+static int verify(int n, const int *test, const int *verify) { // 0 if equal, else 1-based index of first mismatch
+  int i;
+  // Unrolled by two for faster verification; n / 2 * 2 is n rounded down to an even count
+  for (i = 0; i < n / 2 * 2; i += 2) {
+    int t0 = test[i], t1 = test[i + 1];
+    int v0 = verify[i], v1 = verify[i + 1];
+    if (t0 != v0)
+      return i + 1;
+    if (t1 != v1)
+      return i + 2;
+  }
+  if (n % 2 != 0 && test[n - 1] != verify[n - 1]) // odd n: compare the leftover last element
+    return n;
+  return 0;
+}
+
+#endif // UTILS_H_INCLUDED
diff --git a/benchmarks/common/start.s b/benchmarks/common/start.s
new file mode 100644
index 0000000..a9b0814
--- /dev/null
+++ b/benchmarks/common/start.s
@@ -0,0 +1,39 @@
+.global _start
+.section .text
+_start:                  # entry point: clear the registers, run main, exit with its result
+  li  x1, 0
+  # x2 (sp) is deliberately not cleared: it is set via sim
+  li  x3, 0
+  li  x4, 0
+  li  x5, 0
+  li  x6, 0
+  li  x7, 0
+  li  x8, 0
+  li  x9, 0
+  li  x10,0
+  li  x11,0
+  li  x12,0
+  li  x13,0
+  li  x14,0
+  li  x15,0
+  li  x16,0
+  li  x17,0
+  li  x18,0
+  li  x19,0
+  li  x20,0
+  li  x21,0
+  li  x22,0
+  li  x23,0
+  li  x24,0
+  li  x25,0
+  li  x26,0
+  li  x27,0
+  li  x28,0
+  li  x29,0
+  li  x30,0
+  li  x31,0
+
+  call main              # auipc+jalr reaches main at any distance; a bare jal is limited to +-1 MiB
+
+  li a7, 93              # ecall number 93 (exit in the Linux RISC-V ABI); a0 still holds main's result
+  ecall
diff --git a/benchmarks/fibrec/CMakeLists.txt b/benchmarks/fibrec/CMakeLists.txt
new file mode 100644
index 0000000..1dcaec1
--- /dev/null
+++ b/benchmarks/fibrec/CMakeLists.txt
@@ -0,0 +1,2 @@
+prot_add_bench(fibrec fibrec.c)
+target_compile_definitions(fibrec PRIVATE N=31)
diff --git a/benchmarks/fibrec/fibrec.c b/benchmarks/fibrec/fibrec.c
new file mode 100644
index 0000000..539d818
--- /dev/null
+++ b/benchmarks/fibrec/fibrec.c
@@ -0,0 +1,31 @@
+#ifndef N
+#error "N was not defined"
+#endif
+
+int verify(unsigned r, unsigned n) {
+    static const unsigned calculated[] = { // fib(i) with fib(0) = fib(1) = 1, modulo 2^32
+        1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987,
+        1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393,
+        196418, 317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887,
+        9227465, 14930352, 24157817, 39088169, 63245986, 102334155, 165580141,
+        267914296, 433494437, 701408733, 1134903170, 1836311903, 2971215073, 512559680,
+        3483774753, 3996334433, 3185141890, 2886509027, 1776683621, 368225352, 2144908973,
+        2513134325, 363076002, 2876210327, 3239286329, 1820529360, 764848393, 2585377753,
+        3350226146, 1640636603,
+    };
+    if (n >= sizeof(calculated) / sizeof(calculated[0]))
+        return 1; // n beyond the table: report failure instead of reading past the array
+    return r != calculated[n]; // 0 when r matches the expected value, 1 otherwise
+}
+
+unsigned fib(unsigned n) {
+    if (n <= 1)
+        return 1;
+    return fib(n - 1) + fib(n - 2);
+}
+
+int main() {
+    unsigned r = fib(N);
+    return verify(r, N);
+}
+
diff --git a/benchmarks/median/CMakeLists.txt b/benchmarks/median/CMakeLists.txt
new file mode 100644
index 0000000..74be551
--- /dev/null
+++ b/benchmarks/median/CMakeLists.txt
@@ -0,0 +1 @@
+prot_add_bench(median median.c median_main.c)
diff --git a/benchmarks/median/median.c b/benchmarks/median/median.c
new file mode 100644
index 0000000..1999185
--- /dev/null
+++ b/benchmarks/median/median.c
@@ -0,0 +1,42 @@
+// See LICENSE for license details.
+
+//**************************************************************************
+// Median filter (c version)
+//--------------------------------------------------------------------------
+// results[i] = median(input[i-1], input[i], input[i+1]) for 0 < i < n-1; needs n >= 1.
+void median( int n, int input[], int results[] )
+{
+  int A, B, C, i;
+
+  // Zero the ends
+  results[0]   = 0;
+  results[n-1] = 0;
+
+  // Do the filter
+  for ( i = 1; i < (n-1); i++ ) {
+
+    A = input[i-1];
+    B = input[i];
+    C = input[i+1];
+
+    if ( A < B ) {
+      if ( B < C )
+        results[i] = B;      // A < B < C
+      else if ( C < A )
+        results[i] = A;      // C < A < B
+      else
+        results[i] = C;      // A <= C <= B
+    }
+
+    else {                   // B <= A
+      if ( A < C )
+        results[i] = A;      // B <= A < C
+      else if ( C < B )
+        results[i] = B;      // C < B <= A
+      else
+        results[i] = C;      // B <= C <= A
+    }
+
+  }
+
+}
diff --git a/benchmarks/median/median.h b/benchmarks/median/median.h
new file mode 100644
index 0000000..7f9791c
--- /dev/null
+++ b/benchmarks/median/median.h
@@ -0,0 +1,11 @@
+// See LICENSE for license details.
+
+//**************************************************************************
+// Median filters
+//--------------------------------------------------------------------------
+
+// Simple C version
+void median( int n, int input[], int results[] );
+
+// Simple assembly version
+void median_asm( int n, int input[], int results[] );
diff --git a/benchmarks/median/median_gendata.pl b/benchmarks/median/median_gendata.pl
new file mode 100755
index 0000000..373904e
--- /dev/null
+++ b/benchmarks/median/median_gendata.pl
@@ -0,0 +1,140 @@
+#!/usr/bin/perl -w
+#==========================================================================
+# median_gendata.pl
+#
+# Author : Christopher Batten (cbatten@mit.edu)
+# Date   : May 9, 2005
+#
+(our $usageMsg = <<'ENDMSG') =~ s/^\#//gm;
+#
+# Simple script which creates an input data set and the reference data
+# for the median benchmark.
+#
+ENDMSG
+
+use strict "vars";
+use warnings;
+no  warnings("once");
+use Getopt::Long;
+
+#--------------------------------------------------------------------------
+# Command line processing
+#--------------------------------------------------------------------------
+
+our %opts;
+
+sub usage()
+{
+
+  print "\n";
+  print " Usage: median_gendata.pl [options] \n";
+  print "\n";
+  print " Options:\n";
+  print "  --help  print this message\n";
+  print "  --size  size of input data [750]\n";
+  print "  --seed  random seed [1]\n";
+  print "$usageMsg";
+
+  exit();
+}
+
+sub processCommandLine()
+{
+
+  $opts{"help"} = 0;
+  $opts{"size"} = 750;
+  $opts{"seed"} = 1;
+  Getopt::Long::GetOptions( \%opts, 'help|?', 'size:i', 'seed:i' ) or usage();
+  $opts{"help"} and usage();
+
+}
+
+#--------------------------------------------------------------------------
+# Helper Functions
+#--------------------------------------------------------------------------
+
+sub printArray
+{
+  my $arrayName = $_[0];
+  my $arrayRef  = $_[1];
+
+  my $numCols = 20;
+  my $arrayLen = scalar(@{$arrayRef});
+
+  print "int ".$arrayName."[DATA_SIZE] = \n";
+  print "{\n";
+
+  if ( $arrayLen <= $numCols ) {
+    print "  ";
+    for ( my $i = 0; $i < $arrayLen; $i++ ) {
+      print sprintf("%3d",$arrayRef->[$i]);
+      if ( $i != $arrayLen-1 ) {
+        print ", ";
+      }
+    }
+    print "\n";
+  }
+
+  else {
+    my $numRows = int($arrayLen/$numCols);
+    for ( my $j = 0; $j < $numRows; $j++ ) {
+      print "  ";
+      for ( my $i = 0; $i < $numCols; $i++ ) {
+        my $index = $j*$numCols + $i;
+        print sprintf("%3d",$arrayRef->[$index]);
+        if ( $index != $arrayLen-1 ) {
+          print ", ";
+        }
+      }
+      print "\n";
+    }
+
+    if ( $arrayLen > ($numRows*$numCols) ) {
+      print "  ";
+      for ( my $i = 0; $i < ($arrayLen-($numRows*$numCols)); $i++ ) {
+        my $index = $numCols*$numRows + $i;
+        print sprintf("%3d",$arrayRef->[$index]);
+        if ( $index != $arrayLen-1 ) {
+          print ", ";
+        }
+      }
+      print "\n";
+    }
+
+  }
+
+  print  "};\n\n";
+}
+
+#--------------------------------------------------------------------------
+# Main
+#--------------------------------------------------------------------------
+
+sub main()
+{
+
+  processCommandLine();
+  srand($opts{"seed"});
+
+  my @values;
+  for ( my $i = 0; $i < $opts{"size"}; $i++ ) {
+    push( @values, int(rand(999)) );
+  }
+
+  my @median;
+  $median[0] = 0;
+  $median[$opts{"size"}-1] = 0;
+  for ( my $i = 1; $i < $opts{"size"}-1; $i++ ) {
+    my @tempList = ( $values[$i-1], $values[$i], $values[$i+1] );
+    my @sorted = sort { $a <=> $b } @tempList;
+    $median[$i] = $sorted[1];
+  }
+
+  print "\n\#define DATA_SIZE ".$opts{"size"}." \n\n";
+  printArray( "input_data", \@values );
+  printArray( "verify_data", \@median );
+
+}
+
+main();
+
diff --git a/benchmarks/median/median_main.c b/benchmarks/median/median_main.c
new file mode 100644
index 0000000..2f9736f
--- /dev/null
+++ b/benchmarks/median/median_main.c
@@ -0,0 +1,36 @@
+// See LICENSE for license details.
+
+//**************************************************************************
+// Median filter benchmark
+//--------------------------------------------------------------------------
+//
+// This benchmark performs a 1D three element median filter. The
+// input data (and reference data) should be generated using the
+// median_gendata.pl perl script and dumped to a file named
+// dataset1.h.
+
+#include "bench/utils.h"
+#include "median.h"
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+#include "dataset1.h"
+
+//--------------------------------------------------------------------------
+// Main
+
+int main() {
+  int results_data[DATA_SIZE];
+
+#if PREALLOCATE
+  // If needed we preallocate everything in the caches
+  median(DATA_SIZE, input_data, results_data);
+#endif
+
+  // Do the filter
+  median(DATA_SIZE, input_data, results_data);
+
+  // Check the results
+  return verify(DATA_SIZE, results_data, verify_data);
+}
diff --git a/benchmarks/multiply/CMakeLists.txt b/benchmarks/multiply/CMakeLists.txt
new file mode 100644
index 0000000..d8b4c49
--- /dev/null
+++ b/benchmarks/multiply/CMakeLists.txt
@@ -0,0 +1 @@
+prot_add_bench(multiply multiply_main.c multiply.c)
diff --git a/benchmarks/multiply/multiply.c b/benchmarks/multiply/multiply.c
new file mode 100644
index 0000000..2d01e33
--- /dev/null
+++ b/benchmarks/multiply/multiply.c
@@ -0,0 +1,19 @@
+// See LICENSE for license details.
+
+// *************************************************************************
+// multiply function (c version)
+// -------------------------------------------------------------------------
+// Shift-and-add product: for each of 32 steps, adds the shifted y when the low bit of x is set.
+int multiply( int x, int y ) {
+ int i;
+ int result = 0;
+
+ for (i = 0; i < 32; i++) {
+   if ((x & 0x1) == 1)   // current lowest bit of x is set
+     result = result + y;
+   x = x >> 1;           // move on to the next bit of x
+   y = y << 1;           // y now carries the weight of that next bit
+ } 
+ return result;
+}
+
diff --git a/benchmarks/multiply/multiply.h b/benchmarks/multiply/multiply.h
new file mode 100644
index 0000000..b2b1cf7
--- /dev/null
+++ b/benchmarks/multiply/multiply.h
@@ -0,0 +1,11 @@
+// See LICENSE for license details.
+
+//**************************************************************************
+// Software multiply function
+//--------------------------------------------------------------------------
+
+// Simple C version
+int multiply(int x, int y);
+
+// Simple assembly version
+int multiply_asm(int x, int y);
diff --git a/benchmarks/multiply/multiply_gendata.pl b/benchmarks/multiply/multiply_gendata.pl
new file mode 100755
index 0000000..b8d8ed5
--- /dev/null
+++ b/benchmarks/multiply/multiply_gendata.pl
@@ -0,0 +1,142 @@
+#!/usr/bin/perl -w
+#==========================================================================
+# multiply_gendata.pl
+#
+# Author : Christopher Batten (cbatten@mit.edu)
+# Date   : May 9, 2005
+#
+(our $usageMsg = <<'ENDMSG') =~ s/^\#//gm;
+#
+# Simple script which creates an input data set and the reference data 
+# for the multiply benchmark.
+#
+ENDMSG
+
+use strict "vars";
+use warnings;
+no  warnings("once");
+use Getopt::Long;
+
+#--------------------------------------------------------------------------
+# Command line processing
+#--------------------------------------------------------------------------
+
+our %opts;
+
+sub usage()
+{
+
+  print "\n";
+  print " Usage: multiply_gendata.pl [options] \n";
+  print "\n";
+  print " Options:\n";
+  print "  --help  print this message\n";
+  print "  --size  size of input data [750]\n";
+  print "  --seed  random seed [1]\n";
+  print "$usageMsg";
+
+  exit();
+}
+
+sub processCommandLine()
+{
+
+  $opts{"help"} = 0;
+  $opts{"size"} = 750;
+  $opts{"seed"} = 1;
+  Getopt::Long::GetOptions( \%opts, 'help|?', 'size:i', 'seed:i' ) or usage();
+  $opts{"help"} and usage();
+
+}
+
+#--------------------------------------------------------------------------
+# Helper Functions
+#--------------------------------------------------------------------------
+
+sub printArray
+{
+  my $arrayName = $_[0];
+  my $arrayRef  = $_[1];
+
+  my $numCols = 20;
+  my $arrayLen = scalar(@{$arrayRef});
+
+  print "int ".$arrayName."[DATA_SIZE] = \n";
+  print "{\n";
+
+  if ( $arrayLen <= $numCols ) {
+    print "  ";
+    for ( my $i = 0; $i < $arrayLen; $i++ ) {
+      print sprintf("%3d",$arrayRef->[$i]);
+      if ( $i != $arrayLen-1 ) {
+        print ", ";
+      }
+    }
+    print "\n";
+  }
+  
+  else {
+    my $numRows = int($arrayLen/$numCols);
+    for ( my $j = 0; $j < $numRows; $j++ ) {
+      print "  ";
+      for ( my $i = 0; $i < $numCols; $i++ ) {
+        my $index = $j*$numCols + $i;
+        print sprintf("%3d",$arrayRef->[$index]);
+        if ( $index != $arrayLen-1 ) {
+          print ", ";
+        }
+      }
+      print "\n";
+    }
+
+    if ( $arrayLen > ($numRows*$numCols) ) {
+      print "  ";
+      for ( my $i = 0; $i < ($arrayLen-($numRows*$numCols)); $i++ ) {
+        my $index = $numCols*$numRows + $i;
+        print sprintf("%3d",$arrayRef->[$index]);
+        if ( $index != $arrayLen-1 ) {
+          print ", ";
+        }
+      }
+      print "\n";
+    }
+
+  }
+
+  print  "};\n\n";
+}
+
+#--------------------------------------------------------------------------
+# Main
+#--------------------------------------------------------------------------
+
+sub main()
+{
+
+  processCommandLine();
+  srand($opts{"seed"});
+  
+  my @values1;
+  for ( my $i = 0; $i < $opts{"size"}; $i++ ) {
+    push( @values1, int(rand(999)) );
+  }
+  
+  my @values2;
+  for ( my $i = 0; $i < $opts{"size"}; $i++ ) {
+    push( @values2, int(rand(999)) );
+  }
+
+  my @multiply;
+  for ( my $i = 0; $i < $opts{"size"}; $i++ ) {
+    $multiply[$i] = $values1[$i] * $values2[$i];
+  }
+
+  print "\n\#define DATA_SIZE ".$opts{"size"}." \n\n";
+  printArray( "input_data1", \@values1 );
+  printArray( "input_data2", \@values2 );
+  printArray( "verify_data", \@multiply );
+
+}
+
+main();
+
diff --git a/benchmarks/multiply/multiply_main.c b/benchmarks/multiply/multiply_main.c
new file mode 100644
index 0000000..efc6e8a
--- /dev/null
+++ b/benchmarks/multiply/multiply_main.c
@@ -0,0 +1,39 @@
+// See LICENSE for license details.
+
+// *************************************************************************
+// Multiply benchmark
+// -------------------------------------------------------------------------
+//
+// This benchmark tests the software multiply implementation. The
+// input data (and reference data) should be generated using the
+// multiply_gendata.pl perl script and dumped to a file named
+// dataset1.h
+
+#include "bench/utils.h"
+#include "multiply.h"
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+#include "dataset1.h"
+
+//--------------------------------------------------------------------------
+// Main
+
+int main() {
+  int i;
+  int results_data[DATA_SIZE];
+
+#if PREALLOCATE
+  for (i = 0; i < DATA_SIZE; i++) {
+    results_data[i] = multiply(input_data1[i], input_data2[i]);
+  }
+#endif
+
+  for (i = 0; i < DATA_SIZE; i++) {
+    results_data[i] = multiply(input_data1[i], input_data2[i]);
+  }
+
+  // Check the results
+  return verify(DATA_SIZE, results_data, verify_data);
+}
diff --git a/benchmarks/nopbench/CMakeLists.txt b/benchmarks/nopbench/CMakeLists.txt
new file mode 100644
index 0000000..89de444
--- /dev/null
+++ b/benchmarks/nopbench/CMakeLists.txt
@@ -0,0 +1,4 @@
+# Number of nop instructions emitted into the benchmark; override with -DPROT_NOP_AMOUNT=<n>.
+set(PROT_NOP_AMOUNT 1000000 CACHE STRING "Number of nops executed by nopbench")
+configure_file(nopbench.s.in "${CMAKE_CURRENT_BINARY_DIR}/nopbench.s" @ONLY)
+prot_add_bench(nopbench "${CMAKE_CURRENT_BINARY_DIR}/nopbench.s")
diff --git a/benchmarks/nopbench/nopbench.s.in b/benchmarks/nopbench/nopbench.s.in
new file mode 100644
index 0000000..f2e965d
--- /dev/null
+++ b/benchmarks/nopbench/nopbench.s.in
@@ -0,0 +1,9 @@
+.global main
+.section .text
+main:
+    .rept @PROT_NOP_AMOUNT@
+        nop
+    .endr
+    li a0, 0
+    li a7, 93
+    ecall
diff --git a/benchmarks/qsort/CMakeLists.txt b/benchmarks/qsort/CMakeLists.txt
new file mode 100644
index 0000000..307b298
--- /dev/null
+++ b/benchmarks/qsort/CMakeLists.txt
@@ -0,0 +1 @@
+prot_add_bench(qsort qsort.c)
diff --git a/benchmarks/qsort/qsort.c b/benchmarks/qsort/qsort.c
new file mode 100644
index 0000000..35b42ac
--- /dev/null
+++ b/benchmarks/qsort/qsort.c
@@ -0,0 +1,154 @@
+// See LICENSE for license details.
+
+//**************************************************************************
+// Quicksort benchmark
+//--------------------------------------------------------------------------
+//
+// This benchmark uses quicksort to sort an array of integers. The
+// implementation is largely adapted from Numerical Recipes for C. The
+// input data (and reference data) should be generated using the
+// qsort_gendata.pl perl script and dumped to a file named
+// dataset1.h.
+
+// The INSERTION_THRESHOLD is the size of the subarray when the
+// algorithm switches to using an insertion sort instead of
+// quick sort.
+
+#include "bench/utils.h"
+
+typedef unsigned size_t;
+
+#define INSERTION_THRESHOLD 10
+
+// NSTACK is the required auxiliary storage.
+// It must be at least 2*lg(DATA_SIZE)
+
+#define NSTACK 50
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+#define type int
+#include "dataset1.h"
+
+// Swap macro for swapping two values.
+
+#define SWAP(a, b)                                                             \
+  do {                                                                         \
+    typeof(a) temp = (a);                                                      \
+    (a) = (b);                                                                 \
+    (b) = temp;                                                                \
+  } while (0)
+#define SWAP_IF_GREATER(a, b)                                                  \
+  do {                                                                         \
+    if ((a) > (b))                                                             \
+      SWAP(a, b);                                                              \
+  } while (0)
+
+//--------------------------------------------------------------------------
+// Quicksort function
+
+static void insertion_sort(size_t n, type arr[]) { // sorts arr[0..n-1] ascending, in place
+  type *i, *j;
+  type value;
+  for (i = arr + 1; i < arr + n; i++) {
+    value = *i; // element being inserted into the sorted prefix arr[0..i-1]
+    j = i;
+    while (value < *(j - 1)) { // shift larger elements one slot to the right
+      *j = *(j - 1);
+      if (--j == arr)
+        break; // reached the front: no element left to compare against
+    }
+    *j = value;
+  }
+}
+
+// static void selection_sort(size_t n, type arr[]) {
+//   for (type *i = arr; i < arr + n - 1; i++)
+//     for (type *j = i + 1; j < arr + n; j++)
+//       SWAP_IF_GREATER(*i, *j);
+// }
+
+void sort(size_t n, type arr[]) {
+  type *ir = arr + n;
+  type *l = arr + 1;
+  type *stack[NSTACK];
+  type **stackp = stack;
+
+  for (;;) {
+    // Insertion sort when subarray small enough.
+    if (ir - l < INSERTION_THRESHOLD) {
+      insertion_sort(ir - l + 1, l - 1);
+
+      if (stackp == stack)
+        break;
+
+      // Pop stack and begin a new round of partitioning.
+      ir = *stackp--;
+      l = *stackp--;
+    } else {
+      // Choose median of left, center, and right elements as
+      // partitioning element a. Also rearrange so that l[-1] <= l[0] <=
+      // ir[-1].
+      SWAP(arr[((l - arr) + (ir - arr)) / 2 - 1], l[0]);
+      SWAP_IF_GREATER(l[-1], ir[-1]);
+      SWAP_IF_GREATER(l[0], ir[-1]);
+      SWAP_IF_GREATER(l[-1], l[0]);
+
+      // Initialize pointers for partitioning.
+      type *i = l + 1;
+      type *j = ir;
+
+      // Partitioning element.
+      type a = l[0];
+
+      for (;;) { // Beginning of innermost loop.
+        while (*i++ < a)
+          ; // Scan up to find element > a.
+        while (*(j-- - 2) > a)
+          ; // Scan down to find element < a.
+        if (j < i)
+          break;            // Pointers crossed. Partitioning complete.
+        SWAP(i[-1], j[-1]); // Exchange elements.
+      } // End of innermost loop.
+
+      // Insert partitioning element.
+      l[0] = j[-1];
+      j[-1] = a;
+      stackp += 2;
+
+      // Push pointers to larger subarray on stack,
+      // process smaller subarray immediately.
+
+      if (ir - i + 1 >= j - l) {
+        stackp[0] = ir;
+        stackp[-1] = i;
+        ir = j - 1;
+      } else {
+        stackp[0] = j - 1;
+        stackp[-1] = l;
+        l = i;
+      }
+    }
+  }
+}
+
+//--------------------------------------------------------------------------
+// Main
+
+int main() {
+#if PREALLOCATE
+  // If needed we preallocate everything in the caches
+  sort(DATA_SIZE, verify_data);
+  if (verify(DATA_SIZE, input_data, input_data))
+    return 1;
+#endif
+
+  // Do the sort
+  // setStats(1);
+  sort(DATA_SIZE, input_data);
+  // setStats(0);
+
+  // Check the results
+  return verify(DATA_SIZE, input_data, verify_data);
+}
diff --git a/benchmarks/qsort/qsort_gendata.pl b/benchmarks/qsort/qsort_gendata.pl
new file mode 100755
index 0000000..92fc8fa
--- /dev/null
+++ b/benchmarks/qsort/qsort_gendata.pl
@@ -0,0 +1,132 @@
+#!/usr/bin/perl -w
+#==========================================================================
+# qsort_gendata.pl
+#
+# Author : Christopher Batten (cbatten@mit.edu)
+# Date   : April 29, 2005
+#
+(our $usageMsg = <<'ENDMSG') =~ s/^\#//gm;
+#
+# Simple script which creates an input data set and the reference data
+# for the qsort benchmark.
+#
+ENDMSG
+
+use strict "vars";
+use warnings;
+no  warnings("once");
+use Getopt::Long;
+
+#--------------------------------------------------------------------------
+# Command line processing
+#--------------------------------------------------------------------------
+
+our %opts;
+
+sub usage()
+{
+
+  print "\n";
+  print " Usage: qsort_gendata.pl [options] \n";
+  print "\n";
+  print " Options:\n";
+  print "  --help  print this message\n";
+  print "  --size  size of input data [250]\n";
+  print "  --seed  random seed [1]\n";
+  print "$usageMsg";
+
+  exit();
+}
+
+sub processCommandLine()
+{
+
+  $opts{"help"} = 0;
+  $opts{"size"} = 250;
+  $opts{"seed"} = 1;
+  Getopt::Long::GetOptions( \%opts, 'help|?', 'size:i', 'seed:i' ) or usage();
+  $opts{"help"} and usage();
+
+}
+
+#--------------------------------------------------------------------------
+# Helper Functions
+#--------------------------------------------------------------------------
+
+sub printArray
+{
+  my $arrayName = $_[0];
+  my $arrayRef  = $_[1];
+
+  my $numCols = 20;
+  my $arrayLen = scalar(@{$arrayRef});
+
+  print "type ".$arrayName."[DATA_SIZE] = \n";
+  print "{\n";
+
+  if ( $arrayLen <= $numCols ) {
+    print "  ";
+    for ( my $i = 0; $i < $arrayLen; $i++ ) {
+      print sprintf("%3d",$arrayRef->[$i]);
+      if ( $i != $arrayLen-1 ) {
+        print ", ";
+      }
+    }
+    print "\n";
+  }
+
+  else {
+    my $numRows = int($arrayLen/$numCols);
+    for ( my $j = 0; $j < $numRows; $j++ ) {
+      print "  ";
+      for ( my $i = 0; $i < $numCols; $i++ ) {
+        my $index = $j*$numCols + $i;
+        print sprintf("%3d",$arrayRef->[$index]);
+        if ( $index != $arrayLen-1 ) {
+          print ", ";
+        }
+      }
+      print "\n";
+    }
+
+    if ( $arrayLen > ($numRows*$numCols) ) {
+      print "  ";
+      for ( my $i = 0; $i < ($arrayLen-($numRows*$numCols)); $i++ ) {
+        my $index = $numCols*$numRows + $i;
+        print sprintf("%3d",$arrayRef->[$index]);
+        if ( $index != $arrayLen-1 ) {
+          print ", ";
+        }
+      }
+      print "\n";
+    }
+
+  }
+
+  print  "};\n\n";
+}
+
+#--------------------------------------------------------------------------
+# Main
+#--------------------------------------------------------------------------
+
+sub main()
+{
+
+  processCommandLine();
+  srand($opts{"seed"});
+
+  my @values;
+  for ( my $i = 0; $i < $opts{"size"}; $i++ ) {
+    push( @values, int(rand((1<<31)-1)) );
+  }
+  my @sorted = sort { $a <=> $b } @values;
+
+  print "\n\#define DATA_SIZE ".$opts{"size"}." \n\n";
+  printArray( "input_data", \@values );
+  printArray( "verify_data", \@sorted );
+
+}
+
+main();
+
diff --git a/benchmarks/rsort/CMakeLists.txt b/benchmarks/rsort/CMakeLists.txt
new file mode 100644
index 0000000..1872dc5
--- /dev/null
+++ b/benchmarks/rsort/CMakeLists.txt
@@ -0,0 +1 @@
+prot_add_bench(rsort rsort.c)
diff --git a/benchmarks/rsort/qsort_gendata.pl b/benchmarks/rsort/qsort_gendata.pl
new file mode 100755
index 0000000..92fc8fa
--- /dev/null
+++ b/benchmarks/rsort/qsort_gendata.pl
@@ -0,0 +1,132 @@
+#!/usr/bin/perl -w
+#==========================================================================
+# qsort_gendata.pl
+#
+# Author : Christopher Batten (cbatten@mit.edu)
+# Date   : April 29, 2005
+#
+(our $usageMsg = <<'ENDMSG') =~ s/^\#//gm;
+#
+# Simple script which creates an input data set and the reference data
+# for the qsort benchmark.
+#
+ENDMSG
+
+use strict "vars";
+use warnings;
+no  warnings("once");
+use Getopt::Long;
+
+#--------------------------------------------------------------------------
+# Command line processing
+#--------------------------------------------------------------------------
+
+our %opts;
+
+sub usage()
+{
+
+  print "\n";
+  print " Usage: qsort_gendata.pl [options] \n";
+  print "\n";
+  print " Options:\n";
+  print "  --help  print this message\n";
+  print "  --size  size of input data [250]\n";
+  print "  --seed  random seed [1]\n";
+  print "$usageMsg";
+
+  exit();
+}
+
+sub processCommandLine()
+{
+
+  $opts{"help"} = 0;
+  $opts{"size"} = 250;
+  $opts{"seed"} = 1;
+  Getopt::Long::GetOptions( \%opts, 'help|?', 'size:i', 'seed:i' ) or usage();
+  $opts{"help"} and usage();
+
+}
+
+#--------------------------------------------------------------------------
+# Helper Functions
+#--------------------------------------------------------------------------
+
+sub printArray
+{
+  my $arrayName = $_[0];
+  my $arrayRef  = $_[1];
+
+  my $numCols = 20;
+  my $arrayLen = scalar(@{$arrayRef});
+
+  print "type ".$arrayName."[DATA_SIZE] = \n";
+  print "{\n";
+
+  if ( $arrayLen <= $numCols ) {
+    print "  ";
+    for ( my $i = 0; $i < $arrayLen; $i++ ) {
+      print sprintf("%3d",$arrayRef->[$i]);
+      if ( $i != $arrayLen-1 ) {
+        print ", ";
+      }
+    }
+    print "\n";
+  }
+
+  else {
+    my $numRows = int($arrayLen/$numCols);
+    for ( my $j = 0; $j < $numRows; $j++ ) {
+      print "  ";
+      for ( my $i = 0; $i < $numCols; $i++ ) {
+        my $index = $j*$numCols + $i;
+        print sprintf("%3d",$arrayRef->[$index]);
+        if ( $index != $arrayLen-1 ) {
+          print ", ";
+        }
+      }
+      print "\n";
+    }
+
+    if ( $arrayLen > ($numRows*$numCols) ) {
+      print "  ";
+      for ( my $i = 0; $i < ($arrayLen-($numRows*$numCols)); $i++ ) {
+        my $index = $numCols*$numRows + $i;
+        print sprintf("%3d",$arrayRef->[$index]);
+        if ( $index != $arrayLen-1 ) {
+          print ", ";
+        }
+      }
+      print "\n";
+    }
+
+  }
+
+  print  "};\n\n";
+}
+
+#--------------------------------------------------------------------------
+# Main
+#--------------------------------------------------------------------------
+
+sub main()
+{
+
+  processCommandLine();
+  srand($opts{"seed"});
+
+  my @values;
+  for ( my $i = 0; $i < $opts{"size"}; $i++ ) {
+    push( @values, int(rand((1<<31)-1)) );
+  }
+  my @sorted = sort { $a <=> $b } @values;
+
+  print "\n\#define DATA_SIZE ".$opts{"size"}." \n\n";
+  printArray( "input_data", \@values );
+  printArray( "verify_data", \@sorted );
+
+}
+
+main();
+
diff --git a/benchmarks/rsort/rsort.c b/benchmarks/rsort/rsort.c
new file mode 100644
index 0000000..45f7ea9
--- /dev/null
+++ b/benchmarks/rsort/rsort.c
@@ -0,0 +1,117 @@
+// See LICENSE for license details.
+
+//**************************************************************************
+// Radix Sort benchmark
+//--------------------------------------------------------------------------
+//
+// This benchmark uses radix sort to sort an array of integers. The
+// implementation is largely adapted from Numerical Recipes for C. The
+// input data (and reference data) should be generated using the
+// qsort_gendata.pl perl script and dumped to a file named
+// dataset1.h
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+#include "bench/utils.h"
+
+#define type unsigned int
+#include "dataset1.h"
+
+#define LOG_BASE 8
+#define BASE (1 << LOG_BASE)
+
+#if 0
+#define fetch_add(ptr, inc) __sync_fetch_and_add(ptr, inc)
+#else
+#define fetch_add(ptr, inc)  ((*(ptr) += (inc)) - (inc))
+#define fetch_add1(ptr, inc)  (*(ptr) += (inc))
+#endif
+
+void memcpy_impl(void *dst, void const *src, unsigned n) {
+  for (unsigned i = 0; i < n; ++i)
+    ((char *)(dst))[i] = ((char *)(src))[i];
+}
+
+void sort(unsigned n, type *arrIn, type *scratchIn) {
+  unsigned log_exp = 0;
+  unsigned buckets[BASE];
+  unsigned *bucket = buckets;
+  asm("" : "+r"(bucket));
+  type *arr = arrIn, *scratch = scratchIn, *p;
+  unsigned *b;
+  // LSD radix sort of arrIn[0..n): one counting pass per LOG_BASE-bit digit.
+  while (log_exp < 8 * sizeof(type)) {
+    for (b = bucket; b < bucket + BASE; b++)
+      *b = 0;
+    // Histogram the current digit (unrolled by four, then the remainder).
+    for (p = arr; p < &arr[n - 3]; p += 4) {
+      type a0 = p[0];
+      type a1 = p[1];
+      type a2 = p[2];
+      type a3 = p[3];
+      fetch_add1(&bucket[(a0 >> log_exp) % BASE], 1);
+      fetch_add1(&bucket[(a1 >> log_exp) % BASE], 1);
+      fetch_add1(&bucket[(a2 >> log_exp) % BASE], 1);
+      fetch_add1(&bucket[(a3 >> log_exp) % BASE], 1);
+    }
+    for (; p < &arr[n]; p++)
+      bucket[(*p >> log_exp) % BASE]++;
+    // Inclusive prefix sum: bucket[d] becomes the end offset of digit d.
+    unsigned prev = bucket[0];
+    prev += fetch_add(&bucket[1], prev);
+    for (b = &bucket[2]; b < bucket + BASE; b += 2) {
+      prev += fetch_add(&b[0], prev);
+      prev += fetch_add(&b[1], prev);
+    }
+    // Scatter from the back so elements with equal digits keep their order.
+    for (p = &arr[n - 1]; p >= &arr[3]; p -= 4) {
+      type a0 = p[-0];
+      type a1 = p[-1];
+      type a2 = p[-2];
+      type a3 = p[-3];
+      unsigned *pb0 = &bucket[(a0 >> log_exp) % BASE];
+      unsigned *pb1 = &bucket[(a1 >> log_exp) % BASE];
+      unsigned *pb2 = &bucket[(a2 >> log_exp) % BASE];
+      unsigned *pb3 = &bucket[(a3 >> log_exp) % BASE];
+      type *s0 = scratch + fetch_add(pb0, -1);
+      type *s1 = scratch + fetch_add(pb1, -1);
+      type *s2 = scratch + fetch_add(pb2, -1);
+      type *s3 = scratch + fetch_add(pb3, -1);
+      s0[-1] = a0;
+      s1[-1] = a1;
+      s2[-1] = a2;
+      s3[-1] = a3;
+    }
+    for (; p >= &arr[0]; p--)
+      scratch[--bucket[(*p >> log_exp) % BASE]] = *p;
+    // The buffer just written becomes the input of the next pass.
+    type *tmp = arr;
+    arr = scratch;
+    scratch = tmp;
+
+    log_exp += LOG_BASE;
+  }
+  if (arr != arrIn) // odd number of passes: the result sits in scratchIn
+    memcpy_impl(arrIn, arr, n * sizeof(type));
+}
+
+//--------------------------------------------------------------------------
+// Main
+
+int main() {
+  static type scratch[DATA_SIZE];
+
+#if PREALLOCATE
+  // Optional warm-up pass so the timed run starts with the data in the caches
+  sort(DATA_SIZE, verify_data, scratch);
+  if (verify(DATA_SIZE, input_data, input_data))
+    return 1;
+#endif
+
+  // Do the sort
+  sort(DATA_SIZE, input_data, scratch);
+
+  // Check against the reference data; verify()'s result is the exit code
+  return verify(DATA_SIZE, (const int *)input_data, (const int *)verify_data);
+}
diff --git a/benchmarks/towers/CMakeLists.txt b/benchmarks/towers/CMakeLists.txt
new file mode 100644
index 0000000..96aac4e
--- /dev/null
+++ b/benchmarks/towers/CMakeLists.txt
@@ -0,0 +1 @@
+prot_add_bench(towers towers_main.c)
diff --git a/benchmarks/towers/towers_main.c b/benchmarks/towers/towers_main.c
new file mode 100644
index 0000000..7cef530
--- /dev/null
+++ b/benchmarks/towers/towers_main.c
@@ -0,0 +1,207 @@
+// See LICENSE for license details.
+
+//**************************************************************************
+// Towers of Hanoi benchmark
+//--------------------------------------------------------------------------
+//
+// Towers of Hanoi is a classic puzzle problem. The game consists of
+// three pegs and a set of discs. Each disc is a different size, and
+// initially all of the discs are on the left most peg with the smallest
+// disc on top and the largest disc on the bottom. The goal is to move all
+// of the discs onto the right most peg. The catch is that you are only
+// allowed to move one disc at a time and you can never place a larger
+// disc on top of a smaller disc.
+//
+// This implementation starts with NUM_DISCS discs and uses a recursive
+// algorithm to solve the puzzle.
+
+// This is the number of discs in the puzzle.
+
+#define NUM_DISCS 20
+
+//--------------------------------------------------------------------------
+// List data structure and functions
+
+struct Node {
+  int val;
+  struct Node *next;
+};
+
+struct List {
+  int size;
+  struct Node *head;
+};
+
+struct List g_nodeFreeList;
+struct Node g_nodePool[NUM_DISCS];
+
+int list_getSize(struct List *list) { return list->size; }
+
+void list_init(struct List *list) {
+  list->size = 0;
+  list->head = 0;
+}
+
+void list_push(struct List *list, int val) {
+  struct Node *newNode;
+
+  // Pop the next free node off the free list
+  newNode = g_nodeFreeList.head;
+  g_nodeFreeList.head = g_nodeFreeList.head->next;
+
+  // Push the new node onto the given list
+  newNode->next = list->head;
+  list->head = newNode;
+
+  // Assign the value
+  list->head->val = val;
+
+  // Increment size
+  list->size++;
+}
+
+int list_pop(struct List *list) {
+  struct Node *freedNode;
+  int val;
+
+  // Get the value from the head of the given list
+  val = list->head->val;
+
+  // Pop the head node off the given list
+  freedNode = list->head;
+  list->head = list->head->next;
+
+  // Push the freed node onto the free list
+  freedNode->next = g_nodeFreeList.head;
+  g_nodeFreeList.head = freedNode;
+
+  // Decrement size
+  list->size--;
+
+  return val;
+}
+
+void list_clear(struct List *list) {
+  while (list_getSize(list) > 0)
+    list_pop(list);
+}
+
+//--------------------------------------------------------------------------
+// Tower data structure and functions
+
+struct Towers {
+  int numDiscs;
+  int numMoves;
+  struct List pegA;
+  struct List pegB;
+  struct List pegC;
+};
+
+void towers_init(struct Towers *this, int n) {
+  int i;
+
+  this->numDiscs = n;
+  this->numMoves = 0;
+
+  list_init(&(this->pegA));
+  list_init(&(this->pegB));
+  list_init(&(this->pegC));
+
+  for (i = 0; i < n; i++)
+    list_push(&(this->pegA), n - i);
+}
+
+void towers_clear(struct Towers *this) {
+
+  list_clear(&(this->pegA));
+  list_clear(&(this->pegB));
+  list_clear(&(this->pegC));
+
+  towers_init(this, this->numDiscs);
+}
+
+void towers_solve_h(struct Towers *this, int n, struct List *startPeg,
+                    struct List *tempPeg, struct List *destPeg) {
+  int val;
+
+  if (n == 1) {
+    val = list_pop(startPeg);
+    list_push(destPeg, val);
+    this->numMoves++;
+  } else {
+    towers_solve_h(this, n - 1, startPeg, destPeg, tempPeg);
+    towers_solve_h(this, 1, startPeg, tempPeg, destPeg);
+    towers_solve_h(this, n - 1, tempPeg, startPeg, destPeg);
+  }
+}
+
+void towers_solve(struct Towers *this) {
+  towers_solve_h(this, this->numDiscs, &(this->pegA), &(this->pegB),
+                 &(this->pegC));
+}
+
+int towers_verify(struct Towers *this) {
+  struct Node *ptr;
+  int numDiscs = 0;
+
+  if (list_getSize(&this->pegA) != 0) {
+    return 2;
+  }
+
+  if (list_getSize(&this->pegB) != 0) {
+    return 3;
+  }
+
+  if (list_getSize(&this->pegC) != this->numDiscs) {
+    return 4;
+  }
+
+  for (ptr = this->pegC.head; ptr != 0; ptr = ptr->next) {
+    numDiscs++;
+    if (ptr->val != numDiscs) {
+      return 5;
+    }
+  }
+
+  if (this->numMoves != ((1 << this->numDiscs) - 1)) {
+    return 6;
+  }
+
+  return 0;
+}
+
+//--------------------------------------------------------------------------
+// Main
+
+int main() {
+  struct Towers towers;
+  int i;
+
+  // Initialize free list
+
+  list_init(&g_nodeFreeList);
+  g_nodeFreeList.head = &(g_nodePool[0]);
+  g_nodeFreeList.size = NUM_DISCS;
+  g_nodePool[NUM_DISCS - 1].next = 0;
+  g_nodePool[NUM_DISCS - 1].val = 99;
+  for (i = 0; i < (NUM_DISCS - 1); i++) {
+    g_nodePool[i].next = &(g_nodePool[i + 1]);
+    g_nodePool[i].val = i;
+  }
+
+  towers_init(&towers, NUM_DISCS);
+
+  // If needed we preallocate everything in the caches
+
+#if PREALLOCATE
+  towers_solve(&towers);
+#endif
+
+  // Solve it
+
+  towers_clear(&towers);
+  towers_solve(&towers);
+
+  // Check the results
+  return towers_verify(&towers);
+}
diff --git a/benchmarks/vvadd/CMakeLists.txt b/benchmarks/vvadd/CMakeLists.txt
new file mode 100644
index 0000000..67bc747
--- /dev/null
+++ b/benchmarks/vvadd/CMakeLists.txt
@@ -0,0 +1 @@
+prot_add_bench(vvadd vvadd_main.c)
diff --git a/benchmarks/vvadd/vvadd_gendata.pl b/benchmarks/vvadd/vvadd_gendata.pl
new file mode 100755
index 0000000..f23cdf4
--- /dev/null
+++ b/benchmarks/vvadd/vvadd_gendata.pl
@@ -0,0 +1,139 @@
+#!/usr/bin/perl -w
+#==========================================================================
+# vvadd_gendata.pl
+#
+# Author : Christopher Batten (cbatten@mit.edu)
+# Date   : April 29, 2005
+#
+(our $usageMsg = <<'ENDMSG') =~ s/^\#//gm;
+#
+# Simple script which creates an input data set and the reference data
+# for the vvadd benchmark.
+#
+ENDMSG
+
+use strict "vars";
+use warnings;
+no  warnings("once");
+use Getopt::Long;
+
+#--------------------------------------------------------------------------
+# Command line processing
+#--------------------------------------------------------------------------
+
+our %opts;
+
+sub usage()
+{
+
+  print "\n";
+  print " Usage: vvadd_gendata.pl [options] \n";
+  print "\n";
+  print " Options:\n";
+  print "  --help  print this message\n";
+  print "  --size  size of input data [1000]\n";
+  print "  --seed  random seed [1]\n";
+  print "$usageMsg";
+
+  exit();
+}
+
+sub processCommandLine()
+{
+
+  $opts{"help"} = 0;
+  $opts{"size"} = 1000;
+  $opts{"seed"} = 1;
+  Getopt::Long::GetOptions( \%opts, 'help|?', 'size:i', 'seed:i' ) or usage();
+  $opts{"help"} and usage();
+
+}
+
+#--------------------------------------------------------------------------
+# Helper Functions
+#--------------------------------------------------------------------------
+
+sub printArray
+{
+  my $arrayName = $_[0];
+  my $arrayRef  = $_[1];
+
+  my $numCols = 20;
+  my $arrayLen = scalar(@{$arrayRef});
+
+  print "int ".$arrayName."[DATA_SIZE] = \n";
+  print "{\n";
+
+  if ( $arrayLen <= $numCols ) {
+    print "  ";
+    for ( my $i = 0; $i < $arrayLen; $i++ ) {
+      print sprintf("%3d",$arrayRef->[$i]);
+      if ( $i != $arrayLen-1 ) {
+        print ", ";
+      }
+    }
+    print "\n";
+  }
+
+  else {
+    my $numRows = int($arrayLen/$numCols);
+    for ( my $j = 0; $j < $numRows; $j++ ) {
+      print "  ";
+      for ( my $i = 0; $i < $numCols; $i++ ) {
+        my $index = $j*$numCols + $i;
+        print sprintf("%3d",$arrayRef->[$index]);
+        if ( $index != $arrayLen-1 ) {
+          print ", ";
+        }
+      }
+      print "\n";
+    }
+
+    if ( $arrayLen > ($numRows*$numCols) ) {
+      print "  ";
+      for ( my $i = 0; $i < ($arrayLen-($numRows*$numCols)); $i++ ) {
+        my $index = $numCols*$numRows + $i;
+        print sprintf("%3d",$arrayRef->[$index]);
+        if ( $index != $arrayLen-1 ) {
+          print ", ";
+        }
+      }
+      print "\n";
+    }
+
+  }
+
+  print  "};\n\n";
+}
+
+#--------------------------------------------------------------------------
+# Main
+#--------------------------------------------------------------------------
+
+sub main()
+{
+
+  processCommandLine();
+  srand($opts{"seed"});
+
+  my @values1;
+  my @values2;
+  my @sum;
+  for ( my $i = 0; $i < $opts{"size"}; $i++ ) {
+    my $value1 = int(rand(999));
+    my $value2 = int(rand(999));
+    push( @values1, $value1 );
+    push( @values2, $value2 );
+    push( @sum, $value1 + $value2 );
+  }
+
+
+  print "\n\#define DATA_SIZE ".$opts{"size"}." \n\n";
+  printArray( "input1_data", \@values1 );
+  printArray( "input2_data", \@values2 );
+  printArray( "verify_data", \@sum );
+
+}
+
+main();
+
diff --git a/benchmarks/vvadd/vvadd_main.c b/benchmarks/vvadd/vvadd_main.c
new file mode 100644
index 0000000..87dc241
--- /dev/null
+++ b/benchmarks/vvadd/vvadd_main.c
@@ -0,0 +1,44 @@
+// See LICENSE for license details.
+
+//**************************************************************************
+// Vector-vector add benchmark
+//--------------------------------------------------------------------------
+//
+// This benchmark adds two vectors and writes the results to a
+// third vector. The input data (and reference data) should be
+// generated using the vvadd_gendata.pl perl script and dumped
+// to a file named dataset1.h.
+
+#include "bench/utils.h"
+
+//--------------------------------------------------------------------------
+// Input/Reference Data
+
+#include "dataset1.h"
+
+//--------------------------------------------------------------------------
+// vvadd function
+
+void vvadd(int n, int a[], int b[], int c[]) {
+  int i;
+  for (i = 0; i < n; i++)
+    c[i] = a[i] + b[i];
+}
+
+//--------------------------------------------------------------------------
+// Main
+
+int main() {
+  int results_data[DATA_SIZE];
+
+#if PREALLOCATE
+  // If needed we preallocate everything in the caches
+  vvadd(DATA_SIZE, input1_data, input2_data, results_data);
+#endif
+
+  // Do the vvadd
+  vvadd(DATA_SIZE, input1_data, input2_data, results_data);
+
+  // Check the results
+  return verify(DATA_SIZE, results_data, verify_data);
+}
diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
index 55dca5e..a7853a1 100644
--- a/cmake/dependencies.cmake
+++ b/cmake/dependencies.cmake
@@ -38,7 +38,8 @@ CPMAddPackage(
   VERSION 7.30
   EXCLUDE_FROM_ALL True
   SYSTEM True
-  DOWNLOAD_ONLY)
+  DOWNLOAD_ONLY True
+)
 
 # llvm
 find_package(LLVM 18.1.3 CONFIG REQUIRED)
@@ -68,4 +69,5 @@ CPMAddPackage(
   EXCLUDE_FROM_ALL True
   SYSTEM True
   OPTIONS "BUILD_TESTING OFF"
-  DOWNLOAD_ONLY True)
+  DOWNLOAD_ONLY True
+)
diff --git a/cmake/toolchain/riscv.cmake b/cmake/toolchain/riscv.cmake
new file mode 100644
index 0000000..6a60de9
--- /dev/null
+++ b/cmake/toolchain/riscv.cmake
@@ -0,0 +1,65 @@
+# RISC-V cross-compilation toolchain file.
+# Usage: cmake -DCMAKE_TOOLCHAIN_FILE=path/to/this/file.cmake ..
+
+# Base settings
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_SYSTEM_PROCESSOR riscv)
+set(CMAKE_LINKER_TYPE DEFAULT)
+
+# Toolchain prefix (modify this according to your toolchain). Common prefixes:
+# riscv64-unknown-elf-, riscv64-unknown-linux-gnu-, riscv32-unknown-elf-
+set(RISCV_TOOLCHAIN_PREFIX
+    "riscv32-unknown-elf-"
+    CACHE STRING "RISC-V toolchain prefix")
+
+# Target architecture (modify these according to your needs)
+set(RISCV_ARCH
+    "rv32i"
+    CACHE STRING "RISC-V architecture")
+set(RISCV_ABI
+    "ilp32"
+    CACHE STRING "RISC-V ABI")
+
+# Cross-compilation tools
+if(DEFINED RISCV_TOOLCHAIN_DIR)
+  set(CMAKE_C_COMPILER "${RISCV_TOOLCHAIN_DIR}/${RISCV_TOOLCHAIN_PREFIX}gcc")
+  set(CMAKE_CXX_COMPILER "${RISCV_TOOLCHAIN_DIR}/${RISCV_TOOLCHAIN_PREFIX}g++")
+  set(CMAKE_ASM_COMPILER "${RISCV_TOOLCHAIN_DIR}/${RISCV_TOOLCHAIN_PREFIX}gcc")
+  set(CMAKE_AR "${RISCV_TOOLCHAIN_DIR}/${RISCV_TOOLCHAIN_PREFIX}ar")
+  set(CMAKE_RANLIB "${RISCV_TOOLCHAIN_DIR}/${RISCV_TOOLCHAIN_PREFIX}ranlib")
+else()
+  set(CMAKE_C_COMPILER "${RISCV_TOOLCHAIN_PREFIX}gcc")
+  set(CMAKE_CXX_COMPILER "${RISCV_TOOLCHAIN_PREFIX}g++")
+  set(CMAKE_ASM_COMPILER "${RISCV_TOOLCHAIN_PREFIX}gcc")
+  set(CMAKE_AR "${RISCV_TOOLCHAIN_PREFIX}ar")
+  set(CMAKE_RANLIB "${RISCV_TOOLCHAIN_PREFIX}ranlib")
+endif()
+
+# Search settings: programs from the host, everything else from the target root
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+
+# Optional: specify a sysroot if needed, e.g.
+# set(CMAKE_SYSROOT "/path/to/riscv/sysroot")
+
+# Bare-metal compile flags shared by the C and assembly sources
+set(COMMON_FLAGS
+    "-march=${RISCV_ARCH} -mabi=${RISCV_ABI} -Wall -Wextra -nostdlib -nodefaultlibs -nostartfiles -static -fno-builtin"
+)
+set(CMAKE_C_FLAGS
+    "${COMMON_FLAGS}"
+    CACHE STRING "C compiler flags")
+set(CMAKE_ASM_FLAGS
+    "${COMMON_FLAGS}"
+    CACHE STRING "ASM compiler flags")
+
+set(CMAKE_EXE_LINKER_FLAGS_INIT
+    "-nostdlib -nodefaultlibs -nostartfiles -fno-builtin -static"
+    CACHE STRING "Linker flags")
+
+# Checks cannot link without a runtime; forward RISCV_* to try_compile() too.
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
+list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES RISCV_TOOLCHAIN_DIR
+     RISCV_TOOLCHAIN_PREFIX RISCV_ARCH RISCV_ABI)
diff --git a/src/hart/include/prot/hart.hh b/src/hart/include/prot/hart.hh
index 929421d..68b4f9c 100644
--- a/src/hart/include/prot/hart.hh
+++ b/src/hart/include/prot/hart.hh
@@ -29,6 +29,8 @@ public:
 
   auto getExitCode() { return m_cpu->getExitCode(); }
 
+  auto getIcount() const { return m_cpu->icount; }
+
 private:
   std::unique_ptr<Memory> m_mem;
   std::unique_ptr<CPUState> m_cpu;
diff --git a/src/jit/lightning/CMakeLists.txt b/src/jit/lightning/CMakeLists.txt
index 8317738..a5183a0 100644
--- a/src/jit/lightning/CMakeLists.txt
+++ b/src/jit/lightning/CMakeLists.txt
@@ -22,9 +22,9 @@ ExternalProject_Add(
   # Build & install
   BUILD_COMMAND ${MAKE_EXECUTABLE} -j
   INSTALL_COMMAND ${MAKE_EXECUTABLE} install
-  LOG_CONFIGURE True
-  LOG_BUILD True
-  LOG_MERGED_STDOUTERR True
+  LOG_CONFIGURE False
+  LOG_BUILD False
+  LOG_MERGED_STDOUTERR False
   LOG_OUTPUT_ON_FAILURE True
   BUILD_IN_SOURCE 1
   BUILD_BYPRODUCTS ${LIGHTNING_LIBRARY})
diff --git a/src/jit/llvm/llvmbasedjit.cc b/src/jit/llvm/llvmbasedjit.cc
index 15bd470..fd51869 100644
--- a/src/jit/llvm/llvmbasedjit.cc
+++ b/src/jit/llvm/llvmbasedjit.cc
@@ -98,6 +98,7 @@ void doSyscall(CPUState &state) { state.emulateSysCall(); }
 
 class LLVMBasedJIT : public JitEngine {
   std::unique_ptr<llvm::orc::LLJIT> m_jit;
+  std::unordered_map<isa::Word, llvm::orc::ExecutorAddr> m_cache;
 
   using TBFunc = void (*)(CPUState &);
 
@@ -107,9 +108,11 @@ class LLVMBasedJIT : public JitEngine {
 private:
   bool doJIT(CPUState &state) override {
     const auto pc = state.getPC();
-    auto found = m_jit->lookup(std::to_string(pc));
-    if (found) {
-      std::invoke(found->toPtr<TBFunc>(), state);
+    llvm::orc::ExecutorAddr found;
+    auto it = m_cache.find(pc);
+    if (it != m_cache.end()) {
+      found = it->second;
+      std::invoke(found.toPtr<TBFunc>(), state);
       const auto *bbInfo = getBBInfo(pc);
       state.icount += bbInfo->insns.size();
       return true;
@@ -128,9 +131,12 @@ class LLVMBasedJIT : public JitEngine {
       return false;
     }
 
-    found = m_jit->lookup(std::to_string(pc));
+    bool hasFound;
+    std::tie(it, hasFound) =
+        m_cache.insert({pc, *m_jit->lookup(std::to_string(pc))});
+    found = it->second;
     assert(found);
-    std::invoke(found->toPtr<TBFunc>(), state);
+    std::invoke(found.toPtr<TBFunc>(), state);
     state.icount += bbInfo->insns.size();
     return true;
   }
diff --git a/src/jit/mir/CMakeLists.txt b/src/jit/mir/CMakeLists.txt
index 5e12a40..bc8a7bc 100644
--- a/src/jit/mir/CMakeLists.txt
+++ b/src/jit/mir/CMakeLists.txt
@@ -6,6 +6,7 @@ add_library(prot_mir OBJECT ${mir_SOURCE_DIR}/mir.c
 
 add_library(prot_mir_static STATIC)
 target_link_libraries(prot_mir_static PRIVATE prot_mir)
+target_compile_features(prot_mir_static PRIVATE c_std_11)
 
 add_library(prot_jit_mir STATIC mir.cc)
 
diff --git a/tools/bench/Benchmark.py b/tools/bench/Benchmark.py
new file mode 100644
index 0000000..de8781b
--- /dev/null
+++ b/tools/bench/Benchmark.py
@@ -0,0 +1,98 @@
+import subprocess
+
+sim_exe_path = None
+
+
+class BenchmarkData:
+    def __init__(self, time_s: float, instruction_count: int, threshold_num: int):
+        self.time_s = time_s
+        self.instruction_count = instruction_count
+        self.threshold_num = threshold_num
+
+    def mips(self) -> float:
+        return self.instruction_count / (1000000 * self.time_s)
+
+    def time(self) -> float:
+        return self.time_s
+
+    def icount(self) -> int:
+        return self.instruction_count
+
+    def threshold(self) -> int:
+        return self.threshold_num
+
+
+class BenchmarkRequest:
+    def __init__(self, elf_path: str, threshold_num: int = None, jit_name: str = 'interp'):
+        self.threshold_num = threshold_num
+        self.jit_name = jit_name
+        self.elf_path = elf_path
+    
+
+class BenchmarkExitResult:
+    def __init__(self, exit_code: int, stdout: str, stderr: str):
+        self.exit_code = exit_code
+        self.stdout = stdout.rstrip()
+        self.stderr = stderr.rstrip()
+
+    def benchmark_data(self) -> BenchmarkData:
+        self.assert_success()
+        return self.parse_stdout()
+    
+    def assert_success(self) -> None:
+        assert self.exit_code == 0, self.stderr
+        assert self.stderr == ''
+
+    def assert_error(self) -> None:
+        assert self.exit_code != 0, self.stdout
+        assert self.stdout == ''
+
+    def parse_stdout(self) -> BenchmarkData:
+        result = {}
+        for line in self.stdout.splitlines():
+            splitted = line.split(':', 1)
+            if (len(splitted) == 1): 
+                continue
+            key = splitted[0].strip()
+            value = splitted[1].strip().rstrip('s')
+            result[key] = value
+        return BenchmarkData(
+            float(result["time"]), 
+            int(result["icount"]), 
+            int(result["threshold"])
+        )
+
+
+class Benchmark:
+    def __init__(self, request: BenchmarkRequest):
+        self.request = request
+        pass
+    
+    def run(self):
+        command = [str(sim_exe_path)]
+        if (self.request.threshold_num != None):
+            # command.extend(['--thsreshold', str(self.request.threshold_num)])
+            command.extend(['--jit', str(self.request.jit_name)])
+        command.extend([str(self.request.elf_path)])
+        self._process = subprocess.Popen(
+            command,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            bufsize=1,
+        )
+        
+        assert self._process.stdin is not None, "Failed to open STDIN"
+        assert self._process.stdout is not None, "Failed to open STDOUT"
+        assert self._process.stderr is not None, "Failed to open STDERR"
+
+    def exit(self, timeout=100) -> BenchmarkExitResult:
+        self._process.stdin.close()
+        return self.wait_for_exit(timeout)
+
+    def wait_for_exit(self, timeout=10) -> BenchmarkExitResult:
+        exit_code = self._process.wait(timeout)
+        stdout = self._process.stdout.read()
+        stderr = self._process.stderr.read()
+        return BenchmarkExitResult(exit_code, stdout, stderr)
diff --git a/tools/bench/DoBenchmark.py b/tools/bench/DoBenchmark.py
new file mode 100644
index 0000000..056bf06
--- /dev/null
+++ b/tools/bench/DoBenchmark.py
@@ -0,0 +1,135 @@
+import statistics
+import seaborn as sns
+import matplotlib.pyplot as plt
+import pandas as pd
+import sys
+from itertools import repeat
+from tqdm.contrib.concurrent import thread_map
+import numpy as np
+import argparse
+from pathlib import Path
+import Benchmark
+
+
+class BenchmarkList:
+    def __init__(self, data: list[Benchmark.BenchmarkData], name: str):
+        self.data = data
+        self.name = name
+
+    def name(self):
+        return self.name
+
+    def data(self):
+        return self.data
+
+    def mips(self):
+        return statistics.geometric_mean([x.mips() for x in self.data])
+
+
+class HistData:
+    def __init__(self, bench: str, mips: float, backend: str):
+        self.bench = bench
+        self.mips = mips
+        self.backend = backend
+
+
+def build_hist(list_data: list[list[BenchmarkList]], bench_names: list[str]):
+    num_groups = len(list_data)
+    num_bars_per_group = len(list_data[0]) if num_groups > 0 else 0
+
+    data = {"Benchmark": [], "MIPS": [], "Backend": []}
+
+    for j in range(num_bars_per_group):
+        for i in range(num_groups):
+            data["Benchmark"].append(bench_names[i])
+            data["MIPS"].append(np.mean(list_data[i][j].mips()))
+            data["Backend"].append(list_data[i][j].name)
+
+    df = pd.DataFrame(data)
+    print(df)
+    filtered_df = df[~df["Backend"].isin(["interp", "cached interp"])]
+    avg_df = (
+        filtered_df.groupby("Benchmark")
+        .agg(
+            MIPS=("MIPS", "mean"),
+        )
+        .reset_index()
+        .assign(Backend="averaged-jit")
+    )
+
+    final_df = pd.concat([df, avg_df], ignore_index=True)
+    df = final_df
+
+    sns.set(style="whitegrid")
+
+    plt.figure(figsize=(12, 8))
+    bar_plot = sns.barplot(
+        x="Benchmark",
+        y="MIPS",
+        hue="Backend",
+        data=df,
+        palette=sns.color_palette(),
+    )
+    plt.ylabel("MIPS")
+    plt.legend(
+        title="Backends",
+        loc="upper right",
+        fontsize="small",
+        title_fontsize="medium",
+    )
+    sns.move_legend(bar_plot, "upper left", bbox_to_anchor=(1, 1))
+    plt.tight_layout()
+    plt.savefig("hist.png")
+    plt.close()
+
+
+def run_benchmark(request: Benchmark.BenchmarkRequest, times: int):
+    bench = Benchmark.Benchmark(request)
+    results = []
+    for _ in range(times):
+        bench.run()
+        exit_result = bench.exit()
+        exit_result.assert_success()
+        results.append(exit_result.benchmark_data())
+    return BenchmarkList(results, request.jit_name)
+
+
+def run_elf(elf, th_num: int, times: int):
+    requests = [Benchmark.BenchmarkRequest(elf)]
+    jits = ["cached-interp", "llvm", "xbyak", "asmjit", "lightning", "mir"]
+    for jit in jits:
+        requests.append(Benchmark.BenchmarkRequest(elf, th_num, jit))
+    data = []
+    print("Running: " + benchmark_name(elf))
+
+    for req in thread_map(run_benchmark, requests, repeat(times)):
+        data.append(req)
+    return data, benchmark_name(elf)
+
+
+def run_benchmarks(elf_paths: list[str], threshold_num: int, times: int):
+    list_data = []
+    bench_names = []
+    for data, name in map(
+        run_elf, elf_paths, repeat(threshold_num), repeat(times)
+    ):
+        list_data.append(data)
+        bench_names.append(name)
+    build_hist(list_data, bench_names)
+
+
+def benchmark_name(elf_path: Path):
+    return elf_path.name
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Tool for benchmarking")
+    parser.add_argument("sim_bin", type=Path, help="Path to sim binary")
+    parser.add_argument("elf", type=Path, nargs="+", help="Path(s) to benchmarks")
+    parser.add_argument("-t", "--times", type=int, default=2, help="Amount of times to run each bench")
+    parser.add_argument("--thres", type=int, default=0, help="JIT threshold")
+
+    args = parser.parse_args()
+
+    Benchmark.sim_exe_path = args.sim_bin
+    run_benchmarks(args.elf, args.thres, args.times)
diff --git a/tools/bench/requirements.txt b/tools/bench/requirements.txt
new file mode 100644
index 0000000..79032e0
--- /dev/null
+++ b/tools/bench/requirements.txt
@@ -0,0 +1,4 @@
+matplotlib
+tqdm
+seaborn
+pandas
diff --git a/tools/sim/sim_app.cpp b/tools/sim/sim_app.cpp
index 02d2b4d..e1cad6b 100644
--- a/tools/sim/sim_app.cpp
+++ b/tools/sim/sim_app.cpp
@@ -1,5 +1,6 @@
 #include <CLI/CLI.hpp>
 
+#include <chrono>
 #include <filesystem>
 
 #include <fmt/core.h>
@@ -49,10 +50,15 @@ int main(int argc, const char *argv[]) try {
     return hart;
   }();
 
-  hart.dump(std::cout);
+  auto start = std::chrono::high_resolution_clock::now();
   hart.run();
+  auto end = std::chrono::high_resolution_clock::now();
   hart.dump(std::cout);
-  fmt::println("Finish execution");
+  std::chrono::duration<double> duration = end - start;
+  fmt::println("icount: {}", hart.getIcount());
+  fmt::println("time: {}s", duration.count());
+  fmt::println("threshold: {}", 0);
+  fmt::println("mips: {}", hart.getIcount() / (duration.count() * 1000000));
   return hart.getExitCode();
 } catch (const std::exception &ex) {
   fmt::println(std::cerr, "Caught an exception of type {}, message: {}",

From 9ef558271f395002438e84652179883902533e52 Mon Sep 17 00:00:00 2001
From: Andrey Derzhavin <derzhavin.andrej@yandex.ru>
Date: Fri, 31 Oct 2025 16:52:32 +0300
Subject: [PATCH 2/6] Optimize mem

---
 src/jit/base/base.cc              | 44 ++++++++++++-----------
 src/memory/include/prot/memory.hh | 38 ++++++++++++++++++++
 src/memory/plain_memory.cc        | 59 +++++++++++++++++++++++++++----
 tools/sim/sim_app.cpp             |  2 +-
 4 files changed, 115 insertions(+), 28 deletions(-)

diff --git a/src/jit/base/base.cc b/src/jit/base/base.cc
index e72f037..2a81e54 100644
--- a/src/jit/base/base.cc
+++ b/src/jit/base/base.cc
@@ -10,33 +10,35 @@ extern "C" {
 
 namespace prot::engine {
 void JitEngine::step(CPUState &cpu) {
-  if (doJIT(cpu)) {
-    return;
-  }
+  while (!cpu.finished) {
+    if (doJIT(cpu)) {
+      continue;
+    }
 
-  // colllect bb
-  auto [bbIt, wasNew] = m_cacheBB.try_emplace(cpu.getPC());
-  if (wasNew) {
-    auto curAddr = bbIt->first;
-    auto &bb = bbIt->second;
+    // colllect bb
+    auto [bbIt, wasNew] = m_cacheBB.try_emplace(cpu.getPC());
+    if (wasNew) {
+      auto curAddr = bbIt->first;
+      auto &bb = bbIt->second;
 
-    while (true) {
-      auto bytes = cpu.memory->read<isa::Word>(curAddr);
-      auto inst = isa::Instruction::decode(bytes);
-      if (!inst.has_value()) {
-        throw std::runtime_error{
-            fmt::format("Cannot decode bytes: {:#x}", bytes)};
-      }
+      while (true) {
+        auto bytes = cpu.memory->read<isa::Word>(curAddr);
+        auto inst = isa::Instruction::decode(bytes);
+        if (!inst.has_value()) {
+          throw std::runtime_error{
+              fmt::format("Cannot decode bytes: {:#x}", bytes)};
+        }
 
-      bb.insns.push_back(*inst);
-      if (isa::isTerminator(inst->opcode())) {
-        break;
+        bb.insns.push_back(*inst);
+        if (isa::isTerminator(inst->opcode())) {
+          break;
+        }
+        curAddr += isa::kWordSize;
       }
-      curAddr += isa::kWordSize;
     }
-  }
 
-  interpret(cpu, bbIt->second);
+    interpret(cpu, bbIt->second);
+  }
 }
 void JitEngine::interpret(CPUState &cpu, BBInfo &info) {
   for (const auto &insn : info.insns) {
diff --git a/src/memory/include/prot/memory.hh b/src/memory/include/prot/memory.hh
index 06fe4e5..974b9d1 100644
--- a/src/memory/include/prot/memory.hh
+++ b/src/memory/include/prot/memory.hh
@@ -28,15 +28,72 @@ struct Memory {
 
   template <std::unsigned_integral T>
   [[nodiscard]] T read(isa::Addr addr) const {
+    // Dispatch the 8/16/32-bit widths to the overridable readN() hooks.
+    if constexpr (std::same_as<T, std::uint8_t>) {
+      return read8(addr);
+    }
+    if constexpr (std::same_as<T, std::uint16_t>) {
+      return read16(addr);
+    }
+    if constexpr (std::same_as<T, std::uint32_t>) {
+      return read32(addr);
+    }
+
     std::array<std::byte, sizeof(T)> buf;
     readBlock(addr, buf);
     return std::bit_cast<T>(buf);
   }
 
   template <std::unsigned_integral T> void write(isa::Addr addr, T val) {
+    // Dispatch the 8/16/32-bit widths to the overridable writeN() hooks.
+    if constexpr (std::same_as<T, std::uint8_t>) {
+      return write8(addr, val);
+    }
+    if constexpr (std::same_as<T, std::uint16_t>) {
+      return write16(addr, val);
+    }
+    if constexpr (std::same_as<T, std::uint32_t>) {
+      return write32(addr, val);
+    }
     const auto &buf = std::bit_cast<std::array<std::byte, sizeof(T)>>(val);
     writeBlock(buf, addr);
   }
+
+  // Width-specific hooks that implementations may override with a faster path.
+  // The defaults must go straight to readBlock()/writeBlock(): read<T>() and
+  // write<T>() dispatch back to these hooks, so calling them from here would
+  // recurse forever in any implementation that does not override the hook.
+  virtual std::uint8_t read8(isa::Addr addr) const {
+    return readViaBlock<std::uint8_t>(addr);
+  }
+  virtual std::uint16_t read16(isa::Addr addr) const {
+    return readViaBlock<std::uint16_t>(addr);
+  }
+  virtual std::uint32_t read32(isa::Addr addr) const {
+    return readViaBlock<std::uint32_t>(addr);
+  }
+  virtual void write8(isa::Addr addr, std::uint8_t val) {
+    writeViaBlock<std::uint8_t>(addr, val);
+  }
+  virtual void write16(isa::Addr addr, std::uint16_t val) {
+    writeViaBlock<std::uint16_t>(addr, val);
+  }
+  virtual void write32(isa::Addr addr, std::uint32_t val) {
+    writeViaBlock<std::uint32_t>(addr, val);
+  }
+
+private:
+  // Generic fallbacks built on the block interface, used by the default hooks.
+  template <std::unsigned_integral T> T readViaBlock(isa::Addr addr) const {
+    std::array<std::byte, sizeof(T)> buf;
+    readBlock(addr, buf);
+    return std::bit_cast<T>(buf);
+  }
+  template <std::unsigned_integral T>
+  void writeViaBlock(isa::Addr addr, T val) {
+    const auto &buf = std::bit_cast<std::array<std::byte, sizeof(T)>>(val);
+    writeBlock(buf, addr);
+  }
 };
 
 namespace memory {
diff --git a/src/memory/plain_memory.cc b/src/memory/plain_memory.cc
index 1f75c11..d69f7b6 100644
--- a/src/memory/plain_memory.cc
+++ b/src/memory/plain_memory.cc
@@ -7,26 +7,89 @@
 #include <cassert>
+#include <cstring>
+#include <memory>
 #include <vector>
 
+extern "C" {
+#include <sys/mman.h>
+}
+
 namespace prot::memory {
 namespace {
 class PlainMemory : public Memory {
+  // Deleter for the anonymous mapping that backs the guest memory.
+  struct Unmap {
+    std::size_t m_size = 0;
+
+  public:
+    explicit Unmap(std::size_t size) noexcept : m_size(size) {}
+
+    void operator()(void *ptr) const noexcept { ::munmap(ptr, m_size); }
+  };
+
+  // Typed accesses use memcpy instead of dereferencing a reinterpret_cast
+  // pointer: guest addresses are not guaranteed to be aligned for T, and the
+  // std::byte storage may not be accessed through an lvalue of unrelated type.
+  template <typename T> [[nodiscard]] T loadAs(isa::Addr addr) const {
+    T val;
+    std::memcpy(&val, translateAddr(addr), sizeof(T));
+    return val;
+  }
+  template <typename T> void storeAs(isa::Addr addr, T val) {
+    std::memcpy(translateAddr(addr), &val, sizeof(T));
+  }
+
 public:
   explicit PlainMemory(std::size_t size, isa::Addr start)
-      : m_data(size), m_start(start) {
+      : m_storage(
+            [size] {
+              auto *ptr =
+                  ::mmap(nullptr, size, PROT_READ | PROT_WRITE,
+                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
+              if (ptr == MAP_FAILED) {
+                throw std::runtime_error{fmt::format(
+                    "Failed to map {} bytes of guest memory", size)};
+              }
+
+              return static_cast<std::byte *>(ptr);
+            }(),
+            Unmap{size}),
+        m_data(m_storage.get(), size), m_start(start) {
     if (m_data.size() + m_start < m_start) {
       throw std::invalid_argument{
           fmt::format("Size {} or start addr {:#x} is too high", size, start)};
     }
   }
 
+  // Fixed-width hooks: unlike the block operations below, these do not call
+  // checkRange().
+  std::uint8_t read8(isa::Addr addr) const override {
+    return loadAs<std::uint8_t>(addr);
+  }
+  std::uint16_t read16(isa::Addr addr) const override {
+    return loadAs<std::uint16_t>(addr);
+  }
+  std::uint32_t read32(isa::Addr addr) const override {
+    return loadAs<std::uint32_t>(addr);
+  }
+
+  void write8(isa::Addr addr, std::uint8_t val) override {
+    storeAs(addr, val);
+  }
+  void write16(isa::Addr addr, std::uint16_t val) override {
+    storeAs(addr, val);
+  }
+  void write32(isa::Addr addr, std::uint32_t val) override {
+    storeAs(addr, val);
+  }
+
   void writeBlock(std::span<const std::byte> src, isa::Addr addr) override {
     checkRange(addr, src.size());
-    std::ranges::copy(src, translateAddr(addr));
+    std::memcpy(translateAddr(addr), src.data(), src.size());
   }
 
   void readBlock(isa::Addr addr, std::span<std::byte> dest) const override {
     checkRange(addr, dest.size());
-    std::ranges::copy_n(translateAddr(addr), dest.size(), dest.begin());
+    std::memcpy(dest.data(), translateAddr(addr), dest.size());
   }
 
 private:
@@ -57,7 +120,8 @@ class PlainMemory : public Memory {
     }
   }
 
-  std::vector<std::byte> m_data;
+  std::unique_ptr<std::byte, Unmap> m_storage;
+  std::span<std::byte> m_data;
   isa::Addr m_start{};
 };
 } // namespace
diff --git a/tools/sim/sim_app.cpp b/tools/sim/sim_app.cpp
index e1cad6b..2507361 100644
--- a/tools/sim/sim_app.cpp
+++ b/tools/sim/sim_app.cpp
@@ -43,7 +43,7 @@ int main(int argc, const char *argv[]) try {
         !jitBackend.empty() ? prot::engine::JitFactory::createEngine(jitBackend)
                             : std::make_unique<prot::engine::Interpreter>();
 
-    prot::Hart hart{prot::memory::makePaged(12), std::move(engine)};
+    prot::Hart hart{prot::memory::makePlain(4ULL << 30U), std::move(engine)};
     hart.load(loader);
     hart.setSP(stackTop);
 

From 8a70912671301677d24414c4fa0a08cf67ac14c6 Mon Sep 17 00:00:00 2001
From: Andrey Derzhavin <derzhavin.andrej@yandex.ru>
Date: Fri, 31 Oct 2025 20:36:00 +0300
Subject: [PATCH 3/6] Optimize JIT translation

---
 src/jit/asmjit/asmjit.cc              | 27 +----------
 src/jit/base/base.cc                  | 20 ++++++---
 src/jit/base/include/prot/jit/base.hh | 43 +++++++++++++++---
 src/jit/lightning/lightning.cc        | 33 ++------------
 src/jit/llvm/llvmbasedjit.cc          | 64 +++++++++++----------------
 src/jit/mir/mir.cc                    | 30 +------------
 src/jit/xbyak/xbyak.cc                | 33 +++-----------
 7 files changed, 91 insertions(+), 159 deletions(-)

diff --git a/src/jit/asmjit/asmjit.cc b/src/jit/asmjit/asmjit.cc
index db3356d..a9739bd 100644
--- a/src/jit/asmjit/asmjit.cc
+++ b/src/jit/asmjit/asmjit.cc
@@ -119,34 +119,9 @@ class AsmJit : public JitEngine {
   AsmJit() = default;
 
 private:
-  bool doJIT(CPUState &state) override {
-    const auto pc = state.getPC();
-    auto found = m_cacheTB.find(pc);
-    if (found != m_cacheTB.end()) {
-      found->second(state);
-      return true;
-    }
-
-    const auto *bbInfo = getBBInfo(pc);
-    if (bbInfo == nullptr) {
-      // No such bb yet
-      return false;
-    }
-
-    auto holder = translate(*bbInfo);
-
-    auto [it, wasNew] = m_cacheTB.try_emplace(pc, std::move(holder));
-    assert(wasNew);
-
-    it->second(state);
-
-    return true;
-  }
-
-  [[nodiscard]] JitFunction translate(const BBInfo &info);
+  [[nodiscard]] JitFunction translate(const BBInfo &info) override;
 
   asmjit::JitRuntime runtime;
-  std::unordered_map<isa::Addr, JitFunction> m_cacheTB;
 };
 
 template <typename T> void storeHelper(CPUState &state, isa::Addr addr, T val) {
diff --git a/src/jit/base/base.cc b/src/jit/base/base.cc
index 2a81e54..2d82b20 100644
--- a/src/jit/base/base.cc
+++ b/src/jit/base/base.cc
@@ -10,14 +10,17 @@ extern "C" {
 
 namespace prot::engine {
 void JitEngine::step(CPUState &cpu) {
-  while (!cpu.finished) {
-    if (doJIT(cpu)) {
+  while (!cpu.finished) [[likely]] {
+    // run the cached translation if there is one, otherwise collect bb
+    const auto pc = cpu.getPC();
+    auto found = m_tbCache.lookup(pc);
+    if (found != nullptr) [[likely]] {
+      found(cpu);
       continue;
     }
 
-    // colllect bb
-    auto [bbIt, wasNew] = m_cacheBB.try_emplace(cpu.getPC());
-    if (wasNew) {
+    auto [bbIt, wasNew] = m_cacheBB.try_emplace(pc);
+    if (wasNew) [[unlikely]] {
       auto curAddr = bbIt->first;
       auto &bb = bbIt->second;
 
@@ -35,6 +38,15 @@ void JitEngine::step(CPUState &cpu) {
         }
         curAddr += isa::kWordSize;
       }
+    } else if (bbIt->second.num_exec >= kExecThreshold) [[likely]] {
+      auto code = translate(bbIt->second);
+      if (code != nullptr) [[likely]] {
+        // Cache only real translations: a null entry can never hit in lookup()
+        // and would just evict whatever block shares this slot.
+        m_tbCache.insert(pc, code);
+        code(cpu);
+        continue;
+      }
     }
 
     interpret(cpu, bbIt->second);
diff --git a/src/jit/base/include/prot/jit/base.hh b/src/jit/base/include/prot/jit/base.hh
index 43e475f..460a6d9 100644
--- a/src/jit/base/include/prot/jit/base.hh
+++ b/src/jit/base/include/prot/jit/base.hh
@@ -4,17 +4,49 @@
 #include "prot/interpreter.hh"
 
 #include <functional>
+#include <map>
 #include <unordered_map>
 #include <vector>
 
 namespace prot::engine {
+using JitFunction = void (*)(CPUState &);
 class JitEngine : public Interpreter {
-  static constexpr std::size_t kExecThreshold = 0;
+  static constexpr std::size_t kExecThreshold = 10;
 
 public:
   void step(CPUState &cpu) override;
 
 protected:
+  struct TbCache {
+    static constexpr std::uint64_t kInvalidAddr{0};
+    static constexpr std::uint64_t kSizeLog2{22};
+    static constexpr std::uint64_t kSize{1ULL << kSizeLog2};
+    static constexpr std::uint64_t kGpaGranularityLog2{2};
+
+    struct Entry {
+      JitFunction func{};
+      std::uint32_t gpa{};
+    };
+
+    JitFunction lookup(std::uint32_t gpa) const {
+      const auto &entry = get(gpa);
+      return entry.gpa == gpa ? entry.func : nullptr;
+    }
+    void insert(std::uint32_t gpa, JitFunction func) {
+      get(gpa) = Entry{.func = func, .gpa = gpa};
+    }
+
+  private:
+    const Entry &get(std::uint32_t gpa) const { return m_cache[getHash(gpa)]; }
+    Entry &get(std::uint32_t gpa) { return m_cache[getHash(gpa)]; }
+
+    [[nodiscard]] static constexpr std::uint32_t getHash(std::uint32_t gpa) {
+      return (gpa >> kGpaGranularityLog2) & (kSize - 1);
+    }
+
+    std::array<Entry, kSize> m_cache;
+  };
+
   // simple bb counting
   struct BBInfo final {
     std::vector<isa::Instruction> insns;
@@ -29,19 +61,20 @@ private:
   }
 
 private:
-  virtual bool doJIT(CPUState &state) = 0;
+  [[nodiscard]] virtual JitFunction translate(const BBInfo &info) = 0;
 
+  TbCache m_tbCache;
   std::unordered_map<isa::Addr, BBInfo> m_cacheBB;
 };
 
 class CachedInterpreter final : public JitEngine {
-  bool doJIT([[maybe_unused]] CPUState &stat) override { return false; }
+  JitFunction translate(const BBInfo & /* unused */) override {
+    return nullptr;
+  }
 };
 
 // Helper class to store JITed code
 // Especially helpful for libraries w/out propper mem pool support
-
-using JitFunction = void (*)(CPUState &);
 class CodeHolder final {
   struct Unmap {
     std::size_t m_size = 0;
diff --git a/src/jit/lightning/lightning.cc b/src/jit/lightning/lightning.cc
index 23c403c..b2b2ea0 100644
--- a/src/jit/lightning/lightning.cc
+++ b/src/jit/lightning/lightning.cc
@@ -18,35 +18,12 @@ namespace {
 struct Lightning : public JitEngine {
   Lightning() { init_jit("JIT Research"); }
 
-  bool doJIT(CPUState &state) override {
-    const auto pc = state.getPC();
-    auto found = m_cacheTB.find(pc);
-    if (found != m_cacheTB.end()) {
-      found->second(state);
-      return true;
-    }
-
-    const auto *bbInfo = getBBInfo(pc);
-    if (bbInfo == nullptr) {
-      // No such bb yet
-      return false;
-    }
-
-    auto code = translate(*bbInfo);
-    auto [it, wasNew] = m_cacheTB.emplace(pc, std::move(code));
-    assert(wasNew);
-
-    it->second(state);
-
-    return true;
-  }
-
-  [[nodiscard]] CodeHolder translate(const BBInfo &info);
+  [[nodiscard]] JitFunction translate(const BBInfo &info) override;
 
   ~Lightning() override { finish_jit(); }
 
 private:
-  std::unordered_map<isa::Addr, CodeHolder> m_cacheTB;
+  std::vector<CodeHolder> m_holders;
 };
 
 void storeHelper(CPUState &state, isa::Addr addr,
@@ -86,7 +63,7 @@ class JITStateHolder final {
   std::unique_ptr<jit_state_t, Deleter> m_ptr;
 };
 
-CodeHolder Lightning::translate(const BBInfo &info) {
+JitFunction Lightning::translate(const BBInfo &info) {
   JITStateHolder holder;
   jit_state_t *_jit = holder.get();
 
@@ -331,13 +308,11 @@ CodeHolder Lightning::translate(const BBInfo &info) {
   jit_stxi_i(offsetof(CPUState, icount), JIT_V0, JIT_R0);
   jit_epilog();
 
-  auto code = std::move(holder).emit();
-
   // fmt::println("CODE!!");
   // jit_disassemble();
   // fmt::println("CODE END");
 
-  return code;
+  return m_holders.emplace_back(std::move(holder).emit()).as<JitFunction>();
 }
 } // namespace
 
diff --git a/src/jit/llvm/llvmbasedjit.cc b/src/jit/llvm/llvmbasedjit.cc
index fd51869..2db85ec 100644
--- a/src/jit/llvm/llvmbasedjit.cc
+++ b/src/jit/llvm/llvmbasedjit.cc
@@ -50,7 +50,7 @@ llvm::Type *getCPUStateType(llvm::LLVMContext &Ctx) {
   llvm::ArrayType *regsArrayType = llvm::ArrayType::get(wordType, 32);
 
   std::vector<llvm::Type *> structMemberTypes = {
-      regsArrayType, pcType, finishedType, memoryPtrType, icountType};
+      regsArrayType, pcType, finishedType, memoryPtrType, icountType, wordType};
 
   llvm::StructType *cpuStateType = llvm::StructType::create(
       Ctx, structMemberTypes, "CPUState", /*IsPacked=*/false);
@@ -98,7 +98,7 @@ void doSyscall(CPUState &state) { state.emulateSysCall(); }
 
 class LLVMBasedJIT : public JitEngine {
   std::unique_ptr<llvm::orc::LLJIT> m_jit;
-  std::unordered_map<isa::Word, llvm::orc::ExecutorAddr> m_cache;
+  std::size_t m_moduleId{};
 
   using TBFunc = void (*)(CPUState &);
 
@@ -106,43 +106,17 @@ class LLVMBasedJIT : public JitEngine {
   LLVMBasedJIT(std::unique_ptr<llvm::orc::LLJIT> JIT);
 
 private:
-  bool doJIT(CPUState &state) override {
-    const auto pc = state.getPC();
-    llvm::orc::ExecutorAddr found;
-    auto it = m_cache.find(pc);
-    if (it != m_cache.end()) {
-      found = it->second;
-      std::invoke(found.toPtr<TBFunc>(), state);
-      const auto *bbInfo = getBBInfo(pc);
-      state.icount += bbInfo->insns.size();
-      return true;
-    }
-
-    const auto *bbInfo = getBBInfo(pc);
-    if (bbInfo == nullptr) {
-      return false;
-    }
-
-    auto [module, ctx] = translate(pc, *bbInfo);
-    llvm::orc::ThreadSafeModule tsm(std::move(module), std::move(ctx));
-    tsm = optimizeIRModule(std::move(tsm));
-    auto err = m_jit->addIRModule(std::move(tsm));
-    if (err) {
-      return false;
-    }
-
-    bool hasFound;
-    std::tie(it, hasFound) =
-        m_cache.insert({pc, *m_jit->lookup(std::to_string(pc))});
-    found = it->second;
-    assert(found);
-    std::invoke(found.toPtr<TBFunc>(), state);
-    state.icount += bbInfo->insns.size();
-    return true;
+  JitFunction translate(const BBInfo &info) override {
+    // ExitOnError checks in all build types; assert() vanishes under NDEBUG.
+    const llvm::ExitOnError check{"LLVM JIT: "};
+    auto &&[module, ctx] = doTranslate(info);
+    llvm::orc::ThreadSafeModule tsm(std::move(module), std::move(ctx));
+    check(m_jit->addIRModule(optimizeIRModule(std::move(tsm))));
+    return check(m_jit->lookup(std::to_string(m_moduleId++))).toPtr<TBFunc>();
   }
 
   std::pair<std::unique_ptr<llvm::Module>, std::unique_ptr<llvm::LLVMContext>>
-  translate(isa::Word pc, const BBInfo &info);
+  doTranslate(const BBInfo &info);
   static llvm::orc::ThreadSafeModule
   optimizeIRModule(llvm::orc::ThreadSafeModule TSM);
 };
@@ -180,16 +154,17 @@ llvm::Function *getOrDeclareMemoryFunction(llvm::Module &M,
 }
 
 std::pair<std::unique_ptr<llvm::Module>, std::unique_ptr<llvm::LLVMContext>>
-LLVMBasedJIT::translate(const isa::Word pc, const BBInfo &info) {
+LLVMBasedJIT::doTranslate(const BBInfo &info) {
+  const auto &name = std::to_string(m_moduleId);
   auto ctxPtr = std::make_unique<llvm::LLVMContext>();
-  auto modulePtr = std::make_unique<llvm::Module>(std::to_string(pc), *ctxPtr);
+  auto modulePtr = std::make_unique<llvm::Module>(name, *ctxPtr);
   llvm::IRBuilder<> builder{*ctxPtr};
 
   auto *fnTy = llvm::FunctionType::get(
       llvm::Type::getVoidTy(*ctxPtr),
       {llvm::PointerType::getUnqual(getCPUStateType(*ctxPtr))}, false);
-  auto *fn = llvm::Function::Create(fnTy, llvm::Function::ExternalLinkage,
-                                    std::to_string(pc), *modulePtr);
+  auto *fn = llvm::Function::Create(fnTy, llvm::Function::ExternalLinkage, name,
+                                    *modulePtr);
 
   llvm::orc::MangleAndInterner mangle(m_jit->getExecutionSession(),
                                       m_jit->getDataLayout());
@@ -218,8 +193,21 @@ LLVMBasedJIT::translate(const isa::Word pc, const BBInfo &info) {
       }};
   llvm::BasicBlock *entryBB = llvm::BasicBlock::Create(*ctxPtr, "entry", fn);
   builder.SetInsertPoint(entryBB);
-
   std::ranges::for_each(info.insns, std::bind_front(buildInstruction, data));
+  auto *icountType =
+      llvm::IntegerType::get(*ctxPtr, sizeofBits<decltype(CPUState::icount)>());
+  auto *addend = llvm::ConstantInt::get(icountType, info.insns.size());
+  auto *cpuStructTy = getCPUStateType(data.Builder.getContext());
+
+  auto *cpuArg = data.CurrentFunction->getArg(0);
+
+  llvm::Value *icPtr = data.Builder.CreateStructGEP(cpuStructTy, cpuArg, 4);
+  auto *icVal = data.Builder.CreateLoad(icountType, icPtr);
+  auto *newVal = data.Builder.CreateAdd(icVal, addend);
+  data.Builder.CreateStore(newVal, icPtr);
+
+  // data.Module.dump();
+  // throw 1;
   data.Builder.CreateRetVoid();
 
   return std::make_pair(std::move(modulePtr), std::move(ctxPtr));
diff --git a/src/jit/mir/mir.cc b/src/jit/mir/mir.cc
index 6421e5d..518ecc0 100644
--- a/src/jit/mir/mir.cc
+++ b/src/jit/mir/mir.cc
@@ -172,32 +172,9 @@ class MIRJit : public JitEngine {
   }
 
 private:
-  bool doJIT(CPUState &state) override {
-    const auto pc = state.getPC();
-    auto found = m_cacheTB.find(pc);
-    if (found != m_cacheTB.end()) {
-      found->second(state);
-      return true;
-    }
-
-    const auto *bbInfo = getBBInfo(pc);
-    if (bbInfo == nullptr) {
-      return false;
-    }
-
-    auto func = translate(*bbInfo);
-    auto [it, wasNew] = m_cacheTB.try_emplace(pc, func);
-    assert(wasNew);
-
-    it->second(state);
-
-    return true;
-  }
-
-  [[nodiscard]] JitFunction translate(const BBInfo &info);
+  [[nodiscard]] JitFunction translate(const BBInfo &info) override;
 
   MIR_context_t ctx;
-  std::unordered_map<isa::Addr, JitFunction> m_cacheTB;
   std::unordered_map<std::string, MIR_item_t> m_func_proto{};
 };
 
@@ -413,10 +390,7 @@ JitFunction MIRJit::translate(const BBInfo &info) {
 
   MIR_link(ctx, MIR_set_gen_interface, nullptr);
 
-  JitFunction compiled_func =
-      reinterpret_cast<JitFunction>(MIR_gen(ctx, func_item));
-
-  return compiled_func;
+  return reinterpret_cast<JitFunction>(MIR_gen(ctx, func_item));
 }
 
 } // namespace
diff --git a/src/jit/xbyak/xbyak.cc b/src/jit/xbyak/xbyak.cc
index 26d55ea..18520da 100644
--- a/src/jit/xbyak/xbyak.cc
+++ b/src/jit/xbyak/xbyak.cc
@@ -20,32 +20,9 @@ class XByakJit : public JitEngine, private Xbyak::CodeGenerator {
       : Xbyak::CodeGenerator{Xbyak::DEFAULT_MAX_CODE_SIZE, Xbyak::AutoGrow} {}
 
 private:
-  bool doJIT(CPUState &state) override {
-    const auto pc = state.getPC();
-    auto found = m_cacheTB.find(pc);
-    if (found != m_cacheTB.end()) {
-      found->second(state);
-      return true;
-    }
-
-    const auto *bbInfo = getBBInfo(pc);
-    if (bbInfo == nullptr) {
-      // No such bb yet
-      return false;
-    }
-
-    auto holder = translate(*bbInfo);
-
-    auto [it, wasNew] = m_cacheTB.try_emplace(pc, std::move(holder));
-    assert(wasNew);
+  [[nodiscard]] JitFunction translate(const BBInfo &info) override;
 
-    it->second(state);
-
-    return true;
-  }
-
-  [[nodiscard]] CodeHolder translate(const BBInfo &info);
-  std::unordered_map<isa::Addr, CodeHolder> m_cacheTB;
+  std::vector<CodeHolder> m_holders;
 };
 
 void storeHelper(CPUState &state, isa::Addr addr,
@@ -59,7 +36,7 @@ template <typename T> T loadHelper(CPUState &state, isa::Addr addr) {
 
 void syscallHelper(CPUState &state) { state.emulateSysCall(); }
 
-CodeHolder XByakJit::translate(const BBInfo &info) {
+JitFunction XByakJit::translate(const BBInfo &info) {
   reset(); // XByak specific (CodeGenerator is about a PAGE size!!, so reuse it)
   Xbyak::util::StackFrame frame{this, 3, 3 | Xbyak::util::UseRCX};
 
@@ -306,9 +283,9 @@ CodeHolder XByakJit::translate(const BBInfo &info) {
 
   frame.close();
   ready();
-
   // Copy data to holder
-  return CodeHolder{std::as_bytes(std::span{getCode(), getSize()})};
+  return m_holders.emplace_back(std::as_bytes(std::span{getCode(), getSize()}))
+      .as<JitFunction>();
 } // namespace
 } // namespace
 

From 982d51bdcaed9ce6e3cbd56a09cc38474d3c5da7 Mon Sep 17 00:00:00 2001
From: Andrey Derzhavin <derzhavin.andrej@yandex.ru>
Date: Fri, 31 Oct 2025 20:50:44 +0300
Subject: [PATCH 4/6] Add lfs

---
 .gitattributes | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitattributes

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..1af7c3f
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+dataset1.h filter=lfs diff=lfs merge=lfs -text

From 4752d0df66639c2907c031371995eb8c92e1d56a Mon Sep 17 00:00:00 2001
From: Andrey Derzhavin <derzhavin.andrej@yandex.ru>
Date: Fri, 31 Oct 2025 21:14:37 +0300
Subject: [PATCH 5/6] Use lfs

---
 benchmarks/median/dataset1.h   | 3 +++
 benchmarks/multiply/dataset1.h | 3 +++
 benchmarks/qsort/dataset1.h    | 3 +++
 benchmarks/rsort/dataset1.h    | 3 +++
 benchmarks/vvadd/dataset1.h    | 3 +++
 5 files changed, 15 insertions(+)
 create mode 100644 benchmarks/median/dataset1.h
 create mode 100644 benchmarks/multiply/dataset1.h
 create mode 100644 benchmarks/qsort/dataset1.h
 create mode 100644 benchmarks/rsort/dataset1.h
 create mode 100644 benchmarks/vvadd/dataset1.h

diff --git a/benchmarks/median/dataset1.h b/benchmarks/median/dataset1.h
new file mode 100644
index 0000000..a90effb
--- /dev/null
+++ b/benchmarks/median/dataset1.h
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c48e2de2d791af38684f82f7e646f5e1d2c1c17b83339d5ea616e46741c6707
+size 51500096
diff --git a/benchmarks/multiply/dataset1.h b/benchmarks/multiply/dataset1.h
new file mode 100644
index 0000000..87e7fa7
--- /dev/null
+++ b/benchmarks/multiply/dataset1.h
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9d18dfb4ff15d36c754507eea1abed70b78212a1741615bbfd84ab016d58403
+size 90261269
diff --git a/benchmarks/qsort/dataset1.h b/benchmarks/qsort/dataset1.h
new file mode 100644
index 0000000..f2fea5f
--- /dev/null
+++ b/benchmarks/qsort/dataset1.h
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a038a080b9ea99372f25dfce4317ac8a6630e930fd10360feb3433db00297813
+size 11632993
diff --git a/benchmarks/rsort/dataset1.h b/benchmarks/rsort/dataset1.h
new file mode 100644
index 0000000..f2fea5f
--- /dev/null
+++ b/benchmarks/rsort/dataset1.h
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a038a080b9ea99372f25dfce4317ac8a6630e930fd10360feb3433db00297813
+size 11632993
diff --git a/benchmarks/vvadd/dataset1.h b/benchmarks/vvadd/dataset1.h
new file mode 100644
index 0000000..5f74dab
--- /dev/null
+++ b/benchmarks/vvadd/dataset1.h
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b606421d236bd62d343919299c76b03d02c10c0e3af7473a11a585aaa6391571
+size 79744342

From 94c90a5c18ffafdc2751079f27c900f7813b5357 Mon Sep 17 00:00:00 2001
From: Andrey Derzhavin <derzhavin.andrej@yandex.ru>
Date: Fri, 31 Oct 2025 21:35:14 +0300
Subject: [PATCH 6/6] Use Gmean

---
 tools/bench/DoBenchmark.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/tools/bench/DoBenchmark.py b/tools/bench/DoBenchmark.py
index 056bf06..478ff5f 100644
--- a/tools/bench/DoBenchmark.py
+++ b/tools/bench/DoBenchmark.py
@@ -42,19 +42,19 @@ def build_hist(list_data: list[list[BenchmarkList]], bench_names: list[str]):
     for j in range(num_bars_per_group):
         for i in range(num_groups):
             data["Benchmark"].append(bench_names[i])
-            data["MIPS"].append(np.mean(list_data[i][j].mips()))
+            data["MIPS"].append(list_data[i][j].mips())
             data["Backend"].append(list_data[i][j].name)
 
     df = pd.DataFrame(data)
     print(df)
-    filtered_df = df[~df["Backend"].isin(["interp", "cached interp"])]
+    filtered_df = df[~df["Backend"].isin(["nopbench"])]
     avg_df = (
-        filtered_df.groupby("Benchmark")
+        filtered_df.groupby("Backend")
         .agg(
-            MIPS=("MIPS", "mean"),
+            MIPS=("MIPS", statistics.geometric_mean),
         )
         .reset_index()
-        .assign(Backend="averaged-jit")
+        .assign(Benchmark="geomean")
     )
 
     final_df = pd.concat([df, avg_df], ignore_index=True)
@@ -125,8 +125,16 @@ def benchmark_name(elf_path: Path):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Tool for benchmarking")
     parser.add_argument("sim_bin", type=Path, help="Path to sim binary")
-    parser.add_argument("elf", type=Path, nargs="+", help="Path(s) to benchmarks")
-    parser.add_argument("-t", "--times", type=int, default=2, help="Amount of times to run each bench")
+    parser.add_argument(
+        "elf", type=Path, nargs="+", help="Path(s) to benchmarks"
+    )
+    parser.add_argument(
+        "-t",
+        "--times",
+        type=int,
+        default=2,
+        help="Amount of times to run each bench",
+    )
     parser.add_argument("--thres", type=int, default=0, help="JIT threshold")
 
     args = parser.parse_args()