Adding I-cache

acheron2302 · acheron2302 · commit adf69f68a6d7 · 2025-08-31T23:59:55.000+07:00
diff --git a/crates/libmwemu/src/emu/disassemble.rs b/crates/libmwemu/src/emu/disassemble.rs
@@ -1,7 +1,201 @@
 use iced_x86::{Decoder, DecoderOptions, Formatter as _, Instruction};
-
+use serde::{Deserialize, Serialize};
 use crate::emu::Emu;
 
+// About 10 MB in total, which should fit in the L3 cache:
+// 8192 cache lines,
+// 32 instructions for each one,
+// 40 bytes per instruction struct (I think we can make it smaller)
+const INSTRUCTION_ARRAY_SIZE: usize = 8192 * 32;
+
+// we want the cache entries to be stored in the L1 cache, which is smaller than 40 KB (note: CACHE_SIZE counts entries, not bytes)
+const CACHE_SIZE: usize = 2048 * 16;
+const CACHE_MASK: usize = CACHE_SIZE - 1; // Assumes power of 2
+const MAX_CACHE_PER_LINE: usize = 16;
+pub const INVALID_LPF_ADDR: u64 = 0xffffffffffffffff;
+
+/// Returns `addr` with its low 12 bits cleared, i.e. the base of its 4 KiB page.
+pub fn LPF_OF(addr: u64) -> u64 {
+    addr & !0xfff_u64
+}
+
+#[derive(Clone, Serialize, Deserialize)]
+struct CachedInstruction { // one tag entry of the instruction cache
+    pub lpf: u64, // tag: the inserters store the block's start address here; INVALID_LPF_ADDR = unused
+    pub instruction_key : usize, // index of the block's first instruction in InstructionCache::instructions
+    pub instruction_len: usize, // number of instructions stored for the block
+}
+
+impl Default for CachedInstruction {
+    fn default() -> Self {
+        Self { // an unused entry: sentinel tag, no instructions
+            lpf: INVALID_LPF_ADDR,
+            instruction_key: 0,
+            instruction_len: 0,
+        }
+    }
+}
+
+impl CachedInstruction {
+    pub fn is_valid(&self) -> bool {
+        self.lpf != INVALID_LPF_ADDR // in use iff the tag is not the "unused" sentinel
+    }
+}
+
+#[derive(Clone, Serialize, Deserialize)]
+pub struct InstructionCache {
+    cache_entries: Vec<CachedInstruction>, // tag entries, probed in runs of MAX_CACHE_PER_LINE
+    instructions: Vec<Instruction>, // pool holding the decoded instructions of all cached blocks
+    next_instruction_slot: usize, // first pool slot not yet handed out
+    pub current_instruction_slot: usize, // first pool slot of the block chosen by lookup_entry
+    current_decode_len: usize, // instruction count of the chosen block
+    current_decode_idx: usize // offset of the next instruction decode_out hands back
+    // probe_stats: ProbeStats, (disabled)
+}
+
+#[derive(Clone, Serialize, Deserialize, Default)]
+struct ProbeStats { // probe counters; the only visible uses in InstructionCache are commented out
+    hits: usize,
+    misses: usize,
+    collisions: usize,
+}
+
+impl InstructionCache {
+    /// Creates an empty cache with a pre-allocated instruction pool.
+    pub fn new() -> Self {
+        InstructionCache {
+            cache_entries: vec![CachedInstruction::default(); CACHE_SIZE],
+            instructions: vec![Instruction::default(); INSTRUCTION_ARRAY_SIZE],
+            next_instruction_slot: 0,
+            current_decode_len: 0,
+            current_instruction_slot: 0,
+            current_decode_idx: 0,
+        }
+    }
+
+    /// Returns the start index of the cache line selected by the page number of
+    /// `lpf + len`: a multiple of `MAX_CACHE_PER_LINE`, so the line's entries are
+    /// always in bounds and lines never overlap.
+    #[inline(always)]
+    pub fn get_index_of(&self, lpf: u64, len: u64) -> usize {
+        const LINE_COUNT: u64 = (CACHE_SIZE / MAX_CACHE_PER_LINE) as u64;
+        let page = lpf.wrapping_add(len) >> 12;
+        (page % LINE_COUNT) as usize * MAX_CACHE_PER_LINE
+    }
+
+    /// Invalidates all `MAX_CACHE_PER_LINE` entries of the line starting at `idx`.
+    #[inline]
+    fn flush_cache_line(&mut self, idx: usize) {
+        for entry in &mut self.cache_entries[idx..idx + MAX_CACHE_PER_LINE] {
+            entry.lpf = INVALID_LPF_ADDR;
+        }
+    }
+
+    /// Looks up the block starting at `addr`. A hit places the decode cursor on
+    /// the block and returns `true`; a miss returns `false` after flushing the
+    /// line if it was full, so the following insert finds a free entry.
+    pub fn lookup_entry(&mut self, addr: u64, len: u64) -> bool {
+        let lpf = crate::maps::tlb::LPF_OF(addr);
+        let idx = self.get_index_of(lpf, len);
+        // linear probe: a line fills front to back, so an invalid entry ends it
+        for i in idx..idx + MAX_CACHE_PER_LINE {
+            let entry = &self.cache_entries[i];
+            if entry.lpf == INVALID_LPF_ADDR {
+                return false;
+            }
+            if entry.lpf == addr {
+                self.current_instruction_slot = entry.instruction_key;
+                self.current_decode_len = entry.instruction_len;
+                self.current_decode_idx = 0;
+                return true;
+            }
+        }
+        // full line without `addr`: flush it and report a miss, never a stale hit
+        self.flush_cache_line(idx);
+        false
+    }
+
+    /// Drops every cached block and resets the pool and the decode cursor.
+    #[cold]
+    fn flush_cache(&mut self) {
+        self.cache_entries.fill(CachedInstruction::default());
+        self.next_instruction_slot = 0;
+        self.current_decode_len = 0;
+        self.current_decode_idx = 0;
+    }
+
+    /// Returns the first pool slot of a run with room for `needed` instructions.
+    /// A full pool is simply flushed and rebuilt from scratch (simple and good enough).
+    fn reserve_slots(&mut self, needed: usize) -> usize {
+        if self.next_instruction_slot + needed > INSTRUCTION_ARRAY_SIZE {
+            self.flush_cache();
+        }
+        self.next_instruction_slot
+    }
+
+    /// Tags an entry of the line at `idx` with `addr` -> (`slot`, `len`), reusing
+    /// an entry already tagged `addr` or else taking the first free one.
+    fn insert_entry(&mut self, idx: usize, addr: u64, slot: usize, len: usize) {
+        let line = &self.cache_entries[idx..idx + MAX_CACHE_PER_LINE];
+        let usable = line.iter().position(|e| e.lpf == addr || e.lpf == INVALID_LPF_ADDR);
+        if usable.is_none() {
+            // full line: evict it rather than silently dropping the new block
+            self.flush_cache_line(idx);
+        }
+        let entry = &mut self.cache_entries[idx + usable.unwrap_or(0)];
+        entry.lpf = addr;
+        entry.instruction_key = slot;
+        entry.instruction_len = len;
+    }
+
+    /// Decodes from `decoder` while at least `addition` bytes remain, caches the
+    /// result as the block starting at `rip_addr` and selects it for decoding.
+    /// Panics if the block cannot be found again right after insertion.
+    pub fn insert_from_decoder(&mut self, decoder: &mut Decoder, addition: usize, rip_addr: u64) {
+        let lpf = crate::maps::tlb::LPF_OF(rip_addr);
+        let idx = self.get_index_of(lpf, 0);
+
+        // an instruction is at least one byte long, so the bytes left in the
+        // decoder bound the number of pool slots this block can need
+        let slot = self.reserve_slots(decoder.max_position() - decoder.position());
+
+        let mut count: usize = 0;
+        while decoder.can_decode() && decoder.position() + addition <= decoder.max_position() {
+            decoder.decode_out(&mut self.instructions[slot + count]);
+            count += 1;
+        }
+        self.next_instruction_slot = slot + count;
+
+        self.insert_entry(idx, rip_addr, slot, count);
+        assert!(self.lookup_entry(rip_addr, 0), "Cache Insertion FAILED: There is support to be entry after insertion using insert_from_decoder");
+    }
+
+    /// Caches the already decoded `instrs` as the block starting at `addr`.
+    /// The block is not selected for decoding; use `lookup_entry` for that.
+    pub fn insert_instruction(&mut self, addr: u64, instrs: Vec<Instruction>) {
+        let lpf = crate::maps::tlb::LPF_OF(addr);
+        let idx = self.get_index_of(lpf, 0);
+
+        let slot = self.reserve_slots(instrs.len());
+        self.instructions[slot..slot + instrs.len()].copy_from_slice(&instrs);
+        self.next_instruction_slot = slot + instrs.len();
+
+        self.insert_entry(idx, addr, slot, instrs.len());
+    }
+
+    /// Copies the instruction under the decode cursor into `instruction` and
+    /// advances the cursor. Call only while `can_decode` returns `true`.
+    pub fn decode_out(&mut self, instruction: &mut Instruction) {
+        *instruction = self.instructions[self.current_instruction_slot + self.current_decode_idx];
+        self.current_decode_idx += 1;
+    }
+
+    /// Returns `true` while the selected block has instructions left.
+    pub fn can_decode(&self) -> bool {
+        self.current_decode_idx < self.current_decode_len
+    }
+}
+
 impl Emu {
     /// Disassemble an amount of instruccions on an specified address.
     /// This not used on the emulation engine, just from console, 
diff --git a/crates/libmwemu/src/emu/execution.rs b/crates/libmwemu/src/emu/execution.rs
@@ -7,6 +7,7 @@ use crate::console::Console;
 use crate::emu::Emu;
 use crate::err::MwemuError;
 use crate::{constants, engine, serialization};
+use crate::emu::disassemble::InstructionCache;
 
 impl Emu {
     #[inline]
@@ -358,6 +359,8 @@ impl Emu {
     /// Automatically dispatches to single or multi-threaded execution based on cfg.enable_threading.
     #[allow(deprecated)]
     pub fn run(&mut self, end_addr: Option<u64>) -> Result<u64, MwemuError> {
+        let mut instruction_cache = InstructionCache::new();
+        self.instruction_cache = instruction_cache;
         if self.cfg.enable_threading && self.threads.len() > 1 {
             self.run_multi_threaded(end_addr)
         } else {
@@ -831,6 +834,7 @@ impl Emu {
         // the need of Reallocate everytime
         let mut block: Vec<u8> = Vec::with_capacity(constants::BLOCK_LEN + 1);
         block.resize(constants::BLOCK_LEN, 0x0);
+        let mut instruction_cache = InstructionCache::new();
         loop {
             while self.is_running.load(atomic::Ordering::Relaxed) == 1 {
                 //log::info!("reloading rip 0x{:x}", self.regs().rip);
@@ -848,28 +852,32 @@ impl Emu {
                     }
                 };
 
-                // we just need to read 0x300 bytes because x86 require that the instruction is 16 bytes long
-                // reading anymore would be a waste of time
-                let block_sz = constants::BLOCK_LEN;
-                let block_temp = code.read_bytes(rip, block_sz);
-                let block_temp_len = block_temp.len();
-                if block_temp_len != block.len() {
-                    block.resize(block_temp_len, 0);
+                if !self.instruction_cache.lookup_entry(rip, 0) {
+                    // we just need to read 0x300 bytes because x86 require that the instruction is 16 bytes long
+                    // reading anymore would be a waste of time
+                    let block_sz = constants::BLOCK_LEN;
+                    let block_temp = code.read_bytes(rip, block_sz);
+                    let block_temp_len = block_temp.len();
+                    if block_temp_len != block.len() {
+                        block.resize(block_temp_len, 0);
+                    }
+                    block.clone_from_slice(block_temp);
+                    if block.len() == 0 {
+                        return Err(MwemuError::new("cannot read code block, weird address."));
+                    }
+                    let mut decoder =
+                        Decoder::with_ip(arch, &block, self.regs().rip, DecoderOptions::NONE);
+
+                    self.rep = None;
+                    let addition = if block_temp_len < 16 {block_temp_len} else {16};
+                    self.instruction_cache.insert_from_decoder(&mut decoder, addition, rip);
                 }
-                block.clone_from_slice(block_temp);
-                if block.len() == 0 {
-                     return Err(MwemuError::new("cannot read code block, weird address."));
-                } 
-                let mut decoder =
-                    Decoder::with_ip(arch, &block, self.regs().rip, DecoderOptions::NONE);
-                let mut sz: usize = 0;
-                let mut addr: u64 = 0;
 
-                self.rep = None;
-                let addition = if block_temp_len < 16 {block_temp_len} else {16};
-                while decoder.can_decode() && (decoder.position() + addition <= decoder.max_position()) {
+                let mut sz = 0;
+                let mut addr = 0;
+                while self.instruction_cache.can_decode() {
                     if self.rep.is_none() {
-                        decoder.decode_out(&mut ins);
+                        self.instruction_cache.decode_out(&mut ins);
                         sz = ins.len();
                         addr = ins.ip();
 
@@ -883,7 +891,7 @@ impl Emu {
                     }
 
                     self.instruction = Some(ins);
-                    self.decoder_position = decoder.position();
+                    self.decoder_position = self.instruction_cache.current_instruction_slot;
                     self.memory_operations.clear();
                     self.pos += 1;
 
diff --git a/crates/libmwemu/src/emu/mod.rs b/crates/libmwemu/src/emu/mod.rs
@@ -3,6 +3,7 @@ use std::{cell::RefCell, fs::File, sync::{atomic::AtomicU32, Arc}, time::Instant
 use iced_x86::{Instruction, IntelFormatter};
 
 use crate::{banzai::Banzai, breakpoint::Breakpoints, colors::Colors, config::Config, global_locks::GlobalLocks, hooks::Hooks, maps::Maps, pe::pe32::PE32, pe::pe64::PE64, structures::MemoryOperation, thread_context::ThreadContext};
+use crate::emu::disassemble::InstructionCache;
 
 mod operands;
 mod display;
@@ -78,4 +79,5 @@ pub struct Emu {
     pub threads: Vec<ThreadContext>,
     pub current_thread_id: usize,  // Index into threads vec
     pub global_locks: GlobalLocks,  // Critical section lock tracking
+    instruction_cache: InstructionCache
 }
diff --git a/crates/libmwemu/src/emu/operands.rs b/crates/libmwemu/src/emu/operands.rs
@@ -457,6 +457,9 @@ impl Emu {
                     0
                 };
 
+                // now we flush the cacheline if it is written to executable memory and the cacheline exist
+                let mem1 = self.maps.get_mem_by_addr(mem_addr).expect("The memory doesn't exists");
+
                 match sz {
                     64 => {
                         if !self.maps.write_qword(mem_addr, value2) {
diff --git a/crates/libmwemu/src/maps/mem64.rs b/crates/libmwemu/src/maps/mem64.rs
@@ -13,11 +13,14 @@ use std::io::Write;
 use bytemuck::cast_slice;
 use crate::emu_context;
 
+
+
 #[derive(Clone, Serialize, Deserialize)]
 pub struct Mem64 {
     mem_name: String,
     base_addr: u64,
     bottom_addr: u64,
+    permission:
     mem: Vec<u8>,
 }
 

Original file line number	Diff line number	Diff line change
`@@ -457,6 +457,9 @@ impl Emu {`
`457`	`457`	`0`
`458`	`458`	`};`
`459`	`459`
	`460`	`+ // now we flush the cacheline if it is written to executable memory and the cacheline exist`
	`461`	`+ let mem1 = self.maps.get_mem_by_addr(mem_addr).expect("The memory doesn't exists");`
	`462`	`+`
`460`	`463`	`match sz {`
`461`	`464`	`64 => {`
`462`	`465`	`if !self.maps.write_qword(mem_addr, value2) {`